## Import Libraries

In [6]:
import pandas as pd
import numpy as np
import re
import itertools
from string import punctuation
import nltk
from nltk.tokenize import TweetTokenizer
# Shared NLP helpers used by the preprocessing functions below.
# Tokenizer configured with strip_handles=True and reduce_len=True.
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
# NOTE(review): `ps` is not referenced anywhere else in the visible notebook.
ps = nltk.PorterStemmer()
# Lemmatizer applied to every kept token in remove_stopwords().
wordnet = nltk.WordNetLemmatizer()
# English stopword collection; remove_stopwords() tests membership against it.
stopwords = nltk.corpus.stopwords.words('english')

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectPercentile, SelectFpr, SelectFdr, chi2, mutual_info_classif

import matplotlib.pyplot as plt
import seaborn as sns

# Class labels, in the order used for report rows, confusion-matrix axes and
# the 0..3 integer encoding applied before fitting the stacked ensemble.
category = ['Sydney', 'Melbourne', 'Brisbane', 'Perth']

## Prepping raw tweet data

In [2]:
# Training set: column names are supplied explicitly, so the first parsed row
# is dropped with .iloc[1:] (presumably the file's own header row - confirm
# against the file) and the index is rebuilt from zero.
train = (
    pd.read_csv('train-raw.tsv', sep='\t', names=['label', 'body_text'])
    .iloc[1:]
    .reset_index(drop=True)
)

# Development set: the parsed index is moved into columns, the 'level_0'
# column is discarded and the two remaining columns are renamed.
dev = pd.read_csv('dev-raw.tsv', sep='\t').reset_index().drop('level_0', axis=1)
dev.columns = ['label', 'body_text']

# Test set: 'level_0' becomes the 'Id' index, 'level_1' is discarded and the
# single remaining column holds the tweet text.
test = (
    pd.read_csv('test-raw.tsv', sep='\t')
    .reset_index()
    .rename({'level_0': 'Id'}, axis=1)
    .set_index('Id')
    .drop('level_1', axis=1)
)
test.columns = ['body_text']

In [3]:
# Raw tweet text (X_*) and city label (y_*) for the train and dev splits.
X_train, y_train = train['body_text'], train['label']
X_dev, y_dev = dev['body_text'], dev['label']
# The test split has no labels, only text.
X_test = test['body_text']

## Feature Selection
- Based on the presence of a word (rather than its frequency)
- `binary=True` flag so all non-zero counts are one

In [4]:
# Matches "<scheme>://<host>[/<path segments>]" up to the next whitespace.
_LINK_PATTERN = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')


def remove_links(text):
    """Return `text` with every URL-like substring deleted.

    Surrounding whitespace is left untouched, so a link in the middle of a
    sentence leaves two adjacent spaces behind.
    """
    return _LINK_PATTERN.sub('', text)

def remove_punc(text):
    """Delete ASCII punctuation from `text` and casefold what remains.

    Besides removing every character found in `string.punctuation`, this
    also lower-cases the text (via `str.casefold`).
    """
    without_punctuation = text.translate(str.maketrans('', '', punctuation))
    return without_punctuation.casefold()

def remove_num(text):
    """Return `text` without any character for which `str.isdigit` is true."""
    return ''.join(itertools.filterfalse(str.isdigit, text))

def remove_unicode(text):
    """Drop every whitespace-delimited token that contains a backslash.

    The surviving tokens are re-joined with single spaces, so runs of
    whitespace collapse. Presumably targets escaped sequences such as
    "\\u2019" left in the raw tweets - confirm against the data.
    """
    kept = (word for word in text.split() if word.find('\\') == -1)
    return ' '.join(kept)

def remove_stopwords(text):
    """Drop stopwords from `text` and lemmatize the remaining words.

    `text` is split on whitespace; words found in the module-level
    `stopwords` collection are discarded, every other word is passed through
    `wordnet.lemmatize`, and the results are joined with single spaces.
    """
    kept_words = filter(lambda word: word not in stopwords, text.split())
    return ' '.join(map(wordnet.lemmatize, kept_words))

def preprocess(text):
    """Clean a raw tweet and return its list of tokens.

    The cleaning steps run in a fixed order: links, backslash tokens,
    punctuation (plus casefolding), digits, then stopwords (plus
    lemmatization). The cleaned string is finally split by `tknzr`.

    NOTE(review): '@' is part of `string.punctuation`, so it has already been
    removed by `remove_punc` when the tokenizer runs; the tokenizer's
    `strip_handles=True` therefore has nothing left to match. Confirm whether
    handles were meant to be dropped.
    """
    cleaning_steps = (
        remove_links,
        remove_unicode,
        remove_punc,
        remove_num,
        remove_stopwords,
    )
    for step in cleaning_steps:
        text = step(text)
    return tknzr.tokenize(text)

In [5]:
# Presence/absence bag of words over the training tweets. Tokens come from
# preprocess(); min_df=2 drops tokens that occur in fewer than two tweets.
binary_vectorizer = CountVectorizer(analyzer=preprocess, binary=True, min_df=2)
# fit_transform learns the vocabulary and builds the matrix in one pass, so
# every training tweet is preprocessed once instead of twice.
X_binary = binary_vectorizer.fit_transform(X_train) # word in presence of label

# Vocabulary in column order of X_binary.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() when available and fall back on older releases.
_feature_name_getter = getattr(binary_vectorizer, 'get_feature_names_out', None)
if _feature_name_getter is None:
    _feature_name_getter = binary_vectorizer.get_feature_names
features = list(_feature_name_getter())

# len(features)
# a run-through says we have 105006 unique words (using the preprocess function)
# the preprocessed data they provided had 184674 unique words so we've simplified a lot more

### Results
- Top n% features
- Using frequency (`CountVectorizer`)
- Using `tfidf`

- Naive Bayes performs well with `tfidf`, but as expected does much better with frequencies (more consistent and higher performance).
- SVM performs marginally better with `tfidf` and handles several features well (more consistent and higher performance).
- Logistic regression initially seems to perform well, but since we use a `multinomial` distribution it does much better with frequencies (not as consistent, but if we were to use it, frequency is the better choice).

In [7]:
# Chi-squared test per feature, selected with SelectFpr at alpha=0.05.
k_best = SelectFpr(chi2, alpha=0.05)
k_best.fit(X_binary, y_train)
# Translate the boolean support mask back into vocabulary words.
fpr_chi2 = [word for word, keep in zip(features, k_best.get_support()) if keep]

In [None]:
# Mutual information per feature, selected with SelectPercentile at 70.
k_best = SelectPercentile(mutual_info_classif, percentile=70)
k_best.fit(X_binary, y_train)
# Translate the boolean support mask back into vocabulary words.
sp_mi = [word for word, keep in zip(features, k_best.get_support()) if keep]

In [None]:
# Union of the words kept by either selector.
k_best_features = set(fpr_chi2+sp_mi)
# Persist the selection so the expensive steps above need not be re-run.
# Set iteration order for strings varies between interpreter runs, so the
# words are written sorted to keep the file identical from run to run.
with open('set_best_features.txt', 'w') as f:
    f.writelines("%s\n" % item for item in sorted(k_best_features))

In [None]:
# Sizes of the chi2 selection, the mutual-information selection and their union.
tuple(len(selection) for selection in (fpr_chi2, sp_mi, k_best_features))

In [None]:
# Reload the persisted feature selection, one word per line.
with open('set_best_features.txt', 'r') as f:
    K_BEST_FEATURES = [line.strip() for line in f]

# feature_select() runs once per token of every tweet; testing membership
# against the list would scan it linearly each time, so look up in a
# frozenset instead.
_K_BEST_FEATURE_LOOKUP = frozenset(K_BEST_FEATURES)


def feature_select(text):
    """Tokenize a raw tweet and keep only the selected features.

    The tweet goes through the same cleaning/tokenizing pipeline as
    `preprocess`; tokens that are not listed in `K_BEST_FEATURES` are
    dropped. Token order and duplicates are preserved.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    list of str
        The tweet's tokens that belong to the selected feature set.
    """
    return [token for token in preprocess(text) if token in _K_BEST_FEATURE_LOOKUP]

### Feature Selection (freq)
- NB
- LR

Use the top 80% features (This is the freq variant of our data set)

In [None]:
# Count-based bag of words restricted to the selected features.
vectorizer = CountVectorizer(analyzer=feature_select)
# fit_transform learns the vocabulary and builds the train matrix in a single
# pass, so the training tweets are analysed once instead of twice.
X_train_freq = vectorizer.fit_transform(X_train)
# Dev and test are only transformed, using the vocabulary learned on train.
X_dev_freq = vectorizer.transform(X_dev)
X_test_freq = vectorizer.transform(X_test)

### Feature Selection (`tfidf`)
- SVM

Use the top 80% features
(This is the `tfidf` variant of our data set)

In [None]:
tfidf = TfidfTransformer() # transform frequency to tfidf
# The IDF weights are learned from the training counts only ...
X_train_tfidf = tfidf.fit_transform(X_train_freq)
# ... and re-used unchanged for dev and test. Calling fit_transform here would
# re-fit the weights on each split, so the same column would be scaled
# differently from the space the classifiers were trained in.
X_dev_tfidf = tfidf.transform(X_dev_freq)
X_test_tfidf = tfidf.transform(X_test_freq)

## Run our Classifiers
- Including lemmatization of tweets
- `y_pred` is our prediction on the real `test` data

In [None]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression 

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

In [None]:
def report(clf, X_test, y_test):
    """Print evaluation metrics for a fitted classifier and plot its confusion matrix.

    Parameters
    ----------
    clf : fitted estimator exposing `predict`
    X_test : feature matrix to predict on
    y_test : true labels for `X_test` (expected to be the values in `category`)

    Returns
    -------
    The predictions `clf.predict(X_test)`.
    """
    y_pred = clf.predict(X_test)
    # Pass labels= explicitly: without it the report rows follow the sorted
    # label order while target_names follows `category`, which would attach
    # the wrong city name to each row.
    print(classification_report(y_test, y_pred, labels=category, target_names=category))
    print(f'Accuracy: {100*accuracy_score(y_test, y_pred):.2f}%')
    # Rows are the true labels, columns the predicted labels, both in `category` order.
    df = pd.DataFrame(confusion_matrix(y_test, y_pred, labels=category), index=category, columns=category)
    sns.heatmap(df, annot=True, fmt='d')
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.yticks(rotation=0)
    plt.show()
    return y_pred

def random_search_tune(X_train, y_train, estimator, parameters):
    """Run a randomised hyper-parameter search and return the best setting.

    Draws 50 candidate settings from `parameters`, scores each with 5-fold
    cross-validation (seeded with random_state=0, using all cores), displays
    the full `cv_results_` and returns `best_params_`.
    """
    search = RandomizedSearchCV(
        estimator,
        parameters,
        n_iter=50,
        cv=5,
        random_state=0,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    display(search.cv_results_)
    return search.best_params_

### Multinomial Naive Bayes
- Uses a frequency based feature space
- Takes the top 80% features

In [None]:
print('Multinomial NB')
# Train on the frequency features, then evaluate on the dev split.
clf1 = MultinomialNB()
clf1.fit(X_train_freq, y_train)
report(clf1, X_dev_freq, y_dev)
# Predictions for the unlabelled test split, written out in the next cell.
to_output = clf1.predict(X_test_freq)

In [None]:
# One 'Class' prediction per test row, keyed by the test index.
pd.Series(to_output, index=test.index, name='Class').to_frame().to_csv("fpr_sp_preds.csv")

### Support Vector Machines (One Vs Rest)
- Uses a `tfidf` feature space
- Takes the top 80% features

In [None]:
print('One vs Rest SVM') 
# Linear SVM trained on the tf-idf features and evaluated on the dev split.
clf2 = LinearSVC(max_iter=10000, random_state=0, tol=1e-05)
clf2.fit(X_train_tfidf, y_train)
svm_pred = report(clf2, X_dev_tfidf, y_dev)

### Multi-Class Logistic Regression
- Uses a frequency based feature space
- Takes the top 80% features

In [None]:
print('Logistic Regression (Multi-Class)') 
# Multinomial logistic regression trained on the frequency features.
clf3 = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000)
clf3.fit(X_train_freq, y_train)
# Left as the cell's last expression so the dev predictions are displayed too.
report(clf3, X_dev_freq, y_dev)

## Stacked Ensemble Learner

In [None]:
from mlxtend.classifier import StackingCVClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Base learners for the stacked ensemble. The random forest is seeded so that
# repeated runs build the same trees (random_state=0, as used elsewhere in
# this notebook).
stacked_clf1 = RandomForestClassifier(n_estimators=100, min_samples_split=50, 
                              min_samples_leaf=1, criterion='gini', n_jobs=-1,
                              random_state=0)
stacked_clf2 = MultinomialNB()
stacked_clf3 = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, n_jobs=-1)
# Meta-classifier trained on the base learners' predicted probabilities.
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, n_jobs=-1)

# use_probas=True feeds class probabilities (not hard labels) to the
# meta-classifier; random_state seeds the stacker so results are repeatable.
sclf = StackingCVClassifier(classifiers=[stacked_clf1, stacked_clf2, stacked_clf3],
                            meta_classifier=lr, use_probas=True, random_state=0)

In [None]:
# The ensemble is trained on integer-encoded labels: each city name is
# replaced by its position in `category`.
sclf.fit(
    X_train_tfidf,
    y_train.replace(category, list(range(len(category)))),
)

In [None]:
# Decode the ensemble's integer predictions back to city names.
preds = pd.Series(sclf.predict(X_dev_tfidf)).replace([0,1,2,3], category)
# Pass labels= explicitly: without it the report rows follow the sorted label
# order while target_names follows `category`, which would attach the wrong
# city name to each row.
print(classification_report(y_dev, preds, labels=category, target_names=category))
print(f'Accuracy: {100*accuracy_score(y_dev, preds):.2f}%')
# Rows are the true labels, columns the predicted labels, both in `category` order.
df = pd.DataFrame(confusion_matrix(y_dev, preds, labels=category), index=category, columns=category)
sns.heatmap(df, annot=True, fmt='d')
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.yticks(rotation=0)
plt.show()

In [None]:
# Attach the test index when the Series is built, so predictions are matched
# to rows by position. Building a default-indexed Series first and passing
# index=test.index to DataFrame afterwards re-aligns by label, which
# scrambles or blanks the predictions unless the Ids happen to be 0..n-1.
# The column is named 'Class', as in the Naive Bayes prediction file.
ensemble_test_preds = pd.Series(
    sclf.predict(X_test_tfidf), index=test.index, name='Class'
).replace([0,1,2,3], category)
ensemble_test_preds.to_frame().to_csv("fixed_80_ensemble.csv")