## Import Libraries
- Downloads the NLTK `stopwords` and `wordnet` corpora if they are not already present (the Python packages themselves must already be installed)

In [None]:
import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
nltk.download('wordnet')

import re
import string

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

# Four Australian city names. Presumably the class labels of the tweets --
# TODO confirm; the name is not referenced again in the visible cells.
category = ['Sydney', 'Melbourne', 'Brisbane', 'Perth']

## Prepping RAW tweet data
- `tsv` format so the flag `sep='\t'` needs to be passed
- On the fly cleaning (`reset_index`, `columns`)
- Assign `stopwords`, `wordnet` and `punctuation`

In [None]:
# Training split: column names are supplied explicitly, so the first row read
# from the file (presumably its own header line) is dropped with iloc[1:].
train = (
    pd.read_csv('train-raw.tsv', sep='\t', names=['label', 'body_text'])
    .iloc[1:]
    .reset_index(drop=True)
)

# Development split: move the index into columns, discard the 'level_0'
# column that produces, and rename what is left.
dev = pd.read_csv('dev-raw.tsv', sep='\t').reset_index().drop(columns='level_0')
dev.columns = ['label', 'body_text']

# Test split: unlabelled, a single text column.
test = pd.read_csv('test-raw.tsv', sep='\t').reset_index(drop=True)
test.columns = ['body_text']

# Shared resources used by the text-cleaning step.
punctuation = string.punctuation
wordnet = nltk.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

## Tokenize and Clean the Body Text
1. Strip all punctuation
2. Tokenize the body text using regex
3. Strip all stopwords found in `nltk`'s English stopword corpus
4. Lemmatize the tokenized body text (rather than relying on simple numerical values for a split)

In [None]:
def clean(text):
    """Turn one raw tweet into a list of lemmatized, stopword-free tokens.

    The text is casefolded with every character in ``punctuation`` removed,
    split on runs of non-word characters, filtered against ``stopwords`` and
    finally lemmatized with ``wordnet``.

    Args:
        text: The tweet body as a string.

    Returns:
        A list of non-empty token strings, in their original order.
    """
    punc_strip = ''.join(char.casefold() for char in text if char not in punctuation)

    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    # re.split also yields '' when the text starts or ends with a separator
    # (or is empty); drop those so an empty token never becomes a feature.
    tokens = [word for word in re.split(r'\W+', punc_strip) if word]

    # NOTE(review): punctuation is removed before this lookup, so any stopword
    # entry that itself contains an apostrophe can no longer match -- confirm
    # whether that is intended.
    stopword_strip = [word for word in tokens if word not in stopwords]

    return [wordnet.lemmatize(word) for word in stopword_strip]

## Splitting the Data Set
- Train, Development, Test

In [None]:
# Separate the tweet text (X) from the city label (y) for each split.
X_train = train['body_text']
y_train = train['label']

X_dev = dev['body_text']
y_dev = dev['label']

# The test split has no labels.
X_test = test['body_text']

# The source frames are no longer needed once the columns are pulled out.
del train
del dev
del test

In [None]:
# Build the vocabulary from the training tweets only, using clean() as the
# analyzer, then encode every split with that same fitted vocabulary.
vectorizer = CountVectorizer(analyzer=clean)
vectorizer.fit(X_train)

X_train, X_dev, X_test = (
    vectorizer.transform(split) for split in (X_train, X_dev, X_test)
)

## Feature Selection
1. Vectorize the data using CountVectorizer
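2. Select the $k=10000$ features with the highest $\chi^2$ scores
3. Select the $k=1000$ features with the highest mutual information scores (as written, this selection replaces the $\chi^2$ one in the exported frames)
4. Output to new csv file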

In [None]:
# Resolve the vocabulary a single time instead of once per selected feature.
# Newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); use whichever this installation provides.
_feature_names_fn = (
    getattr(vectorizer, 'get_feature_names_out', None)
    or vectorizer.get_feature_names
)
vocabulary_terms = np.asarray(_feature_names_fn(), dtype=str)

# Fit both scorers on the full count matrix, using training labels only.
ch2 = SelectKBest(chi2, k=10000).fit(X_train, y_train)
mic = SelectKBest(mutual_info_classif, k=1000).fit(X_train, y_train)

# NOTE(review): the original cell built the chi2 frames and then immediately
# overwrote them with the mutual-information ones, so only the k=1000 MI
# selection ever reached the later cells. That final state is kept here; the
# discarded dense chi2 frames are no longer materialized. Set `selector = ch2`
# to export the chi2 selection instead.
selector = mic

features_chosen = vocabulary_terms[selector.get_support(indices=True)]

# Densify each reduced matrix into a frame whose columns are the chosen terms.
X_train_new = pd.DataFrame(selector.transform(X_train).toarray(), columns=features_chosen)
X_dev_new = pd.DataFrame(selector.transform(X_dev).toarray(), columns=features_chosen)
X_test_new = pd.DataFrame(selector.transform(X_test).toarray(), columns=features_chosen)

In [None]:
# Append the target as a 'label' column to each labelled split, in place.
# The test frame has no labels and is left untouched.
for _frame, _labels in ((X_train_new, y_train), (X_dev_new, y_dev)):
    _frame['label'] = _labels

In [None]:
# Write each reduced frame to its own comma-separated file.
# NOTE(review): the file names say 10000, but the frames written here come
# from the k=1000 selection in the cell above -- confirm the intended naming.
for _frame, _path in (
    (X_train_new, 'train_k_best_10000.csv'),
    (X_dev_new, 'dev_k_best_10000.csv'),
    (X_test_new, 'test_k_best_10000.csv'),
):
    _frame.to_csv(_path, sep=',')