## Downloading and loading Data

This code downloads the prepared split of the Reddit data and loads it into training, validation and test sets.

In [None]:
!wget -O reddit_data_split.zip https://gla-my.sharepoint.com/:u:/g/personal/jake_lever_glasgow_ac_uk/EapVNOIV84tPnQuuFBNgG9UBYIWipQ9JL4QTfSgRtIacBw?download=1
!unzip -o reddit_data_split.zip

Downloaded Data is divided into 3 sets of data
1. Train Dataset
2. Test Dataset
3. Validation Dataset

In [None]:
import json

# Read the three pre-made splits. The encoding is given explicitly so the result
# does not depend on the platform's default text encoding.
with open('reddit_train.json', encoding='utf-8') as f:
    train_data = json.load(f)
with open('reddit_val.json', encoding='utf-8') as f:
    validation_data = json.load(f)
with open('reddit_test.json', encoding='utf-8') as f:
    test_data = json.load(f)

print("Number of posts in training data:", len(train_data))
print("Number of posts in validation data:", len(validation_data))
print("Number of posts in test data:", len(test_data))

##  Importing Required Packages


In [None]:
#!pip install -U spacy
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import spacy
import requests
import time
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus.reader.knbc import test
import pandas as pd

# Load the small English spaCy model ('en_core_web_sm') with NER disabled.
# The medium model can be fetched with the command below if it is wanted instead.
#!python -m spacy download en_core_web_md

nlp = spacy.load('en_core_web_sm', disable=['ner'])
# Drop the tagger and parser components from the loaded pipeline as well.
# NOTE(review): depending on the installed spaCy version, the lemmatizer may rely on
# the tagger's output; confirm that token.lemma_ is still populated after this.
for unused_component in ('tagger', 'parser'):
    nlp.remove_pipe(unused_component)

# Fetch NLTK's stop-word corpus and keep the English list.
nltk.download('stopwords')
stop_words = stopwords.words('english')


## Exploratory Data Analysis

### Cleaning/Pre-processing Data

In [None]:

# Wrap each split in a DataFrame and keep only the first post for every distinct title.
train_data = pd.DataFrame(train_data).drop_duplicates(subset='title')
validation_data = pd.DataFrame(validation_data).drop_duplicates(subset='title')
test_data = pd.DataFrame(test_data).drop_duplicates(subset='title')

# Report the number of missing values per column for each split.
for split_name, frame in (("Train", train_data),
                          ("Validation", validation_data),
                          ("Test", test_data)):
    print(f"{split_name} - Null Entries:")
    print(frame.isnull().sum())

### Printing first few rows of the train dataset

In [None]:
# Preview the first rows of the training frame.
train_data.head()

### Histogram of train Dataset

In [None]:
# Bar chart of the number of training posts per subreddit, with a title and
# axis labels so the figure can be read on its own.
ax = train_data['subreddit'].value_counts().plot.bar()
ax.set(title='Training set: posts per subreddit', xlabel='subreddit', ylabel='number of posts');

### Histogram of Test Dataset

In [None]:
# Bar chart of the number of test posts per subreddit, with a title and
# axis labels so the figure can be read on its own.
ax = test_data['subreddit'].value_counts().plot.bar()
ax.set(title='Test set: posts per subreddit', xlabel='subreddit', ylabel='number of posts');

### Histogram of Validation Dataset

In [None]:
# Bar chart of the number of validation posts per subreddit, with a title and
# axis labels so the figure can be read on its own.
ax = validation_data['subreddit'].value_counts().plot.bar()
ax.set(title='Validation set: posts per subreddit', xlabel='subreddit', ylabel='number of posts');

### Mapping unique numbers to each subreddit label

In [None]:
# Distinct subreddit names found in the training frame.
unique_labels = train_data['subreddit'].unique()
print(unique_labels)

# Forward (name -> integer id) and reverse (integer id -> name) lookup tables.
unique_labels_dict = {label: idx for idx, label in enumerate(unique_labels)}
unique_labels_rev_dict = dict(enumerate(unique_labels))
print(unique_labels_dict)
print(unique_labels_rev_dict)

In [None]:
# Add the integer class id to every split, using the mapping built from the training data.
for frame in (train_data, validation_data, test_data):
    frame['labels'] = frame['subreddit'].map(unique_labels_dict)

#train_data

In [None]:
# Show a short preview (now including the 'labels' column) instead of rendering the whole frame.
train_data.head()

### Splitting Train, Test and Validation datasets to X and Y

Here X would be body, and Y would be the subreddit we are trying to predict/classify

In [None]:
# For every split: X is the post body text, Y is the integer subreddit id.
X_train, Y_train = train_data['body'], train_data['labels']
X_val, Y_val = validation_data['body'], validation_data['labels']
X_test, Y_test = test_data['body'], test_data['labels']

### A function to Tokenize text

In [None]:
def text_pipeline_spacy(text):
    """Normalise ``text`` into a single space-separated string of lemmas.

    The text is run through the module-level ``nlp`` pipeline. A token is kept
    only if it is not a stop word, not punctuation, not whitespace, and its
    lemma is purely alphanumeric. Kept lemmas are lower-cased and joined with
    single spaces.
    """
    kept_lemmas = [
        tok.lemma_.lower()
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct or tok.is_space) and tok.lemma_.isalnum()
    ]
    return ' '.join(kept_lemmas)


The body of every post in the training set is tokenized with the function above.

In [None]:
# Run every training body through the spaCy text pipeline, then peek at the first five results.
X_train_tokenized = list(map(text_pipeline_spacy, X_train))
X_train_tokenized[:5]

## Q1:

Use the text from the reddit posts (known as “body”) to train classification models using the Scikit Learn package. The labels to predict are the subreddit for each post. Conduct experiments using the following combinations of classifier models and feature representations:
1. Dummy Classifier with strategy="most_frequent"
2. Dummy Classifier with strategy="stratified"
3. LogisticRegression with One-hot vectorization 
4. LogisticRegression with TF-IDF vectorization (default settings)
5. SVC Classifier with One-hot vectorization (SVM with RBF kernel, default settings)


### Importing Required Packages

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### Transforming given text to Vectors based on the count of each word that occurs in the entire text

In [None]:
# The vocabulary is learned from the spaCy-normalised training text, so every split
# must go through the same text pipeline before it is vectorized. Transforming the raw
# bodies would bypass the lemmatisation and stop-word removal the vocabulary was built on.
X_val_tokenized = [text_pipeline_spacy(x) for x in X_val]
X_test_tokenized = [text_pipeline_spacy(x) for x in X_test]

OH_transformer = CountVectorizer()  # vectors hold the number of occurrences of each term
OH_transformer.fit(X_train_tokenized)

OHV_train = OH_transformer.transform(X_train_tokenized)
OHV_test = OH_transformer.transform(X_test_tokenized)
OHV_valid = OH_transformer.transform(X_val_tokenized)

### Transforming given text to Vectors based on the TF-IDF weights

In [None]:
# The vocabulary and IDF weights are learned from the spaCy-normalised training text,
# so every split must go through the same text pipeline before it is vectorized.
X_val_tokenized = [text_pipeline_spacy(x) for x in X_val]
X_test_tokenized = [text_pipeline_spacy(x) for x in X_test]

tfidf_transformer = TfidfVectorizer()  # TF-IDF weighted term vectors, default settings
tfidf_transformer.fit(X_train_tokenized)

tfidf_vec_train = tfidf_transformer.transform(X_train_tokenized)
tfidf_vec_test = tfidf_transformer.transform(X_test_tokenized)
tfidf_vec_valid = tfidf_transformer.transform(X_val_tokenized)

### Q1a:
An important first step for any machine learning project is to explore the dataset. Calculate counts for the various
labels and comment on the distribution of labels in the training/validation/test sets

### Count of posts per subreddit for different datasets

In [None]:
# Posts per subreddit in the training set, with a title and axis labels.
ax = train_data['subreddit'].value_counts().plot.bar()
ax.set(title='Training set: posts per subreddit', xlabel='subreddit', ylabel='number of posts');

In [None]:
# Posts per subreddit in the test set, with a title and axis labels.
ax = test_data['subreddit'].value_counts().plot.bar()
ax.set(title='Test set: posts per subreddit', xlabel='subreddit', ylabel='number of posts');

In [None]:
# Posts per subreddit in the validation set, with a title and axis labels.
ax = validation_data['subreddit'].value_counts().plot.bar()
ax.set(title='Validation set: posts per subreddit', xlabel='subreddit', ylabel='number of posts');

In [None]:
# Subreddit names present in the training set, and the number of posts for each one.
subreddit = train_data['subreddit'].unique().tolist()
df_subr = train_data['subreddit'].value_counts()
df_subr

In [None]:
# Subreddit names present in the test set, and the number of posts for each one.
subreddit = test_data['subreddit'].unique().tolist()
df_subr_t = test_data['subreddit'].value_counts()
df_subr_t

In [None]:
# Subreddit names present in the validation set, and the number of posts for each one.
subreddit = validation_data['subreddit'].unique().tolist()
df_subr_v = validation_data['subreddit'].value_counts()
df_subr_v

### Q1b:
Implement the five classifiers above, train them on the training set and evaluate on the test set. Discuss the classifier performance in comparison to the others and preprocessing techniques.

For the above classifiers report the classifier accuracy as well as macro/weighted-averaged precision, recall, and F1 (to three decimal places). Show the overall results  obtained by the classifiers on the training and test sets in one table, and highlight the best performance. For the best performing classifier (by weighted F1 in test set) Include a bar chart graph with the F1 score for each class - (subreddits on x-axis, F1 score on Y axis).
Analyse and discuss the effectiveness of the classifiers. Your discussion should include how the models perform relative to the baselines and each other. It should discuss the classifiers’ behaviours with respect to: 
1. Appropriate model “fit” (how well is the model fit to the training/test dataset),
2. Dataset considerations (e.g. how are labels distributed, any other dataset issues?)
3. Classifier models (and their key parameters).


In [None]:
# Empty results table; the classifier cells below each add one row to it.
score_columns = ['classifier', 'accuracy', 'precision', 'recall', 'f1_score']
classifier_scores = pd.DataFrame(columns=score_columns)

## Dummy Classifier 

with  strategy most_frequent

In [None]:
# Baseline: always predict the most frequent training class.
dummy_mf_model = DummyClassifier(strategy='most_frequent')
dummy_mf_model.fit(OHV_train, Y_train)
preds = dummy_mf_model.predict(OHV_test)
# This baseline never predicts most classes, so their precision is undefined.
# zero_division=0 reports those as 0 without emitting a warning per metric call.
print(classification_report(Y_test, preds, zero_division=0))

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
score_row = pd.DataFrame([{'classifier' : 'Dummy_classifier_Most_frequent',
                           'accuracy' : accuracy_score(Y_test, preds),
                           'precision' : precision_score(Y_test, preds, average='weighted', zero_division=0),
                           'recall' : recall_score(Y_test, preds, average='weighted'),
                           'f1_score' : f1_score(Y_test, preds, average='weighted')}])
classifier_scores = pd.concat([classifier_scores, score_row], ignore_index=True)

## Dummy Classifier 

with  strategy stratified

In [None]:
# Baseline: predict classes at random, following the training label distribution.
# It gets its own variable name (the original reused the most-frequent model's name)
# and a fixed seed so the reported scores are reproducible.
dummy_stratified_model = DummyClassifier(strategy='stratified', random_state=42)
dummy_stratified_model.fit(OHV_train, Y_train)
preds = dummy_stratified_model.predict(OHV_test)
print(classification_report(Y_test, preds))

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
score_row = pd.DataFrame([{'classifier' : 'Dummy_classifier_stratified',
                           'accuracy' : accuracy_score(Y_test, preds),
                           'precision' : precision_score(Y_test, preds, average='weighted'),
                           'recall' : recall_score(Y_test, preds, average='weighted'),
                           'f1_score' : f1_score(Y_test, preds, average='weighted')}])
classifier_scores = pd.concat([classifier_scores, score_row], ignore_index=True)

## Logistic Regression

with one-hot Vectorization

In [None]:
# Logistic regression on the CountVectorizer features. max_iter is raised to 1000
# to give the solver more iterations than the library default.
lr_model_OHV = LogisticRegression(max_iter = 1000)
lr_model_OHV.fit(OHV_train, Y_train)
preds = lr_model_OHV.predict(OHV_test)
print(classification_report(Y_test, preds))

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
score_row = pd.DataFrame([{'classifier' : 'LogisticRegression - One Hot Vector',
                           'accuracy' : accuracy_score(Y_test, preds),
                           'precision' : precision_score(Y_test, preds, average='weighted'),
                           'recall' : recall_score(Y_test, preds, average='weighted'),
                           'f1_score' : f1_score(Y_test, preds, average='weighted')}])
classifier_scores = pd.concat([classifier_scores, score_row], ignore_index=True)

## Logistic Regression

with TF-IDF Vectorization

In [None]:
# Logistic regression on the TF-IDF features.
lr_model_tfidf = LogisticRegression(max_iter = 1000)
lr_model_tfidf.fit(tfidf_vec_train, Y_train)
preds = lr_model_tfidf.predict(tfidf_vec_test)
# Keep the report as a dict as well; it is used later for the per-class F1 bar chart.
report_lr = classification_report(Y_test, preds, output_dict = True)
print(classification_report(Y_test, preds))

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
score_row = pd.DataFrame([{'classifier' : 'LogisticRegression - TFIDF',
                           'accuracy' : accuracy_score(Y_test, preds),
                           'precision' : precision_score(Y_test, preds, average='weighted'),
                           'recall' : recall_score(Y_test, preds, average='weighted'),
                           'f1_score' : f1_score(Y_test, preds, average='weighted')}])
classifier_scores = pd.concat([classifier_scores, score_row], ignore_index=True)

## SVC Classifier 

with  One-hot vectorization 

In [None]:
# RBF-kernel SVM trained on the CountVectorizer features.
# NOTE: despite the variable name, this model uses OHV_* (count) features, not TF-IDF.
svc_tfidf_model = SVC(kernel='rbf')
svc_tfidf_model.fit(OHV_train, Y_train)
preds = svc_tfidf_model.predict(OHV_test)
print(classification_report(Y_test, preds))

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
score_row = pd.DataFrame([{'classifier' : 'SVC(rbf)',
                           'accuracy' : accuracy_score(Y_test, preds),
                           'precision' : precision_score(Y_test, preds, average='weighted'),
                           'recall' : recall_score(Y_test, preds, average='weighted'),
                           'f1_score' : f1_score(Y_test, preds, average='weighted')}])
classifier_scores = pd.concat([classifier_scores, score_row], ignore_index=True)

### Comparison of performance of all models 

In [None]:
def highlight_max(s, threshold, column):
    """Return per-cell CSS for one row, highlighting the whole row when it qualifies.

    ``s`` is a row of the table as a Series, ``column`` is a list of labels in
    that row, and ``threshold`` is the value to compare against. If any of the
    selected entries is ``>= threshold``, every cell in the row gets a yellow
    background; otherwise every cell gets an empty style string.
    """
    flags = pd.Series(data=False, index=s.index)
    flags[column] = s.loc[column] >= threshold
    cell_style = 'background-color: yellow' if flags.any() else ''
    return [cell_style] * len(flags)

# Highlight the row(s) whose weighted F1 reaches the best value in the table.
best_f1 = classifier_scores['f1_score'].max()
classifier_scores.style.apply(highlight_max, axis=1, threshold=best_f1, column=['f1_score'])

### F1 score for each label (LogisticRegression with TF-IDF)

In [None]:
# One row per report entry. The last three rows are the summary rows that follow
# the per-class rows, so they are dropped before plotting.
df_lr = pd.DataFrame(report_lr).transpose()
# Report keys are the stringified integer labels; map them back to subreddit names
# so the x axis shows subreddits rather than numeric ids.
f1_per_class = df_lr[:-3]["f1-score"].rename(
    index=lambda key: unique_labels_rev_dict[int(float(key))]
)
ax = f1_per_class.plot.bar()
ax.set(title='LogisticRegression (TF-IDF): F1 score per subreddit',
       xlabel='subreddit', ylabel='F1 score');

### Q1c:
Choose your own classifier/tokenization/normalisations approach, and report on its performance with respect to the five previous ones on the test set.
You should describe your selected classifier and vectorization approach including a justification for its appropriateness.

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear model trained with stochastic gradient descent on the TF-IDF features.
# The sparse matrices are passed directly (no .toarray()), which avoids allocating
# a dense documents x vocabulary array. The seed is fixed for reproducible scores.
sgd = SGDClassifier(random_state=42)
sgd.fit(tfidf_vec_train, Y_train)
preds = sgd.predict(tfidf_vec_test)
print(classification_report(Y_test, preds))

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
score_row = pd.DataFrame([{'classifier' : 'SGDClassifier',
                           'accuracy' : accuracy_score(Y_test, preds),
                           'precision' : precision_score(Y_test, preds, average='weighted'),
                           'recall' : recall_score(Y_test, preds, average='weighted'),
                           'f1_score' : f1_score(Y_test, preds, average='weighted')}])
classifier_scores = pd.concat([classifier_scores, score_row], ignore_index=True)

Results table for all classifiers so far, with the best-performing model (highest weighted F1 on the test set) highlighted.

In [None]:
# Highlight the row(s) whose weighted F1 reaches the best value in the table.
best_f1 = classifier_scores['f1_score'].max()
classifier_scores.style.apply(highlight_max, axis=1, threshold=best_f1, column=['f1_score'])

## Q2: Tuning and Error Analysis

In this task you will improve the effectiveness of the LogisticRegression with TF-IDF vectorization from Q1. 

### Q2a:

**Parameter** tuning - Tune the parameters for both the vectorizer and classifier on the validation set (or using CV-fold validation on the train).

* Classifier - Regularisation C value (typical values might be powers of 10 (from 10^-3 to 10^5)
* Vectorizer - Parameters: sublinear_tf and max_features (vocabulary size) (in a range None to 50k)
*  Select another parameter of your choice from the classifier or vectorizer

In [None]:
from sklearn.pipeline import Pipeline

# TF-IDF vectorizer feeding a logistic-regression classifier, tuned together as one estimator.
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(max_iter = 1000)),
]
pipe = Pipeline(steps=pipeline_steps)

# Candidate values for each tuned parameter; the "<step>__<param>" keys route
# each setting to the matching pipeline step.
param_grid = dict(
    tfidf__sublinear_tf=[True, False],
    tfidf__max_features=[None, 500, 5000, 50000],
    lr__C=[0.01, 0.1, 100, 1000, 10000],
    lr__class_weight=[None, 'balanced'],
    tfidf__ngram_range=[(1,1), (1,2)],
)

# Exhaustive search with 3-fold cross-validation on the tokenized training text,
# scored by accuracy and run on all available cores.
search = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='accuracy',
                      cv=3, n_jobs=-1, verbose=3)
search.fit(X_train_tokenized, Y_train)
print(f"Best parameter (CV score={search.best_score_:0.3f}):")
print(search.best_params_)

In [None]:
# Vectorizer with unigrams + bigrams, sublinear TF scaling and an unrestricted vocabulary.
# NOTE(review): these values and the classifier settings below are hard-coded; presumably
# they were copied from the grid-search output. Confirm they match search.best_params_.
tfidf_transformer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), max_features=None)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_tokenized)

lr_model = LogisticRegression(max_iter=1000, C=100, class_weight='balanced')
lr_model.fit(X_train_tfidf, Y_train)

### Q2b:

Error analysis - Manually examine the predictions of your optimised classifier on the test set.  Analyse the results for patterns and trends.  Hypothesise why common classification errors are made.  Report on your error analysis process and summarise your findings. 

In [None]:
# The vectorizer and classifier were fitted on spaCy-normalised text (X_train_tokenized),
# so the test bodies must go through the same text pipeline before being vectorized.
# Feeding the raw bodies would evaluate the model on differently preprocessed input.
X_test_tokenized = [text_pipeline_spacy(x) for x in X_test]
X_test_transformed = tfidf_transformer.transform(X_test_tokenized)

preds = lr_model.predict(X_test_transformed)
print(classification_report(Y_test, preds))


## Q3: Feature Engineering 

In this task your goal is to add two features to (try to) improve subreddit classification performance obtained in Q2.
You must implement and describe two new classifier features and add them to the tuned model from Q2. Examples include adding other properties of the posts, leveraging embedding-based features, different vectorization approaches, etc, (This is your chance to be creative!). As before, report the results in terms of evaluation metrics on the test data. Additionally, include a well-labelled confusion matrix and discuss the result in reference to Q2 and what helped (or didn’t) and why you think so. In summary: 


### Q3a:
Propose two features of your own, along with your rationale behind your choice. 

In [None]:
# New feature: length of each post body in characters, added to every split.
train_data['post_length'] = train_data['body'].map(len)
test_data['post_length'] = test_data['body'].map(len)
validation_data['post_length'] = validation_data['body'].map(len)

In [None]:
# Preview the training frame, now including the 'post_length' column.
train_data.head()

In [None]:
# New feature: body and title joined into one string (separated by a space),
# then normalised with the same spaCy text pipeline used for the bodies.
train_text = train_data['body'] + ' ' + train_data['title']
test_text = test_data['body'] + ' ' + test_data['title']
validation_text = validation_data['body'] + ' ' + validation_data['title']

X_train_combined = list(map(text_pipeline_spacy, train_text))
X_test_combined = list(map(text_pipeline_spacy, test_text))
X_validation_combined = list(map(text_pipeline_spacy, validation_text))

### Q3b:
Train, validate and test models that incorporate combinations of your features, and briefly report on the evaluation metrics 

In [None]:
# TF-IDF over the combined body + title text, fitted on the training split only.
tfidf_transformer_final = TfidfVectorizer(max_features=None, ngram_range=(1,2), sublinear_tf=True)
tfidf_transformer_final.fit(X_train_combined)

# NOTE: these assignments replace the token strings with their TF-IDF matrices, so this
# cell cannot be re-run without first re-running the cell that builds the combined text.
X_train_combined = tfidf_transformer_final.transform(X_train_combined)
X_test_combined = tfidf_transformer_final.transform(X_test_combined)
X_validation_combined = tfidf_transformer_final.transform(X_validation_combined)

# Show the matrix dimensions instead of densifying the whole sparse matrix for display.
X_train_combined.shape

In [None]:
import numpy as np
from scipy import sparse
from sklearn.preprocessing import normalize


def _append_length_feature(text_features, post_lengths):
    """Return ``text_features`` with ``post_lengths`` appended as one extra column.

    The result is a CSR sparse matrix. Stacking is done in sparse form, so no
    dense (documents x vocabulary) array is ever allocated.
    """
    length_column = sparse.csr_matrix(
        np.asarray(post_lengths, dtype=float).reshape(-1, 1)
    )
    return sparse.hstack([sparse.csr_matrix(text_features), length_column], format='csr')


# Final feature matrices: TF-IDF of body + title, plus the raw post length as the last column.
# NOTE(review): post_length is on a much larger scale than the TF-IDF weights; consider
# scaling it (e.g. log or max-abs) before fitting. Left unscaled here to keep the feature as designed.
XTRAIN = _append_length_feature(X_train_combined, train_data['post_length'])
XTEST = _append_length_feature(X_test_combined, test_data['post_length'])
XVALIDATION = _append_length_feature(X_validation_combined, validation_data['post_length'])

XTRAIN.shape


In [None]:
# Same classifier settings as the tuned Q2 model, now trained on the extended feature matrix.
# Note that this rebinds `lr_model`, replacing the Q2 model object.
final_lr_settings = {'C': 100, 'class_weight': 'balanced', 'max_iter': 1000}
lr_model = LogisticRegression(**final_lr_settings)
lr_model.fit(XTRAIN, Y_train)

In [None]:
# Predict test labels with the model trained on the extended features.
preds = lr_model.predict(XTEST)

In [None]:
# Per-class and averaged precision/recall/F1 for the test predictions.
print(classification_report(Y_test, preds))