## Please complete before submission:

**Name:** Shambhavi Singh

**Student Number:** 2711327S

# Text As Data Coursework

Please fill in this Google Colab by following the prompts from the coursework specification document and inserting your code in each relevant section.

- You should submit this notebook together with your report. (Two separate files)
- Specifically, you will submit **both a PDF with the report (so we can easily read it) and a .ipynb file containing the source code of your experiments as evidence**. Please clean up your code where possible before submitting it.
- You may add whatever additional code and text blocks as needed (perhaps with [nice formatting](https://colab.research.google.com/notebooks/markdown_guide.ipynb)). Please keep the major headings (for the question numbers) the same.

As with labs, please remember to **Save a Copy to Drive** when you start working on this so that it is saved. Completing the labs provides essential knowledge for the successful completion of the coursework. 

## Downloading and loading Data

This code loads the prepared split of the Reddit data into training, validation and testing set.

In [3]:
!wget -O reddit_data_split.zip https://gla-my.sharepoint.com/:u:/g/personal/jake_lever_glasgow_ac_uk/EapVNOIV84tPnQuuFBNgG9UBYIWipQ9JL4QTfSgRtIacBw?download=1
!unzip -o reddit_data_split.zip

zsh:1: no matches found: https://gla-my.sharepoint.com/:u:/g/personal/jake_lever_glasgow_ac_uk/EapVNOIV84tPnQuuFBNgG9UBYIWipQ9JL4QTfSgRtIacBw?download=1
unzip:  cannot find or open reddit_data_split.zip, reddit_data_split.zip.zip or reddit_data_split.zip.ZIP.


In [2]:
import json

def _load_split(path):
    """Parse one JSON split file and return its contents."""
    # JSON files are UTF-8; without an explicit encoding open() falls back to
    # the platform locale, which can mis-decode or reject non-ASCII post text.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


train_data = _load_split('reddit_train.json')
validation_data = _load_split('reddit_val.json')
test_data = _load_split('reddit_test.json')

print("Number of posts in training data:", len(train_data))
print("Number of posts in validation data:", len(validation_data))
print("Number of posts in test data:", len(test_data))

FileNotFoundError: [Errno 2] No such file or directory: 'reddit_train.json'

## Q1:

### Q1a:

In [None]:
#Grouping the dataset according to the subreddit
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Wrap each split in a DataFrame so it can be grouped by subreddit.
train_data_df = pd.DataFrame(train_data)
validation_data_df = pd.DataFrame(validation_data)
test_data_df = pd.DataFrame(test_data)

# Per-subreddit counts of non-null values in every column.
train_data_df1 = train_data_df.groupby(["subreddit"]).count()
validation_data_df1 = validation_data_df.groupby(["subreddit"]).count()
test_data_df1 = test_data_df.groupby(["subreddit"]).count()

# Report the count of titles per subreddit for each split.
for heading, counts in (
    ("\nTrain data: ", train_data_df1),
    ("\nValidation data: ", validation_data_df1),
    ("\nTest data: ", test_data_df1),
):
    print(heading)
    print(counts['title'])


### Q1b:

Preprocessing

In [None]:
import spacy

# Load spaCy's small English model (`en_core_web_sm`) with the NER component disabled.
# The commented-out command below would download the medium model (`en_core_web_md`),
# which is NOT the model loaded here.
#!python -m spacy download en_core_web_md

nlp = spacy.load('en_core_web_sm', disable=['ner'])
# Remove the tagger and parser from the pipeline; the tokenisation function in this
# notebook only reads each token's lemma, stop-word flag and whitespace flag.
# NOTE(review): on some spaCy versions the lemmatizer relies on the tagger's output --
# confirm that `lemma_` is still populated once the tagger is removed.
nlp.remove_pipe('tagger')
nlp.remove_pipe('parser')

# Download the NLTK stopword list.
# NOTE(review): the visible cells filter stop words with spaCy's `is_stop`;
# confirm that this NLTK list is actually used somewhere.
import nltk
nltk.download('stopwords')

In [None]:
# Tokenise and normalise raw text with the spaCy pipeline
def text_pipeline_spacy(text):
    """Return the lower-cased lemmas of `text`, skipping stop words and whitespace tokens."""
    return [
        token.lemma_.lower()
        for token in nlp(text)
        if not (token.is_stop or token.is_space)
    ]


Creating the evaluation summary function.

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score

def evaluation_summary(description, true_labels, predictions):
  """Print accuracy, weighted precision/recall, a per-class report and the confusion matrix.

  Args:
    description: Name of the classifier/configuration, used in the printed header.
    true_labels: Gold labels.
    predictions: Predicted labels, aligned element-wise with `true_labels`.
  """
  print("Evaluation for: " + description)
  # scikit-learn metrics take (y_true, y_pred) in that order. Passing them the
  # other way round swaps the roles of precision and recall and weights the
  # average by predicted-class frequency instead of true-class support.
  # zero_division=0 matches the classification report below for classes that
  # are never predicted.
  precision = precision_score(true_labels, predictions, average='weighted', zero_division=0)
  recall = recall_score(true_labels, predictions, average='weighted', zero_division=0)
  accuracy = accuracy_score(true_labels, predictions)
  print("Classifier '%s' has Acc=%0.3f P=%0.3f R=%0.3f" % (description,accuracy,precision,recall))

  print(classification_report(true_labels, predictions,  digits=3, zero_division=0))
  print('\nConfusion matrix:\n',confusion_matrix(true_labels, predictions)) # Note the order here is true, predicted


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary bag-of-words ("one-hot") vectoriser that tokenises with the spaCy pipeline.
one_hot_vectorizer = CountVectorizer(tokenizer=text_pipeline_spacy, binary=True)

# Target labels (the subreddit of each post) for every split.
train_labels, validation_labels, test_labels = (
    frame['subreddit']
    for frame in (train_data_df, validation_data_df, test_data_df)
)

# The vocabulary is learnt from the training bodies only; the validation and
# test bodies are projected onto that same vocabulary.
train_features = one_hot_vectorizer.fit_transform(train_data_df['body'].tolist())
validation_features = one_hot_vectorizer.transform(validation_data_df['body'].tolist())
test_features = one_hot_vectorizer.transform(test_data_df['body'].tolist())

Implementation of the classifiers :-   
  
(i) Dummy Classifier with strategy = "most_frequent"

In [None]:
from sklearn.dummy import DummyClassifier

# Majority-class baseline, fitted on the training split.
mf_dummy = DummyClassifier(strategy='most_frequent').fit(train_features, train_labels)

# Predict on both the test split and the training split.
mf_dummy_predicted_labels = mf_dummy.predict(test_features)
mf_dummy_predicted_labels_train = mf_dummy.predict(train_features)

# Report test performance first, then training performance.
evaluation_summary("Dummy Majority (Test Data)", test_labels, mf_dummy_predicted_labels)
evaluation_summary("Dummy Majority (Train Data)", train_labels, mf_dummy_predicted_labels_train)


(ii) Dummy Classifier with strategy = "stratified"

In [None]:
# Baseline that samples predictions from the training label distribution.
dummy_prior = DummyClassifier(strategy='stratified')

#Training on the train data
dummy_prior.fit(train_features, train_labels)

#Evaluation on the test data
dummy_prior_predicted_labels = dummy_prior.predict(test_features)
evaluation_summary("Dummy Prior (Test Data)", test_labels, dummy_prior_predicted_labels)

#Evaluation on train data
# Use the stratified classifier here (not the majority-class `mf_dummy`) and
# label the output accordingly, so the numbers describe this baseline.
dummy_prior_predicted_labels_train = dummy_prior.predict(train_features)
evaluation_summary("Dummy Prior (Train Data)", train_labels, dummy_prior_predicted_labels_train)


LogisticRegression with One-hot vectorization

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings on the one-hot features.
lr = LogisticRegression()
lr_model = lr.fit(train_features, train_labels)

# Predictions for the held-out test split and for the training split itself.
lr_predicted_labels = lr_model.predict(test_features)
lr_predicted_labels_train = lr_model.predict(train_features)

# Report test performance first, then training performance.
evaluation_summary("LR onehot (Test Data)", test_labels, lr_predicted_labels)
evaluation_summary("LR onehot (Train Data)", train_labels, lr_predicted_labels_train)


LogisticRegression with TF-IDF vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy

# TF-IDF vectoriser that tokenises with the spaCy pipeline.
# NOTE(review): `binary=True` clips term counts to 0/1 before the idf weighting,
# which is not the TfidfVectorizer default -- confirm this is intended.
tf_vectorizer = TfidfVectorizer(tokenizer=text_pipeline_spacy, binary=True)

# Fit the vocabulary and idf weights on the training bodies only, then apply
# them to the validation and test bodies.
train_features_lr = tf_vectorizer.fit_transform(train_data_df['body'].tolist())
validation_features_lr = tf_vectorizer.transform(validation_data_df['body'].tolist())
test_features_lr = tf_vectorizer.transform(test_data_df['body'].tolist())

# Logistic regression with default settings on the TF-IDF features.
lr = LogisticRegression()
lr_model = lr.fit(train_features_lr, train_labels)

# Predictions for the held-out test split and for the training split itself.
lr_predicted_labels_tf_idf = lr_model.predict(test_features_lr)
lr_predicted_labels_train = lr_model.predict(train_features_lr)

# Report test performance first, then training performance.
evaluation_summary("LR TF-IDF (Test Data)", test_labels, lr_predicted_labels_tf_idf)
evaluation_summary("LR TF-IDF (Train Data)", train_labels, lr_predicted_labels_train)



SVC Classifier with One-hot vectorization (SVM with RBF kernel, default settings)

In [None]:
from sklearn import svm

# RBF-kernel SVM on the one-hot features.
# NOTE: `lr` and `lr_model` are rebound to the SVM here, replacing the
# logistic-regression objects created in earlier cells.
lr = svm.SVC(kernel='rbf')
lr_model = lr.fit(train_features, train_labels)

# Predictions for the held-out test split and for the training split itself.
svc_predicted_labels = lr_model.predict(test_features)
lr_predicted_labels_train = lr_model.predict(train_features)

# Report test performance first, then training performance.
evaluation_summary("SVC onehot (Test Data)", test_labels, svc_predicted_labels)
evaluation_summary("SVC onehot (Train Data)", train_labels, lr_predicted_labels_train)


Plotting the F1 score per subreddit for the best-performing classifier, i.e. Logistic Regression with TF-IDF vectorization.

In [None]:
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

#Graph plot for the best performing classifier above
# Derive the class names from the data instead of hard-coding them, and pass
# them to f1_score explicitly so every bar is guaranteed to carry the name of
# the class its score belongs to.
labels = sorted(
    {str(label) for label in test_labels}
    | {str(label) for label in lr_predicted_labels_tf_idf}
)
# scikit-learn metrics take (y_true, y_pred) in that order.
f1score = f1_score(test_labels, lr_predicted_labels_tf_idf, labels=labels, average=None)

fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111)
ax.bar(labels, f1score)
# Rotate the existing tick labels rather than overwriting them with
# set_xticklabels, which requires the tick positions to be fixed first.
ax.tick_params(axis='x', labelrotation=90)
plt.title("Graph plot for the best performing classifier")
plt.xlabel("subreddit", fontsize=15)
plt.ylabel("F1-score", fontsize=15)
plt.tight_layout()

### Q1c:

Own classifier/tokenization/normalisation approach:

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes trained on the TF-IDF features (`train_features_lr`).
classifier = MultinomialNB()

#Training on the train data
nb_model = classifier.fit(train_features_lr, train_labels)

#Evaluation on the test data
# evaluation_summary expects (description, true_labels, predictions); passing
# the predictions first transposes the report and the confusion matrix.
# The description names the TF-IDF features that are actually used here.
test_predicted_labels = nb_model.predict(test_features_lr)
evaluation_summary("TF-IDF Naive Bayes (Test Data)", test_labels, test_predicted_labels)

#Evaluation on the train data
train_predicted_labels = nb_model.predict(train_features_lr)
evaluation_summary("TF-IDF Naive Bayes (Train Data)", train_labels, train_predicted_labels)


## Q2:

### Q2a:

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class Selector(BaseEstimator, TransformerMixin):
    """Transformer that picks a single entry out of keyed data.

    Used as the first step of a pipeline so that the following steps only
    receive the value stored under `key` (for example one DataFrame column).
    """

    def __init__(self, key):
        # Key used to index the data handed to `transform`.
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learnt from the data; return the transformer unchanged."""
        return self

    def transform(self, data_dict):
        """Return the entry of `data_dict` stored under `self.key`."""
        selected = data_dict[self.key]
        return selected

In [None]:
from sklearn.pipeline import Pipeline

# Body text -> TF-IDF features -> logistic regression, all with default settings.
prediction_pipeline = Pipeline(steps=[
    ('selector', Selector(key='body')),   # pick the 'body' entry of the input
    ('tf-idf', TfidfVectorizer()),        # default tokenisation and weighting
    ('logreg', LogisticRegression()),     # classifier tuned by the grid search
])

In [None]:
import numpy as np 
from sklearn.model_selection import GridSearchCV
# Hyperparameter grid: only the regularisation strength C is varied
# (seven values from 1e-3 to 1e3); every other entry has a single value.
params = {
    'logreg__penalty': ['l2'],
    'logreg__C': np.logspace(-3,3,7),
    'logreg__solver': ['liblinear'],
    'logreg__max_iter': [10000],
    'tf-idf__max_features': [10000],
    'tf-idf__sublinear_tf': [True],
}

grid_search = GridSearchCV(
    prediction_pipeline,
    param_grid=params,
    n_jobs=1,
    verbose=1,
    scoring='f1_macro',
    cv=2,
)

print("Performing grid search...")
print("pipeline:", [step[0] for step in prediction_pipeline.steps])
print("parameters:")
print(params)

# The search is fitted (2-fold cross-validation) on the validation split; the
# refitted best model then predicts labels for the test split.
grid_model = grid_search.fit(validation_data_df, validation_labels)
grid_predicted_labels = grid_model.predict(test_data_df)

print("Best score: %0.3f" % grid_model.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(params):
  print("\t%s: %r" % (param_name, best_parameters[param_name]))

Testing the model again after tuning the parameters.

In [None]:
#Creating a TF-IDF vectorizer with the tuned TF-IDF settings
tf_vectorizer = TfidfVectorizer(tokenizer=text_pipeline_spacy, max_features=10000,sublinear_tf=True)

# This creates input features for our classification on all subsets of our collection.
train_features_lr = tf_vectorizer.fit_transform(train_data_df['body'].tolist())
validation_features_lr = tf_vectorizer.transform(validation_data_df['body'].tolist())
test_features_lr = tf_vectorizer.transform(test_data_df['body'].tolist())

# Reuse the logistic-regression hyperparameters chosen by the grid search
# (keys such as 'logreg__C' become the keyword argument 'C'). A bare
# LogisticRegression() here would evaluate the default model, not the tuned one.
tuned_logreg_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search.best_params_.items()
    if name.startswith('logreg__')
}
lr = LogisticRegression(**tuned_logreg_params)

#Training on the train data
lr_model = lr.fit(train_features_lr, train_labels)

#Evaluation on the test data
lr_predicted_labels_tf_idf = lr_model.predict(test_features_lr)
evaluation_summary("LR TF-IDF (Test Data) After Tuning", test_labels, lr_predicted_labels_tf_idf)

#Evaluation on the train data
lr_predicted_labels_train = lr_model.predict(train_features_lr)
evaluation_summary("LR TF-IDF (Train Data) After Tuning", train_labels, lr_predicted_labels_train)


### Q2b:

Error Analysis: Explored the mismatch for labels between the predictions and true values.

In [None]:
import pandas as pd

# Counting label mismatches. Code reference: https://datascience.stackexchange.com/questions/37899/sklearn-svm-how-to-get-a-list-of-the-wrong-predictions

# Plain-list copies of the predicted and gold test labels.
predicted_list = lr_predicted_labels_tf_idf.tolist()
label_test_list = test_labels.tolist()

# Collect the predicted label of every test post that was classified wrongly.
mismatch = [
    guess
    for guess, gold in zip(predicted_list, label_test_list)
    if guess != gold
]
print("Mismatch has occured for the these labels:",mismatch)
print("Count of the labels mimatched:",len(mismatch))

# Attach the predictions to a copy of the test frame and keep only the rows
# whose prediction differs from the true subreddit.
predicted_list1 = pd.Series(lr_predicted_labels_tf_idf)
dummy_df = test_data_df.assign(predict=predicted_list1.values)
dummy_df = dummy_df.loc[dummy_df['subreddit'] != dummy_df['predict']]
dummy_df[['subreddit', 'predict','body']]

## Q3:

### Q3a:

Features selected to add to the classifiers, and the reasons behind their selection:

**title** : The title of the post. This is interrelated with body, so it may help in the training of the model.

**author** : The username of the poster. The same sentiment polarity may appear when the same user posts again.

### Q3b:

In [None]:
# Feature-union pipelines: body combined with title, with author, and with both
from sklearn.pipeline import FeatureUnion, Pipeline

def _tfidf_feature_union(*columns):
    """Build a pipeline that TF-IDF-vectorises each named column and concatenates the results.

    Args:
      *columns: Column names, in the order their feature blocks should appear.
        Each name is used both as the Selector key and as the branch name
        inside the FeatureUnion.

    Returns:
      A Pipeline with a single 'union' step (a FeatureUnion holding one
      selector -> tfidf branch per column).
    """
    branches = [
        (column, Pipeline([
            ('selector', Selector(key=column)),
            # A fresh vectoriser per branch, so each column learns its own vocabulary.
            ('tfidf', TfidfVectorizer(tokenizer=text_pipeline_spacy, sublinear_tf=True, max_features=10000, max_df=1200)),
        ]))
        for column in columns
    ]
    return Pipeline([
        ('union', FeatureUnion(transformer_list=branches)),
    ])


# Feature sets to compare: body text combined with the title, the author, or both.
prediction_pipeline_union = {
    'title_feature': _tfidf_feature_union('title', 'body'),
    'author_feature': _tfidf_feature_union('author', 'body'),
    'combined_feature': _tfidf_feature_union('title', 'author', 'body'),
}

In [None]:
import warnings

# NOTE: this silences every warning for the rest of the session, including any
# solver convergence warnings raised by the logistic regression below.
warnings.filterwarnings('ignore')
# `i` is the name of the feature set (the key of `prediction_pipeline_union`).
for i, feature_pipeline in prediction_pipeline_union.items():
  print(i)
  # fit_transform fits the vectorisers and builds the training matrix in one
  # pass, so the training text is tokenised only once.
  tfidf_train = feature_pipeline.fit_transform(train_data_df)
  tfidf_test = feature_pipeline.transform(test_data_df)

  lr_tuned = LogisticRegression(C=1000).fit(tfidf_train, train_labels)
  predicted = lr_tuned.predict(tfidf_test)
  # evaluation_summary expects (description, true_labels, predictions).
  evaluation_summary("\nLogistic Regession with TFIDF feature tuning", test_labels, predicted)

Adding only one feature with body

In [None]:
# add only title
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from sklearn.compose import ColumnTransformer

# Text columns to vectorise: the post body plus its title.
text_features = ['body','title']
text_transformer = TfidfVectorizer(tokenizer=text_pipeline_spacy,sublinear_tf=True, max_features=10000, max_df=1200)

# One TF-IDF entry per text column, named tfidf_1, tfidf_2, ... in column order;
# every column not listed in `text_features` is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        (f'tfidf_{position}', text_transformer, column)
        for position, column in enumerate(text_features, start=1)
    ],
    remainder='drop',
)


## Fit a classifier on top of the column preprocessor and report test performance
def evaluateClassifier(classif):
  """Train `classif` on the training split and print its test classification report."""
  steps = [('preprocessor', preprocessor), ('classifier', classif)]
  clf = Pipeline(steps=steps)

  # Pipeline.fit returns the fitted pipeline, so predict can be chained onto it.
  y_pred = clf.fit(train_data_df, train_labels).predict(test_data_df)
  print(metrics.classification_report(test_labels, y_pred, zero_division=0))

evaluateClassifier(LogisticRegression(solver='saga', max_iter = 1000, C=100))



Adding two features (title and author) alongside body

In [None]:
# add title and author
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from sklearn.compose import ColumnTransformer

# Text columns to vectorise: the post body, its title and the author name.
text_features = ['body','title','author']
text_transformer = TfidfVectorizer(tokenizer=text_pipeline_spacy,sublinear_tf=False, max_features=3000, max_df=1200)

# One TF-IDF entry per text column, named tfidf_1, tfidf_2, ... in column order;
# every column not listed in `text_features` is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        (f'tfidf_{position}', text_transformer, column)
        for position, column in enumerate(text_features, start=1)
    ],
    remainder='drop',
)


## Fit a classifier on top of the column preprocessor and report test performance
def evaluateClassifier(classif):
  """Train `classif` on the training split and print its test classification report."""
  steps = [('preprocessor', preprocessor), ('classifier', classif)]
  clf = Pipeline(steps=steps)

  # Pipeline.fit returns the fitted pipeline, so predict can be chained onto it.
  y_pred = clf.fit(train_data_df, train_labels).predict(test_data_df)
  print(metrics.classification_report(test_labels, y_pred, zero_division=0))

evaluateClassifier(LogisticRegression(solver='saga', max_iter = 1000, C=100))



### Q3c: