In [None]:
!pip install spacy
!pip install sklearn
!pip install spacytextblob


import os
import csv
import spacy
import scipy
import pickle
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_extraction import DictVectorizer
from spacytextblob.spacytextblob import SpacyTextBlob
from sklearn.metrics import precision_score, recall_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0.post1-py3-none-any.whl size=2344 sha256=6f75d66b35dc53c041471e4fc5f64009c6955736a672b1c30194adb1b0064254
  Stored in directory: /root/.cache/pip/wheels/14/25/f7/1cc0956978ae479e75140219088deb7a36f60459df242b1a72
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0.post1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacytextblob
  Downloading spacytextblob-4.0.0-py3-none-any.whl (4.5 kB)
Installing collected packages: spacytextblob
Successfully installed spacy



In [None]:
#Path of the CSV file containing the data for the project.
#Change this value if the data is stored somewhere else.
FILEPATH = 'subreddit_irony_data.csv'

In [None]:
def load_data(filepath=None):
  """Read the project data from a CSV file into a pandas DataFrame.

  Input Parameter:
  filepath: optional path of the CSV file to read. When omitted (None), the
            module-level FILEPATH constant is used, so existing calls with
            no arguments behave as before.

  Returns the DataFrame produced by pandas.read_csv.
  """
  if filepath is None:
    # Resolved at call time so that changing FILEPATH after definition still takes effect.
    filepath = FILEPATH
  return pd.read_csv(filepath)

In [None]:
def get_ancestors(id, df, _path=None):
  """Collect the text of a comment and, recursively, of all of its ancestors.

  Input Parameter:
  id: Comment ID of a comment
  df: The original complete dataframe with all the data. The columns
      'comment_id', 'comment' and 'parent_id' are read.
  _path: internal use only - the comment IDs already being expanded on the
         current chain of recursive calls. Callers should not pass it.

  Returns a string made of the text of every row whose 'comment_id' equals id,
  followed by the text gathered for each of its parents. Each piece is preceded
  by a single space. An unknown id yields the empty string.
  """
  # IDs on the current recursion path. A parent that is already on the path
  # would make the recursion loop forever (e.g. a row that is its own parent),
  # so such parents are skipped below.
  path = (frozenset() if _path is None else _path) | {id}

  rows = df[df['comment_id'] == id]
  sen = ''
  parents = []
  # Iterate over column values directly so duplicate index labels cannot break the lookup.
  for comment, parent in zip(rows['comment'], rows['parent_id']):
    sen = sen + ' ' + comment
    # A comment ID may appear on several rows with different parents; keep each
    # parent once, in first-seen order, so the output is the same on every run.
    if not pd.isna(parent) and parent not in parents:
      parents.append(parent)

  #Recursively calling the function for all of the Parent IDs related to this comment ID.
  for pid in parents:
    if pid in path:
      continue
    sen = sen + ' ' + get_ancestors(pid, df, path)
  return sen

In [None]:
def nnp_sentence_prep(df):
  """Build one context string per labelled comment.

  Input Parameter:
  df: The original complete dataframe with all the data. The columns 'label',
      'comment', 'thread_title' and 'parent_id' are read.

  Returns a list of strings, one for every row that has a label, in row order.
  Each string is the comment text, a space, and the thread title; when the row
  has a parent id, a space and the text returned by get_ancestors for that
  parent are appended as well.
  """
  #Rows without a label contribute no sentence of their own
  labelled = df[df['label'].notna()]

  nnp_sentences = []
  for comment, title, parent in zip(labelled['comment'], labelled['thread_title'], labelled['parent_id']):
    text = comment + ' ' + title
    #Ancestors are looked up in the complete dataframe, so unlabelled parents are included too
    if not pd.isna(parent):
      text = text + ' ' + get_ancestors(parent, df)
    nnp_sentences.append(text)

  return nnp_sentences

In [None]:
def sentiment_gathering(raw_sentences):
  """Label every comment as positive ('+') or negative ('-').

  Input Parameter:
  raw_sentences: a list (or other sized iterable) of strings representing the comments

  Returns a list of strings with each item being the sentiment of the comment
  at that index: '+' when the polarity, rounded to 2 decimals, is greater than
  0, and '-' otherwise (so a polarity of exactly 0 is labelled '-').

  The parsed documents are cached in "parsed_sentiments.pickle" in the current
  working directory and reused on later calls.
  """
  cache_path = "parsed_sentiments.pickle"
  nlp = spacy.load('en_core_web_sm')
  #Used spacytextblob to make sentiment prediction for the comments.
  # NOTE(review): the pipeline is built before the cache is read, presumably because
  # adding the component is what makes `doc._.blob` available on unpickled docs - confirm.
  nlp.add_pipe('spacytextblob')

  parsed_sentences = None
  if os.path.exists(cache_path):
    # SECURITY: pickle.load can execute arbitrary code; only reuse a cache file
    # that this notebook wrote itself.
    with open(cache_path, 'rb') as cache_file:
      parsed_sentences = pickle.load(cache_file)
    # Partial guard against a stale cache: a cache written for a different
    # number of comments cannot belong to this input, so it is rebuilt.
    if len(parsed_sentences) != len(raw_sentences):
      parsed_sentences = None

  if parsed_sentences is None:
    parsed_sentences = [nlp(text) for text in raw_sentences]
    with open(cache_path, 'wb') as cache_file:
      pickle.dump(parsed_sentences, cache_file)

  sent = []
  for doc in parsed_sentences:
    polarity = round(doc._.blob.polarity, 2)
    #If polarity is greater than 0 then classify the sentiment of the sentence as positive else negative
    sent.append('+' if polarity > 0 else '-')

  return sent

In [None]:
def nnp_preprocessing(raw_sentences, sent, sub_redd):
  """Preprocess the data for the NNP+ model.

  Input Parameter:
  raw_sentences: a list of strings representing the comments concatenated with parents'(ancestors') comments
  sent: a list of strings with each item being the sentiment of the comment at that index.
  sub_redd: a list with each item being the sub-reddit that the comment at that index belongs to.

  Returns a list of token lists, one per sentence. For every token tagged 'NNP'
  the list holds the string "<lowercased lemma> <sentiment> <lowercased sub-reddit>";
  the sentence's sentiment is always appended as the last item.

  The parsed documents are cached in "parsed_sentences.pickle" in the current
  working directory and reused on later calls.
  """
  cache_path = "parsed_sentences.pickle"

  parsed_sentences = None
  if os.path.exists(cache_path):
    # SECURITY: pickle.load can execute arbitrary code; only reuse a cache file
    # that this notebook wrote itself.
    with open(cache_path, 'rb') as cache_file:
      parsed_sentences = pickle.load(cache_file)
    # Partial guard against a stale cache: a cache written for a different
    # number of sentences cannot belong to this input, so it is rebuilt.
    if len(parsed_sentences) != len(raw_sentences):
      parsed_sentences = None

  if parsed_sentences is None:
    #Tokenizing comment sentences with spaCy. The model is only loaded when
    #there is no usable cache, since it is not needed to read cached documents.
    nlp = spacy.load('en_core_web_sm')
    parsed_sentences = [nlp(text) for text in raw_sentences]
    with open(cache_path, 'wb') as cache_file:
      pickle.dump(parsed_sentences, cache_file)

  preproc_sentences = []
  for i, sentence in enumerate(parsed_sentences):
    #Using only tokens tagged as NNP, each rendered as ("NNP sentiment sub-reddit")
    preproc_tokens = [
      token.lemma_.lower() + ' ' + sent[i] + ' ' + str(sub_redd[i]).lower()
      for token in sentence
      if token.tag_ == 'NNP'
    ]
    #Adding sentiment as an independent feature at the end of the list for each sentence.
    preproc_tokens.append(sent[i])
    preproc_sentences.append(preproc_tokens)

  return preproc_sentences

In [None]:
def Baseline_preproc(raw_sentences):
  """Preprocess the data for the Baseline model.

  Input Parameter:
  raw_sentences: an iterable of comment texts. Each item is converted with
                 str() first, so non-string items are accepted.

  Returns a list of lists of strings obtained by splitting each item on white space.
  """
  tokenized = []
  for sentence in raw_sentences:
    tokenized.append(str(sentence).split())
  return tokenized

In [None]:
def featurize(preproc_X, dv=None, isTest = False):
  """Turn preprocessed sentences into uni-gram and bi-gram count features.

  Input Parameter:
  preproc_X: Preprocessed input for the model (a sequence of token sequences)
  dv: a DictVectorizer object. Passed only for the test data. Created during train data.
  isTest: a boolean representing if the data is train data or test data

  Returns a tuple of featurized input and the DictVectorizer object. For train
  data (isTest is False) a new DictVectorizer is fitted; otherwise the given dv
  is used to transform the data.
  """
  dicts = []
  for tokens in preproc_X:
    #Frequency of each uni-gram in the sentence
    counts = Counter(tokens)
    #Each pair of neighbouring tokens is counted under the key "left~right"
    for left, right in zip(tokens, tokens[1:]):
      counts[f"{left!s}~{right!s}"] += 1
    dicts.append(counts)

  dicts = np.array(dicts)
  if isTest is not False:
    return dv.transform(dicts), dv

  #Fit a fresh vectorizer on the train data and featurize it
  dv = DictVectorizer()
  X = dv.fit_transform(dicts)
  return X, dv

In [None]:
def create_classifier(alpha):
  """Create a Stochastic Gradient Descent model.

  Input Parameter:
  alpha: The constant that multiplies the regularization term.

  Returns the (untrained) SGDClassifier used for prediction. The loss is the
  logistic loss and the class weight is forced to be balanced.
  """
  import re
  import sklearn

  # scikit-learn renamed the logistic loss from 'log' to 'log_loss' in version 1.1
  # and removed the old name in 1.3, so pick the spelling the installed version accepts.
  match = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
  if match and (int(match.group(1)), int(match.group(2))) < (1, 1):
    loss = 'log'
  else:
    loss = 'log_loss'

  clf = SGDClassifier(loss=loss, class_weight='balanced', alpha=alpha)
  return clf

In [None]:
def evaluate_model(X, y_true, model):
  """Evaluate the model.

  Input Parameter:
  X: Test Data
  y_true: Labels
  model: the model to be evaluated (its predict method is called on X)

  Returns a tuple of precision and recall of the model
  """
  #The same predictions feed both metrics
  predictions = model.predict(X)
  return precision_score(y_true, predictions), recall_score(y_true, predictions)

In [None]:
def run_rkfold_crossval(X, y, k=5, reps=2, alpha=1e-2):
  """Run repeated KFold cross validation on the data.

  Input Parameter:
  X: Pre-processed input data
  y: Labels
  k: Number of folds the data is supposed to be split into
  reps: Number of repetitions of KFold
  alpha: alpha parameter for the model

  Returns a tuple of a list of precision scores and a list of recall scores of
  the model, with one entry per train/test split (reps * k entries in total).
  """
  prec_score = []
  rec_score = []
  #Initialise RepeatedKFold object
  rkf = RepeatedKFold(n_splits=k, n_repeats=reps, random_state=2652124)
  #Convert once, outside the loop, so the data can be indexed with the fold index arrays
  X_all = np.array(X, dtype=object)
  y_all = np.array(y, dtype=object)
  #Create untrained model with chosen alpha
  # NOTE(review): the same estimator object is re-fitted on every split; this relies on
  # fit() starting from scratch each time - confirm for the estimator in use.
  clf_u = create_classifier(alpha)

  #Each repetition of KFold yields k train/test splits, so the loop body runs (reps * k) times
  for train_index, test_index in rkf.split(X_all):
    #k-1 portions are used for train and 1 for test
    X_train1, X_test1 = X_all[train_index], X_all[test_index]
    y_train, y_test = y_all[train_index], y_all[test_index]

    #Getting training and testing features from the data; the vectorizer fitted
    #on the train portion is reused for the test portion
    X_train_feat_spr, dv = featurize(X_train1)
    X_train_feat = X_train_feat_spr.toarray()
    X_test_feat_spr, _ = featurize(X_test1, dv, True)
    X_test_feat = X_test_feat_spr.toarray()

    #Train the model using training features and labels
    clf = clf_u.fit(X_train_feat, list(y_train))

    #Evaluate the model using test features and labels to get precision and recall
    prec, rec = evaluate_model(X_test_feat, list(y_test), clf)

    #Keep track of all the precision and recall
    prec_score.append(prec)
    rec_score.append(rec)

  return prec_score, rec_score

In [None]:
def calc_stats(score):
  """Calculate summary statistics for a given list of metric scores.

  Input Parameter:
  score: list of metric scores

  Returns a tuple of mean, median, 25th percentile, 75th percentile values of the list of metric scores
  """
  summary = [np.mean(score), np.median(score)]
  #Lower and upper quartile, in that order
  summary.extend(np.percentile(score, q) for q in (25, 75))
  return tuple(summary)

In [None]:
def main():
  """Train and evaluate the Bag of Words baseline and the NNP sentiment model.

  Loads the data, runs repeated KFold cross validation for both models and
  prints, for each model, the mean, median, 25th percentile and 75th
  percentile of the precision and of the recall scores.
  """
  #Number of folds the data is supposed to be split into
  n_folds = 5
  #Repetitions of KFold for the BOW model: (4 * 5) = 20 train/test splits in total
  bow_reps = 4
  #alpha parameter for the BOW model
  bow_alpha = 1e-2

  #Complete data, and the subset of rows that carry a label
  all_rows = load_data()
  labelled_rows = all_rows[all_rows['label'].notna()]

  #Comments that have a label, and the labels themselves
  raw_sentences = labelled_rows['comment']
  labels = labelled_rows['label']

  #Baseline BOW model: preprocess, cross-validate, summarise
  bow_tokens = Baseline_preproc(raw_sentences)
  bow_prec, bow_rec = run_rkfold_crossval(bow_tokens, labels, k=n_folds, reps=bow_reps, alpha=bow_alpha)
  bow_prec_stats = calc_stats(bow_prec)
  bow_rec_stats = calc_stats(bow_rec)

  #Repetitions of KFold for the NNP model: (10 * 5) = 50 train/test splits in total
  nnp_reps = 10
  #alpha parameter for the NNP model
  nnp_alpha = 1e-1
  #Sentiments for the sentences that have a label associated with them
  sent = list(sentiment_gathering(raw_sentences))
  #The entire dataframe (including rows with NA labels) is used here so that the
  #text of unlabelled parents can be appended to each labelled comment
  nnp_raw_sentences = nnp_sentence_prep(all_rows)
  #Subreddit which each labelled comment belongs to
  sub_redd = list(labelled_rows['subreddit'])

  #NNP model: preprocess, cross-validate, summarise
  nnp_tokens = nnp_preprocessing(nnp_raw_sentences, sent, sub_redd)
  nnp_prec, nnp_rec = run_rkfold_crossval(nnp_tokens, labels, k=n_folds, reps=nnp_reps, alpha=nnp_alpha)
  nnp_prec_stats = calc_stats(nnp_prec)
  nnp_rec_stats = calc_stats(nnp_rec)

  #Printing results: a title, then one line of four 2-decimal values per metric
  def report(title, prec_stats, rec_stats):
    print(title)
    for heading, stats in (("Precision", prec_stats), ("Recall", rec_stats)):
      print(heading)
      print(*("%.2f" % value for value in stats))

  report("Bag of Words Baseline", bow_prec_stats, bow_rec_stats)
  report("NP Sentiment Context Model", nnp_prec_stats, nnp_rec_stats)


if __name__ == "__main__":
  main()

Bag of Words Baseline
Precision
0.09 0.09 0.06 0.13
Recall
0.14 0.13 0.08 0.17
NP Sentiment Context Model
Precision
0.15 0.15 0.11 0.18
Recall
0.31 0.31 0.26 0.37


# <u>Output Metrics of the 2 Models:</u>

## <u>Bag of Words Baseline:</u>
### Precision
Mean: 0.09 | Median: 0.09 | 25th Percentile: 0.06 | 75th Percentile: 0.13
### Recall
Mean: 0.14 | Median: 0.13 | 25th Percentile: 0.08 | 75th Percentile: 0.17

## <u>NP Sentiment Context Model:</u>
### Precision
Mean: 0.15 | Median: 0.15 | 25th Percentile: 0.11 | 75th Percentile: 0.18
### Recall
Mean: 0.31 | Median: 0.31 | 25th Percentile: 0.26 | 75th Percentile: 0.37