In [1]:
import pandas as pd
import numpy as np

In [2]:
# Preprocessing of Data

import spacy
import numpy as np
nlp = spacy.load("en_core_web_sm",disable=["tagger", "parser"])
def preprocess(text):
    # TODO: Replace the next line with your own code.
    doc = nlp(text)
    token_list = []
    for token in doc:
        if token.is_stop == False and token.lemma_.isalpha() and len(token) > 3:
            token_list.append(token.lemma_)
    return(token_list)

In [4]:
def datareadiness(data,preproflag):
  # Removing null rows
  ind = np.where(data['body'].isnull() == True)[0].tolist()
  data = data.drop(ind)
  data = data.reset_index()

  # Preprocessing the text
  if preproflag=='Y':
    for i in range(len(data)):
      data['body'][i] = " ".join(preprocess(data['body'][i]))

    return data
  else:
    return data
  



In [15]:
from typing import Tuple, List
from sklearn.metrics import classification_report

class BiasClassifier:
    def __init__(self,model,vectorizer):
        self.model = model
        self.vectorizer = vectorizer
        
    
    def fit(self, train_file_path):
        """Train a classifier model after reading and extracting features from 
        train_file_path.

        Args:
            train_file_path: String path to the training data as a json, you 
            may assume instances have labels.

        Returns:
            A tuple of list of document id and prediction label and 
        """
        # TODO write code to extract features from train_file_path and 
        # train the model
        self.train_file_path = train_file_path
        train_data = pd.read_json(self.train_file_path)
        newtrain_data = datareadiness(train_data,'N')

        trainX = newtrain_data.body.values
        trainY = pd.factorize(newtrain_data.bias.values)[0]

        # Tranforming the text into vectorized form using CountVectorizer() and creating a pipeline of Vectorzer and CLassification method

        pipe = Pipeline([('countvectorizer', self.vectorizer),('XGB',self.model)])
        self.model_fit = pipe.fit(trainX,trainY)

        return self.model_fit
    
    def eval(self, val_file_path, model_fit):
    # -> Tuple[List[Dict[str, float]], classification_report]:
        """Evaluates the test data given in test_file_path after reading and
         extracting features.

        Args:
            test_file_path: String path to the test data, you may assume 
            instances have labels.

        Returns:
            A tuple of list of document id and prediction label and 
            evaluation summary in the form of sklearn classification_report.
        """
        # TODO write code to extract features from test_file_path and 
        # test the model
        self.val_file_path = val_file_path
        self.model_fit = model_fit
        val_data = pd.read_json(self.val_file_path)
        newval_data = datareadiness(val_data,'N')

        valX = newval_data.body.values
        valY = pd.factorize(newval_data.bias.values)[0]

        # Predicting the bias for validation dataset

        self.predeval = self.model_fit.predict(valX)
        self.predorglabel = ['Right' if v == 0 else 'Left' if v == 2 else 'Center' for v in self.predeval]
        self.dicteval = {}
        for i in range(len(self.predorglabel)):
          self.dicteval[newval_data['id'][i]] = self.predorglabel[i]
        
        self.summaryeval = classification_report(valY,self.predeval, target_names=['Right','Left','Center'])
        self.oplist = [self.dicteval,self.summaryeval]
        self.optuple = (self.oplist)
        return self.optuple



    def predict(self, test_file_path,model_fit): 
    # -> List[Dict[str, float]]:
        """Evaluates the test data given in test_file_path after reading and
         extracting features.

        Args:
            test_file_path: String path to the test data, the instances do not 
            have labels.

        Returns:
            A list of document id and prediction label.
        """
        # TODO write code to extract features from test_file_path and 
        # predict the labels for the model.
        self.test_file_path = test_file_path
        self.model_fit = model_fit
        test_data = pd.read_json(self.test_file_path)
        newtest_data = datareadiness(test_data,'N')

        testX = newtest_data.body.values
        # testY = pd.factorize(newtest_data.bias.values)[0]

        # Predicting the bias for validation dataset

        self.predtest = self.model_fit.predict(testX)
        self.predtestorglabel = ['Right' if v == 0 else 'Left' if v == 2 else 'Center' for v in self.predeval]
        self.dicttest = {}
        for i in range(len(self.predtestorglabel)):
          self.dicttest[newtest_data['id'][i]] = self.predtestorglabel[i]
        
        # self.summaryeval = classification_report(testY,self.predeval, target_names=['Right','Left','Center'])

        return [self.dicttest]


In [16]:
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

model = XGBClassifier()
vectorizer = CountVectorizer(ngram_range=(1,2))

clf = BiasClassifier(model,vectorizer)
model_fit = clf.fit(train_file_path = '/content/bias_articles_train.json')
opeval = clf.eval(val_file_path='/content/bias_articles_dev.json',model_fit = model_fit)
optest = clf.predict(test_file_path='/content/bias_articles_test.json',model_fit = model_fit)

In [17]:
print('Classification report of Validation dataset\n\n',opeval[1])

Classification report of Validation dataset

               precision    recall  f1-score   support

       Right       0.71      0.33      0.45        15
        Left       0.33      0.29      0.31         7
      Center       0.35      0.75      0.48         8

    accuracy                           0.43        30
   macro avg       0.47      0.46      0.41        30
weighted avg       0.53      0.43      0.43        30



In [18]:
print('Dictionary of document id of validation dataset and predicted labels \n',opeval[0])

Dictionary of documente id of validation dataset and predicted labels 
 {240: 'Right', 241: 'Left', 242: 'Left', 243: 'Left', 244: 'Center', 245: 'Center', 246: 'Left', 247: 'Left', 248: 'Left', 249: 'Right', 250: 'Left', 251: 'Right', 252: 'Center', 253: 'Left', 254: 'Left', 255: 'Left', 256: 'Left', 257: 'Center', 258: 'Center', 259: 'Left', 260: 'Center', 261: 'Left', 262: 'Right', 263: 'Right', 264: 'Left', 265: 'Left', 266: 'Left', 267: 'Left', 268: 'Right', 269: 'Right'}


In [19]:
print('Dictionary of document id of test dataset and predicted labels \n',optest)

Dictionary of document id of test dataset and predicted labels 
 [{270: 'Right', 271: 'Left', 272: 'Left', 273: 'Left', 274: 'Center', 275: 'Center', 276: 'Left', 277: 'Left', 278: 'Left', 279: 'Right', 280: 'Left', 281: 'Right', 282: 'Center', 283: 'Left', 284: 'Left', 285: 'Left', 286: 'Left', 287: 'Center', 288: 'Center', 289: 'Left', 290: 'Center', 291: 'Left', 292: 'Right', 293: 'Right', 294: 'Left', 295: 'Left', 296: 'Left', 297: 'Left', 298: 'Right', 299: 'Right'}]


The text classification task was handled in two directions.
This notebook (FactiversePrimary.ipynb) contains the final implementation of the best performing classifier method after hyper parameter tuning. Classifier training and prediction has been implemented in the BiasClassifier class as per the skeleton code given. Some slight modifications has been done to the skeleton code based on my understanding of OOP in python. The results of the final model are printed as in this notebook for your reference.

The second notebook (FactiverseSecondary.ipynb) contains the following,
* Initial data visulalization to check if the dataset is balanced amongst 3 classes. We can see from the histogram that count of speeches with 'Center' as the label is almost double compared to equally distributed count of speeches with label 'Right' and 'Left'. 
Since the total count of training dataset itself is very small (239), the need for balancing dataset with same count of speeches with their respective lables is not suggested (However, later in the training phase with multiple classifiers, after using the balanced dataset, the accuracy did not improve).

* The need for preprocessing the data in terms of sentence lemmatization, removing stop words, removing null records was carried out. The classifiers were trained first on original texts and later on preprocessed texts to find out which resulted in better accuracy.

* The initial approach in solving text classification problem was to try the baseline ML classification techniques. However, since the data is in the text format, the same had to be transformed or vectorized to train a classifier. Couuntvectorizer() method of transformation was choosed since it widely used in the of NLP and and one of the effective baseline method which uses word level tokenization. The numerical vector for each word is based on count of each word present in the sentence. The default hyper parameters were used except for ngram_range(since the results were better). Once the words in the speeches were transformed, the response labels were factorized to 0,1,2 from 'Right,Left,Center). A pipeline was built consisting of countvectorizer for transforming the data and the classifier.
Baseline methods like Logistic regression, Support vector classifier, Decision tree and XGBoost Classifer were built. XGB classifier outperformed rest of the classifiers resulting in slightly better accuracy but not the best I expected. The major reason which I believe is the length of the dataset. The dataset was very small and to expect a good result using baseline methods, the same has to be large enough at the least in thousands. 
Few of the hyper parmeters were tweaked in all classifiers but XGBClassifier with default parametes perfomed better than rest. 

* The second approach in solving this classification problem was to train the dataset on pre trained model BERT and fine tune it our task. BERT has a history of dealing well with smaller datasets since the pre trained model is already trained in billion of word corpus and incredibly famous because of self attention mechansim which captures the context of the sentence in both directions. Our dataset was fine tuned and transformed into tokens as expected by BERT. However, the end results were very bad even after tuning parameters like batch size, epochs, learning rate. The reason I believe is the same as mentioned earlier, is that the model cannot well trained enough with such less data. Another reason is the length of each sentence in the training dataset. Certain BERT model has a limitation in terms of sentence length (512) and since the average length of given sentences is around 3900 words. So it is important to pick words that are important in making classification and for task like text classification, one can argue that it would be computationaly expensive. Hence, BERT was not a good choice in this case and specific task. 

* The third approach was to try other deep learning models like regular word embedding, LSTM and GRU. Instead of vectorizing the words using countvectorizer(), an embedding vector is used to train the model. Embedding vector provides better sense of meaning to each word than count vectorizer(since it based on only count). Unfortunately, the results were not great and were close to results of XGB classifier but not better. The hyper parmeters in this case were not fine tuned because of the intution I had that it could not perform any better nor outperform XGB classifier and also computationaly quite expensive.

* Ultimately, after trying different models ranging from baseline to state of the art, for this particular task, XGB Classifier performed better. Since I am more of a functional programmer and based on my experience and understanding of using classes, I was able to implement the choosen method to fit inside the given code skeleton and the same can be found in FactiversePrimary.ipynb notebook. 

Note : 
* The code for implementing BERT,LSTM and GRU was borrowed from my own work which I had used for different text classification problem. The same can be verified in github account in NLPProjects repository. 
I have used Google colab for coding in both the notebooks, hence there could be spacing issues if opened in other environments also the path of the input files can change.

* I have promptly tried the best possible ways this problem can be solved and if I had missed out on something, I would be much interested in learning and improving my skills. Also, I would be happy to answer any question regarding the task and my choices.