# Code For Best Models
> This notebook contains the code for the best 2 models developed

**NOTES:**  

1. First step will be to run `sh startup.sh` from the terminal to install all necessary dependencies  
2. The deep learning model requires a GPU to run
3. Several lines of code are commented out on purpose: uncommenting them produces different model variants

**Complete documentation of all the models included is stored in [the github repository](https://github.com/lse-my474/classification-challenge-ry05/tree/main/code)**

The outputs of the hyperparameter tuning and the ML model performances are documented [here](https://github.com/lse-my474/classification-challenge-ry05/tree/main/code/models)

## Best Machine Learning Model

In [None]:
'''
Filename: preprocess.py

Not runnable individually. Preprocessing techniques used.
'''

"""
Preprocessing script
--------------------

This script contains elements that help in preprocessing the text
"""

import re
import string

import pandas as pd  
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import texthero as hero
from texthero import stopwords, preprocessing 
from afinn import Afinn
# AFINN scorer instantiated at import time; nothing in this preprocessing cell
# references it -- presumably used by feature-building code elsewhere (TODO confirm)
afinn = Afinn()

# Default English stopwords that `translate_text` replaces with "STOPWORD".
# Second-person words (you, your, ...) are absent from this list; `translate_text`
# keeps its own `you_tokens` list and labels those "YOUWORD" instead.
DEF_STOPWORDS = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just', "don't", 'should', "should've", 'now', "aren't", "couldn't", "didn't", "doesn't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't", "needn't", "shan't", "shouldn't", "wasn't", "weren't", "won't", "wouldn't"]

class StemTokenizer:
    """
    Callable tokenizer: splits a comment into words and stems each one
    with the English Snowball stemmer
    """

    def __init__(self):
        self.stemmer = SnowballStemmer(language='english')

    def __call__(self, comment):
        words = word_tokenize(comment)
        return list(map(self.stemmer.stem, words))

class LemmaTokenizer:
    """Callable tokenizer: splits a comment into words and lemmatizes each one
    with the WordNet lemmatizer
    Source: https://stackoverflow.com/questions/47423854/sklearn-adding-lemmatizer-to-countvectorizer
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, comment):
        words = word_tokenize(comment)
        return list(map(self.lemmatizer.lemmatize, words))


def remove_punct(text):
    """
    Strip every character listed in `string.punctuation` from the text
    :param text: text (str)
    :return: the same text with all punctuation characters deleted
    """

    # character class matching any single punctuation character
    punct_class = re.compile('[' + re.escape(string.punctuation) + ']')
    return punct_class.sub('', text)

def bad_spaces(text):
    """
    Normalise whitespace: leading/trailing whitespace is dropped and every
    run of whitespace between words collapses to a single space
    :param text: text (str)
    :return: text with single-space separation
    """

    return " ".join(text.split())

def make_lower(text):
    """
    Lowercase the whole text
    :param text: text (str)
    :return: lowercased copy of the text
    """

    lowered = text.lower()
    return lowered

def translate_text(text):
    """
    Translate the text by performing "special operations"

    The operations, in order:
    1. delete every character listed in `string.punctuation`
    2. split on whitespace and drop corpus-specific stopwords (matched only in
       their all-lowercase or all-uppercase spelling)
    3. replace second-person words with "YOUWORD" and default stopwords with
       "STOPWORD"; all other tokens are kept unchanged

    :param text: text (str)
    :return: translated text, tokens joined by single spaces
    """

    punct_pattern = f'[{re.escape(string.punctuation)}]'

    # corpus stopwords obtained from data exploration
    corpus_stopwords = ['fuck', 'fag', 'faggot', 'fggt', 'nigga', 'nigger', 'aids', 'article', 'page', 'wiki', 'wp', 'block', 'NOES', 'ANONYMOUS', 'UTC', 'NOT', 'OH', 'IP', 'POV', 'LIVE', 'WP', 'REDIRECT', 'BTW', 'AIDS', 'HUGE', 'BLEACHANHERO', 'PHILIPPINESLONG']
    # matched case-sensitively against the all-lower and all-upper spellings only
    corpus_tokens = {s.lower() for s in corpus_stopwords} | {s.upper() for s in corpus_stopwords}

    # The text loses its punctuation before the lookups below, so the lookup
    # words must lose theirs as well; otherwise apostrophe entries such as
    # "you're" or "don't" could never match their stripped form ("youre", "dont")
    you_tokens = {
        re.sub(punct_pattern, '', tok)
        for tok in ['you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves']
    }
    stop_tokens = {re.sub(punct_pattern, '', tok) for tok in DEF_STOPWORDS}

    # remove punctuations
    text = re.sub(punct_pattern, '', text)

    # remove corpus stopwords
    # removing these won't affect as the presence of necessary words have been computed in data exploration
    # and the dataset is stored
    text_tokens = [tok for tok in text.split() if tok not in corpus_tokens]

    # add labels to select groups of words
    translated_tokens = []
    for token in text_tokens:
        if token in you_tokens:
            translated_tokens.append("YOUWORD")
        elif token in stop_tokens:
            translated_tokens.append("STOPWORD")
        else:
            translated_tokens.append(token)

    return " ".join(translated_tokens)


In [None]:
'''
Filename: pipelines.py

Not runnable individually. Stores the machine learning pipelines.
'''

"""
Text Pipelines
--------------

The use of a `pipeline` is to chain several data operations together and
automate some or most parts of the ML process
Advantages of using a pipeline
1. Convenience
2. Prevents leakage
3. Easier hyperparameter tuning
Source: https://scikit-learn.org/stable/modules/compose.html

All pipelines in this file were set after performing hyperparameter optimization
The code for hyperparameter optimization for the best model
is presented in `ml_best_hyp_opt.py`
"""

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from fastai.text.all import *

from preprocess import StemTokenizer
from feature_selection import UnivariateFeatureSelection

# fastai's tokenizer function
spacy = WordTokenizer()
tokenizer = Tokenizer(spacy)


def _tfidf_chi2_steps(token_fn):
    """
    Build fresh TF-IDF + chi2 feature-selection steps shared by pipeline 3
    and the text branch of pipeline 4
    :param token_fn: callable handed to TfidfVectorizer as its tokenizer
    :return: list of (name, transformer) steps
    """
    return [
        ('tfidf', TfidfVectorizer(
            tokenizer=token_fn,
            # optional switches kept for experimentation:
            # token_pattern = None,
            # lowercase = False,
            max_features=10000
        )),
        ('ufs', UnivariateFeatureSelection(
            n_features=0.05,     # Top 5% of the features built
            scoring='chi2'
        )),
    ]


# pipeline 1: raw counts + multinomial NB
pipeline_1 = Pipeline(steps=[
    ('countvec', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# pipeline 2: counts without English stop words + complement NB
pipeline_2 = Pipeline(steps=[
    ('countvec', CountVectorizer(stop_words='english')),
    ('clf', ComplementNB()),
])

# pipeline 3: stemmed TF-IDF -> chi2 selection -> complement NB
# (pass `tokenizer` instead of StemTokenizer() to try fastai's tokenizer)
pipeline_3 = Pipeline(
    steps=_tfidf_chi2_steps(StemTokenizer()) + [
        ('clf', ComplementNB(alpha=0.01)),     # classifier
    ]
)

# pipeline 4
# hardcoded column names
numeric = ['afinn', 'you_count', 'caps_word_count', 'digits_count', 'dale_chall']
binary = ['source_cnt', 'f*g_cnt', 'n***_cnt', 'fu**_cnt', 'article_cnt', 'REDIRECT_count'] # these are not preprocessed
text = 'text'

# transformer for the numeric features
numeric_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
])

# transformer for text feature
# use string as vectorizer converts a single vector into multiple vectors
# (pass StemTokenizer() instead of `tokenizer` to try the stemming tokenizer)
text_transformer = Pipeline(steps=_tfidf_chi2_steps(tokenizer))

# preprocessor for heterogenous data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric),
        ('text', text_transformer, text),
    ]
)

# integrate it all into the pipeline
pipeline_4 = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('clf', ComplementNB(alpha=0.5)), # ComplementNB was used
])


# pipeline dictionary, keyed by the pipeline number as a string ('1'..'4')
pipe_dict = {
    str(number): pipeline
    for number, pipeline in enumerate(
        (pipeline_1, pipeline_2, pipeline_3, pipeline_4), start=1
    )
}

In [None]:
'''
Filename: train2.py

To run from terminal run this command: python train2.py --pipeline_number 4 --data with_numeric
Output style: 5-fold CV performance as tables
'''

"""
Training script
"""

import sys
import argparse

import pandas as pd 
from sklearn.model_selection import cross_validate

import config
import pipelines as pipe
from preprocess import translate_text

if __name__ == "__main__":

    # take in arguments from the terminal
    # both flags are required, and an unknown --data value is rejected by
    # argparse up front instead of surfacing later as a NameError on `train_x`
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pipeline_number",
        type=str,
        required=True
    )
    parser.add_argument(
        "--data",
        type=str,
        required=True,
        choices=['raw', 'with_numeric', 'with_numeric_translated_text']
    )
    args = parser.parse_args()

    target = config.TARGET
    text_feat = config.TEXT

    # get the data
    train_path = config.TRAIN_RAW if args.data == 'raw' else config.TRAIN_NUMERIC
    train = pd.read_csv(train_path)
    train = train.fillna('missing')
    print('Preview of data')
    print(train.head())
    print()

    # split into independent and dependent variables
    if args.data == 'with_numeric_translated_text':
        train[text_feat] = train[text_feat].apply(translate_text)
    if args.data == 'raw':
        # pipelines 1-3 vectorize a single column of text
        train_x = train[text_feat]
    else:
        # pipeline 4 picks its columns by name from the remaining frame
        train_x = train.drop(['id', 'attack'], axis=1)
    train_y = train[target].values

    # instantiate the estimator
    try:
        pipeline_number = int(args.pipeline_number)
    except ValueError:
        # non-numeric input is treated like any other unknown pipeline
        pipeline_number = None

    if (pipeline_number is None
            or pipeline_number > 4
            or args.pipeline_number not in pipe.pipe_dict):
        print('The pipeline or data or both have not yet been created!')
        sys.exit(1)     # non-zero status: the run did not happen

    if pipeline_number == 4:
        print("Going ahead with this hoping you have rechecked `pipelines.py` w.r.t features considered!")

    # pipelines 1-3 only accept the raw text data, pipeline 4 only the numeric data
    needs_raw = pipeline_number < 4
    if needs_raw != (args.data == 'raw'):
        print('Thou shall not use this combination of pipeline and data!')
        sys.exit(1)

    estimator = pipe.pipe_dict[args.pipeline_number]

    print(f'Training {args.data} data with pipeline {args.pipeline_number} using 5-fold stratified cross validation...')
    print()

    # setup 5-fold stratified cross validation
    # stratified due to the presence of class imbalance
    # error_score='raise' makes a failing fold abort the run instead of scoring NaN
    cv_scores = cross_validate(estimator, train_x, train_y, cv=5, scoring=['f1', 'accuracy'], error_score='raise')
    performance_df = pd.DataFrame(
        dict(
            fit_time = cv_scores['fit_time'],
            score_time = cv_scores['score_time'],
            validation_acc = cv_scores['test_accuracy'],
            validation_f1 = cv_scores['test_f1']
        )
    )
    print('Performance Table')
    print('(All times in seconds)')
    print()
    print(performance_df)
    print()
    print(f'The mean CV F1 score is {performance_df.validation_f1.mean()}')
    print(f'The mean CV accuracy for 5-fold CV is {performance_df.validation_acc.mean()}')
    

In [None]:
'''
Filename: inference2.py

To run from terminal run this command: python inference2.py --pipeline_number 4 --data with_numeric
Output style: submission.csv file generated in ../data with predicted labels for test data
'''

"""
Inference script
"""

import argparse

import pandas as pd 
from sklearn.model_selection import cross_val_score

import config
import pipelines as pipe
from preprocess import translate_text

if __name__ == "__main__":

    # this script's import list has no `sys`, yet the guards below call sys.exit
    import sys

    # both flags are required, and an unknown --data value is rejected by
    # argparse up front instead of surfacing later as a NameError on `train_x`
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pipeline_number",
        type=str,
        required=True
    )
    parser.add_argument(
        "--data",
        type=str,
        required=True,
        choices=['raw', 'with_numeric', 'with_numeric_translated_text']
    )
    args = parser.parse_args()

    target = config.TARGET
    text_feat = config.TEXT

    # get the data
    if args.data == 'raw':
        train = pd.read_csv(config.TRAIN_RAW)
        test = pd.read_csv(config.TEST_RAW)
    else:
        train = pd.read_csv(config.TRAIN_NUMERIC)
        test = pd.read_csv(config.TEST_NUMERIC)
    train = train.fillna('missing')
    test = test.fillna('missing')

    if args.data == 'with_numeric_translated_text':
        # the same text translation must be applied to both splits
        train[text_feat] = train[text_feat].apply(translate_text)
        test[text_feat] = test[text_feat].apply(translate_text)

    # split into independent and dependent variables
    train_y = train[target].values
    if args.data == 'raw':
        # pipelines 1-3 vectorize a single column of text
        train_x = train[text_feat]
        test_x = test[text_feat]
    else:
        # pipeline 4 selects its columns by name, so it needs whole frames.
        # The label is dropped from the training features (as in train2.py),
        # and the test features come from the TEST frame, never from `train`.
        train_x = train.drop(['id', 'attack'], axis=1)
        test_x = test.drop(['id'], axis=1)

    # instantiate the estimator
    try:
        pipeline_number = int(args.pipeline_number)
    except ValueError:
        # non-numeric input is treated like any other unknown pipeline
        pipeline_number = None

    if (pipeline_number is None
            or pipeline_number > 4
            or args.pipeline_number not in pipe.pipe_dict):
        print('The pipeline or data or both have not yet been created!')
        sys.exit(1)     # non-zero status: no submission was produced

    if pipeline_number == 4:
        print("Going ahead with this hoping you have rechecked `pipelines.py` w.r.t features considered!")

    # pipelines 1-3 only accept the raw text data, pipeline 4 only the numeric data
    needs_raw = pipeline_number < 4
    if needs_raw != (args.data == 'raw'):
        print('Thou shall not use this combination of pipeline and data!')
        sys.exit(1)

    estimator = pipe.pipe_dict[args.pipeline_number]

    # fit estimator on training data
    print("Fitting the estimator on the training data...")
    estimator.fit(train_x, train_y)

    print("Fitting complete.")
    print(train_x.shape)
    print(test_x.shape)

    # get predictions
    print("Predicting for the test data...")
    preds = estimator.predict(test_x)

    # make submission file: one prediction per test id
    ids = test['id'].values
    sub = pd.DataFrame({
        'id': ids,
        'attack': preds
    })
    sub.to_csv('../data/submission.csv', index=False)
    print("Prediction complete. Submission file generated in the data folder.")
    print(f"The submission has {sub.shape[0]} predictions")

## Best Deep Learning Model

In [None]:
'''
Filename: deeplearning_transformer.py
'''

"""
This file contains code to create the transformers
model that scored 0.782 on the public leaderboard

The model uses the pretrained RoBERTa base architecture
(`roberta-base`)

This code was run in Google Colab to make use of GPUs

Therefore, running this code on a local system might throw issues
with paths of data

NOTE: Code has been written on the basis of documentation from
https://simpletransformers.ai/

The hyperparameter optimization process to decide the right values for
learning rate and number of epochs is depicted in the transformer_hyp_opt.py
file

The colab file used is at
https://colab.research.google.com/drive/1isI2ZvCre-J-1PXk-EGI4vdYyNd1fcrr?usp=sharing
"""

import pandas as pd
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.model_selection import train_test_split, StratifiedKFold
import sklearn

# get data
df = pd.read_csv('../data/train.csv')
test = pd.read_csv("../data/test.csv")
sub = pd.read_csv("../data/sampleSubmission.csv")

# stratified cross validation
# tag every row with the index of the fold in which it is a validation row
df['kfold'] = -1

target = df.attack.values
kf = StratifiedKFold(n_splits=5)
for fold, (trn_, val_) in enumerate(kf.split(X=df, y=target)):
    # val_ holds positional indices; they double as labels here because
    # read_csv gives the frame a default 0..n-1 index
    df.loc[val_, 'kfold'] = fold

df = df.drop(['id'], axis=1)
# positional rename: relies on the remaining columns being
# (attack, text, kfold) in exactly this order
# NOTE(review): presumably train.csv is laid out as id, attack, text -- confirm
df.columns = ['labels', 'text', 'kfold']

# hold out fold 0 for validation and train on the remaining four folds
# so essentially, this works like a holdout cross validation technique
# using a 5-fold CV took a great computational toll on the machine
train = df[df.kfold != 0]
val = df[df.kfold == 0]

# model configuration
# best hyperparams set after hyperparam optimization
# check ../models/hyp_opt_transformer.csv for the other possible options
# (a plain dict is passed as `args`; the ClassificationArgs() instance that used
# to be created here was overwritten immediately and never used)
model_args = {
    "num_train_epochs": 2,
    "learning_rate": 0.000017,
}

# model
model = ClassificationModel(
    "roberta", "roberta-base", args=model_args
)

# train
# the extra keyword registers f1_score as an additional metric named "f1"
model.train_model(train, f1=sklearn.metrics.f1_score)

# validate
result, model_outputs, wrong_predictions = model.eval_model(val, f1=sklearn.metrics.f1_score)

# making a submission
# NOTE(review): assumes sampleSubmission.csv rows are in the same order as test.csv -- confirm
test_comments = list(test['text'].values)
predictions, raw_outputs = model.predict(test_comments)
sub['attack'] = predictions
sub.to_csv("transformer_submission.csv", index=False)