# CS 6120 - Natural Language Processing
#### Author - Shubhanshu Gupta

In [None]:
pip install emoji --upgrade

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import re
import string
import emoji
from wordcloud import WordCloud
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from lightgbm import LGBMClassifier
from nltk import word_tokenize
from sklearn import metrics,preprocessing
import warnings
warnings.filterwarnings('ignore')
import pickle as pk
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation,SpatialDropout1D,Bidirectional,GRU,GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras.optimizers import  Adam
import keras
import gensim
import os
from keras.callbacks import ModelCheckpoint 
from keras.callbacks import Callback,ReduceLROnPlateau
from sklearn import metrics,preprocessing
from keras.models import load_model
import os

In [None]:
# Load the raw training data from ``train.csv`` in the working directory.
train_df = pd.read_csv("train.csv")

In [None]:
### Finding the years in data
def convert_year(year):
    """Return the integer that precedes the first ``-`` in ``year``.

    ``year`` is a string such as ``"2017-03-06 ..."``; everything from the
    first dash onwards is ignored.
    """
    leading_part, _, _ = year.partition('-')
    return int(leading_part)
# Derive the calendar year of each comment with the helper defined above
# (previously the same parsing was duplicated in an inline lambda).
train_df['year'] = train_df['created_date'].apply(convert_year)

In [None]:
### Converting target to boolean form
# A comment counts as toxic (1) when its target score is at least 0.5, otherwise 0.
train_df['bool_target'] = (train_df['target'] >= 0.5).astype(int)

In [None]:
### Dropping columns
# Columns removed in place from the training frame.
unused_columns = [
    'id',
    'created_date',
    'publication_id',
    'parent_id',
    'article_id',
    'identity_annotator_count',
    'toxicity_annotator_count',
]
train_df.drop(columns=unused_columns, inplace=True)

In [None]:
class TextProcessing:
    """Regex-based cleaning steps applied to raw comment text.

    The class-level tables are built once, when the class is defined:

    * ``EMOJI_REGEXP`` - the pattern returned by ``emoji.get_emoji_regexp()``.
    * ``UNICODE_EMOJI_MY`` - maps each emoji to `` EMJ <alias words> ``.
    * ``REGEX_REPLACER`` - compiled, case-insensitive patterns built from the
      ``(pattern, replacement)`` pairs of ``WORDS_REPLACER``; a ``*`` inside a
      pattern is matched literally.

    NOTE(review): ``WORDS_REPLACER``, ``CONTRACTION_MAP`` and ``wordninja`` are
    used here but are not defined or imported in the visible part of this
    file - confirm they are provided before this class is defined.
    """

    EMOJI_REGEXP = emoji.get_emoji_regexp()

    UNICODE_EMOJI_MY = {
        k: f" EMJ {v.strip(':').replace('_', ' ')} "
        for k, v in emoji.UNICODE_EMOJI_ALIAS.items()
    }

    REGEX_REPLACER = [
        (re.compile(pat.replace("*", r"\*"), flags=re.IGNORECASE), repl)
        for pat, repl in WORDS_REPLACER
    ]

    # HTML tags and character entities (named, decimal, hexadecimal).
    _HTML_RE = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

    def cleanhtml(self, raw_html):
        """Return ``raw_html`` with HTML tags and character entities removed."""
        return self._HTML_RE.sub('', raw_html)

    @staticmethod
    def clean_numbers(x):
        """Replace every run of digits in ``x`` with a single space.

        Declared static so that both ``self.clean_numbers(text)`` (as used by
        ``text_preprocessing``) and ``TextProcessing.clean_numbers(text)`` work;
        the original definition had no ``self`` and failed when called on an
        instance.
        """
        return re.sub(r'\d+', ' ', x)

    def remove_punctuations(self, text):
        """Remove every character that is not an ASCII letter or whitespace."""
        # ``A-Z`` (not ``A-z``): the wider range also kept ``[ \\ ] ^ _`` and the backtick.
        return re.sub(r'[^a-zA-Z\s]', '', text)

    def remove_special_characters(self, text, remove_digits=False):
        """Remove everything except ASCII letters, whitespace and (optionally) digits.

        With ``remove_digits=True`` digits are removed as well.
        """
        pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
        return re.sub(pattern, '', text)

    def replace_regex(self, text, pattern, replacement):
        """Return ``text`` with every match of ``pattern`` replaced by ``replacement``."""
        return re.sub(pattern, replacement, text)

    def my_demojize(self, string):
        """Replace each emoji with its `` EMJ <alias> `` text and drop U+FE0F selectors.

        Emoji without an entry in ``UNICODE_EMOJI_MY`` are left unchanged.
        """
        def replace(match):
            return self.UNICODE_EMOJI_MY.get(match.group(0), match.group(0))

        return re.sub("\ufe0f", "", self.EMOJI_REGEXP.sub(replace, string))

    def replace_starred(self, text):
        """Apply every ``(pattern, replacement)`` pair of ``REGEX_REPLACER`` in order."""
        for pattern, repl in self.REGEX_REPLACER:
            text = pattern.sub(repl, text)
        return text

    def expand_contractions(self, text, contraction_mapping=CONTRACTION_MAP):
        """Expand contractions (e.g. ``isn't`` -> ``is not``) and drop remaining apostrophes.

        ``contraction_mapping`` maps a contraction to its expansion. Matching is
        case-insensitive and the first character of the matched text is kept,
        so the original capitalisation of the first letter survives.
        """
        if not contraction_mapping:
            # Nothing to expand; an empty alternation would match the empty
            # string everywhere and break ``expand_match``.
            return re.sub("'", "", text)

        # Longest keys first so that a key which is a prefix of another key
        # cannot shadow it; keys are escaped because they are literal text.
        alternatives = sorted(contraction_mapping, key=len, reverse=True)
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in alternatives)),
            flags=re.IGNORECASE | re.DOTALL,
        )

        def expand_match(contraction):
            match = contraction.group(0)
            expanded = (contraction_mapping.get(match)
                        or contraction_mapping.get(match.lower()))
            if not expanded:
                # No usable expansion for this spelling: leave the text as is.
                return match
            return match[0] + expanded[1:]

        expanded_text = contractions_pattern.sub(expand_match, text)
        return re.sub("'", "", expanded_text)

    def split_words(self, text):
        """Split run-together words (``themoney`` -> ``the money``) token by token.

        NOTE(review): relies on ``wordninja``, which is not imported in the
        visible part of this file.
        """
        tokens = [token.strip() for token in word_tokenize(text)]
        return ' '.join(' '.join(wordninja.split(token)) for token in tokens)

    def text_preprocessing(self, text):
        """Run the full cleaning pipeline on one comment and return the cleaned string.

        Steps, in order: strip HTML, lower-case, expand contractions, replace
        emoji, remove e-mail addresses / URLs / retweet markers / @handles,
        apply the ``REGEX_REPLACER`` substitutions, drop special characters and
        digits, collapse repeated spaces, strip and lower-case again (the emoji
        replacement inserts an upper-case ``EMJ`` marker).
        """
        text = self.cleanhtml(text)
        text = text.lower()
        text = self.expand_contractions(text)
        text = self.my_demojize(text)
        text = self.replace_regex(text, r"\S+@\S+", "")
        text = self.replace_regex(text, r"[Hh]ttp([^ ]+)", "")
        # The text is already lower-case here, so match ``rt``/``via`` as whole
        # words; the old "RT | via" never matched "RT " and cut " via" out of
        # longer words.
        text = self.replace_regex(text, r"\brt\b|\bvia\b", "")
        text = self.replace_regex(text, r"@([^ ]+)", "")
        text = self.replace_regex(text, r"[Ww]ww([^ ]+)", "")
        # Handle pattern without the stray spaces inside the ranges, which made
        # the character class match blanks and delete ordinary text after "@".
        text = self.replace_regex(text, r"@[a-zA-Z0-9_]{1,15}", "")
        text = self.replace_starred(text)
        text = self.remove_special_characters(text)
        text = self.clean_numbers(text)
        text = re.sub(' +', ' ', text)
        # stripping extra space
        text = text.strip()
        text = text.lower()

        return text


text_processing = TextProcessing()

In [None]:
# Clean every comment with the shared TextProcessing pipeline.
train_df['comment_text'] = train_df['comment_text'].apply(text_processing.text_preprocessing)

### Exploratory Data Analysis

In [None]:
## Plotting percentage of toxic and non-toxic comments

# Bar chart of toxic vs. non-toxic comments, each bar labelled with its share
# of the whole training set. The column is passed by keyword (``x=`` / ``data=``),
# matching the other count plot in this notebook; seaborn no longer accepts
# the plotted variable as a positional argument.
ax = sns.countplot(x='bool_target', data=train_df, palette='Set3')
total_rows = float(len(train_df))
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2., height + 3,
            '{:1.2f}%'.format(100 * height / total_rows), ha="center")
plt.show()

In [None]:
stopwrds = set(stopwords.words("english"))

### Function to plot word clouds
def show_wordcloud(data, title = None):
    """Draw a word cloud for ``data`` with an optional figure title.

    ``data`` is either one string or an iterable of texts (e.g. a pandas
    Series of comments). An iterable is joined into a single document first;
    calling ``str()`` on a Series would only give its truncated printed form
    (a few rows plus "Name", "Length", "dtype"), not the actual corpus.
    """
    if isinstance(data, str):
        text = data
    else:
        text = ' '.join(str(item) for item in data)

    wordcloud = WordCloud(
        background_color='black',
        stopwords=stopwrds,
        max_words=70,
        max_font_size=30,
        scale=5,
        random_state=1
    ).generate(text)

    fig = plt.figure(1, figsize=(10,10))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

In [None]:
## Wordcloud on full text
show_wordcloud(
    data=train_df['comment_text'],
    title='Most common Words in training corpus',
)

In [None]:
## Wordcloud on toxic comments
show_wordcloud(
    train_df.loc[train_df['bool_target'] == 1, 'comment_text'],
    title='Most common Words in training corpus with toxic comments',
)

In [None]:
## Wordcloud on  non-toxic comments
# Title corrected: this cloud is built from the rows with bool_target == 0.
show_wordcloud(train_df[train_df['bool_target'] == 0]['comment_text'], title = 'Most common Words in training corpus with non-toxic comments')

In [None]:
### Number of comments in each year, split by toxic / non-toxic
sns.countplot(data=train_df, x="year", hue="bool_target")

In [None]:
### Plotting percentage of toxic comments related to different identities
# NOTE(review): ``orig_train`` is not defined in the visible part of this
# notebook - presumably the unmodified training frame; confirm.
# Columns 7..30 (by position) are taken as the identity columns.
identity_columns = list(orig_train)[7:31]
demographics = orig_train.loc[:, ['target'] + identity_columns].dropna()

toxicity = demographics.iloc[:, 0]
identity_scores = demographics.iloc[:, 1:]

# Per identity: sum of (identity score * target) divided by the number of
# rows in which that identity score is positive.
weighted_sum = identity_scores.multiply(toxicity, axis="index").sum()
mention_counts = identity_scores[identity_scores > 0].count()
weighted_toxic = weighted_sum / mention_counts
weighted_toxic = weighted_toxic.sort_values(ascending=False)

plt.figure(figsize=(30,20))
sns.set(font_scale=3)
ax = sns.barplot(x=weighted_toxic.values, y=weighted_toxic.index, alpha=0.8)
plt.ylabel('Demographics')
plt.xlabel('Weighted Toxic')
plt.title('Percent of toxic comments related to different identities')
plt.show()

### Evaluation metric

In [None]:
### Functions to create evaluation metric
def evaluation_metric_subgroups(df,groups):
    """Compute the per-identity AUC table plus its generalised-mean summary row.

    Parameters
    ----------
    df : DataFrame with ``bool_target``, ``prediction`` and one column per
        identity in ``groups``.
    groups : ``'All'`` for the built-in identity list, otherwise an iterable
        of identity column names.

    Returns
    -------
    DataFrame indexed by identity with columns ``SUB`` (AUC inside the
    subgroup), ``BPSN`` (background positive / subgroup negative) and
    ``BNSP`` (background negative / subgroup positive), followed by an ``Mp``
    row holding ``Mp`` of each column. Identities with no rows, or with an
    undefined (NaN) score, are dropped.

    Unlike the earlier version this does not overwrite the identity columns
    of ``df``; all masks are local.
    """
    if isinstance(groups, str) and groups == 'All':
        #These are the whole identities
        categoriese = ['physical_disability','psychiatric_or_mental_illness','jewish','asian','homosexual_gay_or_lesbian','black','muslim','white','christian','female','male']
    else:
        categoriese = groups
    categoriese_df = pd.DataFrame(columns = ['SUB','BPSN','BNSP'], index = categoriese)

    # Explicit boolean mask: ``~`` on an integer 0/1 column is a bitwise NOT
    # (-1 / -2), which only worked by accident of how pandas casts the result.
    is_toxic = df['bool_target'].astype(bool)

    for category in categoriese:
        # Identity counts as mentioned when its score is >= 0.5 (NaN -> False).
        in_subgroup = df[category] >= 0.5

        if not in_subgroup.any():
            # No data for this identity: remove its row entirely.
            categoriese_df = categoriese_df.drop(category, axis = 0)
            continue

        categoriese_df.loc[category,'SUB'] = auc(df[in_subgroup])

        bpsn = ((~in_subgroup & is_toxic)       #background positive
                | (in_subgroup & ~is_toxic))    #subgroup negative
        categoriese_df.loc[category,'BPSN'] = auc(df[bpsn])

        bnsp = ((~in_subgroup & ~is_toxic)      #background negative
                | (in_subgroup & is_toxic))     #subgroup positive
        categoriese_df.loc[category,'BNSP'] = auc(df[bnsp])

    #drop rows that contain NANs due to insufficient data
    categoriese_df = categoriese_df.dropna(axis = 0)

    #Summarise each column with the power mean
    categoriese_df.loc['Mp',:] = categoriese_df.apply(Mp, axis= 0)

    return categoriese_df

def Mp(data, p = -5.0):
    """Return the generalised (power) mean of ``data`` with exponent ``p``.

    Computed as ``average(data ** p) ** (1 / p)``.
    """
    powered = data ** p
    mean_of_powers = np.average(powered)
    return mean_of_powers ** (1 / p)

def auc(df):
    """Return the ROC AUC of ``df['prediction']`` against ``df['bool_target']``."""
    # Imported locally on purpose: ``roc_curve`` is never imported at module
    # level, and the module-level name ``metrics`` is later rebound to a list
    # of Keras metrics, so ``metrics.auc`` cannot be relied on here.
    from sklearn.metrics import auc as area_under_curve, roc_curve

    y_orig = df['bool_target']
    y_pred = df['prediction']
    fpr, tpr, _ = roc_curve(y_orig, y_pred)
    return area_under_curve(fpr, tpr)

def final_metric(df,groups= 'All'):
    """Return the combined bias-aware score for the predictions in ``df``.

    The score is the equally weighted (0.25 each) sum of the overall AUC and
    the ``Mp`` summaries of the ``SUB``, ``BPSN`` and ``BNSP`` columns
    returned by ``evaluation_metric_subgroups``.

    ``df`` must contain ``bool_target``, ``prediction`` and the identity
    columns selected by ``groups`` (``'All'`` uses the built-in list).
    """
    w0,w1,w2,w3 = 0.25,0.25,0.25,0.25

    #Overall AUC first, then the Mp of each sub-metric
    overall = auc(df)
    categoriese_df = evaluation_metric_subgroups(df,groups)

    return (w0 * overall
            + w1 * categoriese_df.loc['Mp','SUB']
            + w2 * categoriese_df.loc['Mp','BPSN']
            + w3 * categoriese_df.loc['Mp','BNSP'])

In [None]:
# Loading sampled data
# From here on ``train_df`` / ``test_df`` refer to the sampled modelling splits.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('model_train.csv', 'model_test.csv')
)

### Using TF-IDF

In [None]:
# Cast every comment to ``str`` so missing values cannot break the vectorizer
# (``np.str`` was only an alias of ``str`` and has been removed from NumPy).
train_text_split = train_df['comment_text'].astype(str)
test_text_split = test_df['comment_text'].astype(str)

# NOTE(review): ``complete_text`` was not defined anywhere in the visible
# notebook. It is taken to be the train and test comments together, mirroring
# how the Keras tokenizer below is fitted - confirm this is the intended corpus.
complete_text = pd.concat([train_text_split, test_text_split])

# Word-level uni- and bi-gram TF-IDF; the flags are real booleans because
# recent scikit-learn versions validate parameter types.
word_vectorizer = TfidfVectorizer(min_df=3,  max_features=None,
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 2), use_idf=True,smooth_idf=True,sublinear_tf=True)
word_vectorizer.fit(complete_text)

y_orig = train_df['bool_target']
train_features = word_vectorizer.transform(train_text_split)
test_features = word_vectorizer.transform(test_text_split)

### Using Smote and RandomSampler for creating data balance

In [None]:
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from imblearn.over_sampling import SMOTE

# Rebuild BOTH resampling inputs from the source frame. The cell overwrites
# ``train_features`` and ``y_orig`` with the resampled data, so rebuilding
# only the features (as before) made a second run of this cell fail with
# mismatched sample counts.
y_orig = train_df['bool_target']
train_features = word_vectorizer.transform(train_text_split)
print(Counter(y_orig))

# Under-sample first (sampling_strategy=0.2), then apply SMOTE.
sampler = make_pipeline(
    RandomUnderSampler(random_state=0, sampling_strategy=0.2),
    SMOTE(random_state=0)
)
train_features,y_orig = sampler.fit_resample(train_features,y_orig)

print(Counter(y_orig))

test_features = word_vectorizer.transform(test_text_split)

### Logistic Regression with TF-IDF

In [None]:
classifier = LogisticRegression(solver='saga', random_state = 10,n_jobs= 8)
classifier.fit(train_features,y_orig)

# Score the test set once and reuse the result (it used to be computed twice,
# with the first copy never used).
pred_target = classifier.predict_proba(test_features)[:, 1]
prediction_log = pd.DataFrame({'id': test_df.index, 'prediction': pred_target})

# NOTE(review): the probabilities are turned into 0/1 labels before the
# AUC-based metric is computed, which discards the ranking information -
# confirm this is intended.
prediction_log['prediction'] = (prediction_log['prediction'] >= 0.5).astype(int)
merged_log = test_df.merge(prediction_log, left_index=True, right_index=True).fillna(0)

# Printing the results
print(f'Final Metric: {final_metric(merged_log)}')

### Naive Bayes with Tf-IDF

In [None]:
# Multinomial naive Bayes on the resampled TF-IDF features.
classifier_NB = MultinomialNB()
classifier_NB.fit(train_features, y_orig)

prediction_nb = pd.DataFrame({
    'id': test_df.index,
    'prediction': classifier_NB.predict_proba(test_features)[:, 1],
})
# Probability of the positive class -> hard 0/1 label at 0.5.
prediction_nb['prediction'] = (prediction_nb['prediction'] >= 0.5).astype(int)
merged_nb = test_df.merge(prediction_nb, left_index=True, right_index=True).fillna(0)

# Printing the results
print(f'Final Metric: {final_metric(merged_nb)}')

### Light Gradient Boosting with Tf- IDF

In [None]:
# LightGBM classifier on the resampled TF-IDF features.
classifier_LGBM = LGBMClassifier(n_jobs = 8)
classifier_LGBM.fit(train_features, y_orig)

prediction_lgbm = pd.DataFrame({
    'id': test_df.index,
    'prediction': classifier_LGBM.predict_proba(test_features)[:, 1],
})
# Probability of the positive class -> hard 0/1 label at 0.5.
prediction_lgbm['prediction'] = (prediction_lgbm['prediction'] >= 0.5).astype(int)
merged_lgbm = test_df.merge(prediction_lgbm, left_index=True, right_index=True).fillna(0)

# Printing the results
print(f'Final Metric: {final_metric(merged_lgbm)}')

## Deep Learning Models

In [None]:
## Padding and creating data for Glove Embedding layer and for LSTM models 

# Comments are cast to ``str`` exactly as in the TF-IDF section, so a missing
# comment (NaN after reloading the CSV) cannot break ``.split()`` or the
# tokenizer; real strings are unaffected.
X_train = train_df['comment_text'].astype(str).values
y_train = train_df['bool_target'].values
X_test = test_df['comment_text'].astype(str).values
y_test = test_df['bool_target'].values

total_comment = np.concatenate([X_train, X_test])

# Longest comment, in whitespace-separated tokens; used as the padding length.
max_length = max(len(com.split()) for com in total_comment)
max_length

In [None]:
# Keep only the ``vocabulary_size`` most frequent words when encoding.
vocabulary_size = 20000
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(total_comment)

# Text -> integer sequences, then pad every sequence to ``max_length``.
X_train_tokens, X_test_tokens = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)
X_train_pad, X_test_pad = (
    pad_sequences(tokens, maxlen=max_length)
    for tokens in (X_train_tokens, X_test_tokens)
)

In [None]:
## Creating functions for metrics
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall on the current batch: rounded true positives / rounded actual positives."""
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    # epsilon keeps the division defined when the batch has no positives
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision on the current batch: rounded true positives / rounded predicted positives."""
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    # epsilon keeps the division defined when nothing is predicted positive
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 on the current batch, built from ``precision_m`` and ``recall_m``."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
# Metrics reported by Keras during training: precision, recall, AUC and the
# custom batch-wise F1 defined above.
# NOTE(review): this rebinds the module-level name ``metrics``, which the
# import cell bound to ``sklearn.metrics``; any later use of ``metrics.auc``
# sees this list instead of the sklearn module.
metrics = [
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
    keras.metrics.AUC(name="auc"),
    f1_m
]

In [None]:
### Creating callback to save best model based on our metric
# Best ``final_metric`` score seen so far, shared across all training runs.
global_score = 0

from keras.callbacks import EarlyStopping


class RocAucEvaluation(keras.callbacks.Callback):
    """Score the model with ``final_metric`` after each epoch and keep the best one.

    After every epoch the callback predicts on the validation inputs, joins
    the thresholded predictions onto the global ``test_df`` and evaluates
    ``final_metric``. When the score beats the module-level ``global_score``
    the model is written to ``final.json`` / ``final.h5``. The score is also
    stored in ``logs['val_bias_metric']``.
    """

    def __init__(self, model, validation_data=()):
        """Store the model and the ``(X_val, y_val)`` pair.

        ``validation_data`` must be a 2-tuple; the empty default raises
        ``ValueError`` on unpacking, as before.
        """
        super().__init__()
        # ``set_model`` is the Callback API for attaching a model; Keras calls
        # it again with the model being trained at the start of ``fit``.
        self.set_model(model)
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate, optionally save the model, and return the epoch's score."""
        global global_score

        scores = self.model.predict(self.X_val, batch_size=256)[:, 0]
        prediction = pd.DataFrame({
            'id': test_df.index,
            # NOTE(review): probabilities are binarised before the AUC-based
            # metric, as everywhere else in this notebook - confirm intended.
            'prediction': (scores >= 0.5).astype(int),
        })
        merged = test_df.merge(prediction, left_index=True, right_index=True).fillna(0)
        score = final_metric(merged)

        if score > global_score:
            # Save the model attached to this callback rather than whatever
            # the global name ``model`` happens to point at.
            with open("final.json", "w") as json_file:
                json_file.write(self.model.to_json())
            self.model.save("final.h5")
            print("\n Saved model to disk")
            global_score = score

        if logs is not None:
            logs['val_bias_metric'] = score
        print("\n ROC-AUC - score: {:.6f}".format(score))
        return score

In [None]:
!wget https://usmlproject.s3.amazonaws.com/glove.42B.300d.txt

In [None]:
## Creating embedding details

# word -> 300-d vector. The file opened here is the one fetched by the wget
# cell above (glove.42B.300d.txt); the previous code opened glove.6B.300d.txt,
# which this notebook never downloads.
embeddings_index = dict()
with open('glove.42B.300d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is the word followed by its coefficients.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Row i holds the pretrained vector of the word with tokenizer index i;
# words without a pretrained vector keep an all-zero row.
embedding_matrix = np.zeros((vocabulary_size, 300))
for word, index in tokenizer.word_index.items():
    if index >= vocabulary_size:
        # stop at the first index that falls outside the matrix
        break
    vector = embeddings_index.get(word)
    if vector is None:
        continue
    embedding_matrix[index] = vector

In [None]:
### Calculating weights for each class
# Each class is weighted by the inverse of its number of training samples.
counts = np.bincount(y_train)
weight_for_0, weight_for_1 = 1.0 / counts[0], 1.0 / counts[1]

class_weight = dict(enumerate((weight_for_0, weight_for_1)))

### Fitting models

#### 1) Two LSTM layers

In [None]:
EMBEDDING_DIM = 300

# Frozen pretrained embeddings -> dropout -> two stacked LSTMs -> sigmoid.
model = Sequential()
model.add(Embedding(vocabulary_size,EMBEDDING_DIM,input_length=max_length, weights=[embedding_matrix], trainable=False))
model.add(Dropout(0.2))
model.add(LSTM(128,return_sequences=True,dropout=0.2))
model.add(LSTM(128,dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss = 'binary_crossentropy', optimizer = keras.optimizers.Adam(1e-2),metrics=metrics)

es = RocAucEvaluation(model, validation_data=(X_test_pad, y_test))
# ``val_auc`` is better when larger, so the direction is given explicitly;
# with the default mode='auto' the callback can treat it as a quantity to
# minimise and cut the learning rate at the wrong time.
rlrp = ReduceLROnPlateau(monitor='val_auc', mode='max', factor=0.2, patience=3, min_delta=1E-7)

model.fit(
    X_train_pad,
    y_train,
    batch_size=256,
    epochs=30,
    verbose=1,
    validation_data=(X_test_pad, y_test),
    class_weight=class_weight,
    callbacks=[es,rlrp]
)

#### 2) Two LSTM layer with CNN 

In [None]:
# Frozen pretrained embeddings -> two stacked LSTMs -> Conv1D -> global max pool -> sigmoid.
model = Sequential()
model.add(Embedding(vocabulary_size,EMBEDDING_DIM,input_length=max_length, weights=[embedding_matrix], trainable=False))
model.add(Dropout(0.2))
model.add(LSTM(128,return_sequences=True,dropout=0.2))
# The second LSTM must also return the full sequence: Conv1D needs a
# (batch, steps, channels) input, which a last-state-only LSTM does not give.
model.add(LSTM(128,return_sequences=True,dropout=0.2))
model.add(Conv1D(64, kernel_size = 5, padding = "valid", kernel_initializer = "he_uniform"))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))

model.compile(loss = 'binary_crossentropy', optimizer = keras.optimizers.Adam(1e-2),metrics=metrics)

model.fit(
    X_train_pad,
    y_train,
    batch_size=256,
    epochs=30,
    verbose=1,
    validation_data=(X_test_pad, y_test),
    class_weight=class_weight,
    callbacks=[es,rlrp]
)

### Bidirectional LSTM

In [None]:
# Frozen pretrained embeddings -> two stacked bidirectional LSTMs -> sigmoid.
model = Sequential()
model.add(Embedding(vocabulary_size,EMBEDDING_DIM,input_length=max_length, weights=[embedding_matrix], trainable=False))
model.add(SpatialDropout1D(0.2))
model.add(Bidirectional(LSTM(128,dropout=0.2, return_sequences=True)))
# With no pooling layer in this variant the last recurrent layer returns only
# its final state, so the Dense head emits one probability per comment rather
# than one per time step.
model.add(Bidirectional(LSTM(64,dropout=0.2)))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss = 'binary_crossentropy', optimizer = keras.optimizers.Adam(1e-2),metrics=metrics)

model.fit(
    X_train_pad,
    y_train,
    batch_size=256,
    epochs=30,
    verbose=1,
    validation_data=(X_test_pad, y_test),
    class_weight=class_weight,
    callbacks=[es,rlrp]
)

### Bidirectional LSTM with CNN

In [None]:
# Frozen pretrained embeddings -> two bidirectional LSTMs (full sequences)
# -> Conv1D -> global max pool -> sigmoid.
model = Sequential([
    Embedding(vocabulary_size, EMBEDDING_DIM, input_length=max_length,
              weights=[embedding_matrix], trainable=False),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(128, dropout=0.2, return_sequences=True)),
    Bidirectional(LSTM(64, dropout=0.2, return_sequences=True)),
    Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform"),
    GlobalMaxPooling1D(),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(1e-2),
    metrics=metrics,
)

fit_options = dict(
    batch_size=256,
    epochs=30,
    verbose=1,
    validation_data=(X_test_pad, y_test),
    class_weight=class_weight,
    callbacks=[es, rlrp],
)
model.fit(X_train_pad, y_train, **fit_options)

#### Bidirectional GRU with CNN

In [None]:
# Frozen pretrained embeddings -> two bidirectional GRUs (full sequences)
# -> Conv1D -> global max pool -> sigmoid.
model = Sequential([
    Embedding(vocabulary_size, EMBEDDING_DIM, input_length=max_length,
              weights=[embedding_matrix], trainable=False),
    SpatialDropout1D(0.2),
    Bidirectional(GRU(128, dropout=0.2, return_sequences=True)),
    Bidirectional(GRU(64, dropout=0.2, return_sequences=True)),
    Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform"),
    GlobalMaxPooling1D(),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(1e-2),
    metrics=metrics,
)

fit_options = dict(
    batch_size=256,
    epochs=30,
    verbose=1,
    validation_data=(X_test_pad, y_test),
    class_weight=class_weight,
    callbacks=[es, rlrp],
)
model.fit(X_train_pad, y_train, **fit_options)

### Prediction on final set (this is repeated for all above methods)

In [None]:
# Encode the final test comments with the tokenizer fitted above and pad
# them to the same length as the training data.
# NOTE(review): the comments in test.csv are not passed through
# TextProcessing here - confirm the file is already cleaned.
test_set = pd.read_csv('test.csv')
X_test_final = test_set['comment_text'].to_numpy()
X_test_tokens_final = tokenizer.texts_to_sequences(X_test_final)
X_test_pad_final = pad_sequences(X_test_tokens_final, maxlen=max_length)

In [None]:
# NOTE(review): the predictions are made for ``test_set`` (test.csv) but are
# indexed by and merged onto ``test_df`` (model_test.csv) - confirm the two
# frames hold the same rows in the same order.
prediction = pd.DataFrame({
    'id': test_df.index,
    'prediction': model.predict(X_test_pad_final, batch_size=256)[:, 0],
})
# Probability -> hard 0/1 label at 0.5.
prediction['prediction'] = (prediction['prediction'] >= 0.5).astype(int)
merged = test_df.merge(prediction, left_index=True, right_index=True).fillna(0)
# Printing the results
print(f'Final Metric: {final_metric(merged)}')

Other methods that were tried were an LSTM without pretrained embeddings and an SVM, but they took too long to run, even after dimensionality reduction using SVD.
