In [None]:
# Remove warnings
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd  
import matplotlib.pyplot as plt
%matplotlib inline 

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

import re

from nltk.tokenize import sent_tokenize # tokenizes sentences
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

eng_stopwords = stopwords.words('english')

import bs4 as bs

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Mounted at /content/drive


## Import Data

In [None]:
# Load the labelled training comments from the mounted Google Drive folder.
train = pd.read_csv("./drive/My Drive/Data-X: GGWP Toxic Behavior Public Data/data/train[1].csv")
# The matching test file is intentionally not loaded in this notebook.
# test = pd.read_csv("./drive/My Drive/Data-X: GGWP Toxic Behavior Public Data/data/test[1].csv")
# Preview the first rows (comment text plus one 0/1 column per label).
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


## Data Processing

In [None]:
# English stopwords, loaded once as a set. The original rebuilt this set from
# the NLTK corpus on every call, which is pure loop-invariant work.
_ENG_STOPWORDS = frozenset(stopwords.words("english"))

# Simple emoticons: eyes (: ; =), optional nose (-), mouth () ( D P).
# Written as a raw string so that "\)" and "\(" are regex escapes rather than
# invalid Python string escapes.
_EMOTICON_PATTERN = re.compile(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)')


def review_cleaner(review):
    """Normalize one raw comment into a space-separated string of tokens.

    Steps, in order:
      1. Strip HTML markup with BeautifulSoup and keep only the text.
      2. Collect emoticons (e.g. ``:)``, ``;-(``, ``=D``) from that text.
      3. Replace every non-letter character with a space.
      4. Lowercase the text and split it on whitespace.
      5. Drop English stopwords.
      6. Join the remaining words with single spaces and append the
         emoticons at the end.

    Parameters
    ----------
    review : str
        Raw comment text, possibly containing HTML.

    Returns
    -------
    str
        The cleaned words followed by any emoticons, separated by spaces.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between environments.
    text = bs.BeautifulSoup(review, "html.parser").text

    # Emoticons must be captured before punctuation is stripped below.
    emoticons = _EMOTICON_PATTERN.findall(text)

    words = re.sub("[^a-zA-Z]", " ", text).lower().split()
    kept_words = [word for word in words if word not in _ENG_STOPWORDS]

    return ' '.join(kept_words + emoticons)

# get cleaned dataset

comment_column = train['comment_text']
num_reviews = len(comment_column)

# Cleaned text of every training comment, in row order.
review_clean_original = []

for count in range(1, num_reviews + 1):
    # Report progress on every 500th comment, before that comment is cleaned.
    if count % 500 == 0:
        print("Done with %d reviews" % count)
    raw_comment = comment_column[count - 1]
    review_clean_original.append(review_cleaner(raw_comment))

Done with 500 reviews
Done with 1000 reviews
Done with 1500 reviews
Done with 2000 reviews
Done with 2500 reviews
Done with 3000 reviews
Done with 3500 reviews
Done with 4000 reviews
Done with 4500 reviews
Done with 5000 reviews
Done with 5500 reviews
Done with 6000 reviews
Done with 6500 reviews
Done with 7000 reviews
Done with 7500 reviews
Done with 8000 reviews
Done with 8500 reviews
Done with 9000 reviews
Done with 9500 reviews
Done with 10000 reviews
Done with 10500 reviews
Done with 11000 reviews
Done with 11500 reviews
Done with 12000 reviews
Done with 12500 reviews
Done with 13000 reviews
Done with 13500 reviews
Done with 14000 reviews
Done with 14500 reviews
Done with 15000 reviews
Done with 15500 reviews
Done with 16000 reviews
Done with 16500 reviews
Done with 17000 reviews
Done with 17500 reviews
Done with 18000 reviews
Done with 18500 reviews
Done with 19000 reviews
Done with 19500 reviews
Done with 20000 reviews
Done with 20500 reviews
Done with 21000 reviews
Done with 21

In [None]:
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank-style POS tag into a WordNet POS value.

    Tags starting with 'J', 'V', 'N' and 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag falls back to the literal ``'n'``.
    """
    # (tag prefix, attribute name on `wordnet`); checked in this order.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            # Look the attribute up only for the matching prefix.
            return getattr(wordnet, attr_name)

    return 'n'

# get Lemmatized data
  # Lemmatization (paraphrased from Wiki): 
  # Lemmatization is the process of grouping together the inflected forms of a word so they can be analysed as a single item, 
  # identified by the word's lemma, or dictionary form.

# Lemmatized version of every cleaned training comment, in row order.
review_clean_wnl = []

wnl = WordNetLemmatizer()

for count in range(1, num_reviews + 1):
    # Report progress on every 500th comment, before it is lemmatized.
    if count % 500 == 0:
        print("Done with %d reviews" % count)

    # POS-tag the tokens so each one is lemmatized with a matching WordNet POS.
    tagged_tokens = pos_tag(review_clean_original[count - 1].split())
    lemmas = [
        wnl.lemmatize(token, pos=get_wordnet_pos(tag))
        for token, tag in tagged_tokens
    ]

    review_clean_wnl.append(' '.join(lemmas))

Done with 500 reviews
Done with 1000 reviews
Done with 1500 reviews
Done with 2000 reviews
Done with 2500 reviews
Done with 3000 reviews
Done with 3500 reviews
Done with 4000 reviews
Done with 4500 reviews
Done with 5000 reviews
Done with 5500 reviews
Done with 6000 reviews
Done with 6500 reviews
Done with 7000 reviews
Done with 7500 reviews
Done with 8000 reviews
Done with 8500 reviews
Done with 9000 reviews
Done with 9500 reviews
Done with 10000 reviews
Done with 10500 reviews
Done with 11000 reviews
Done with 11500 reviews
Done with 12000 reviews
Done with 12500 reviews
Done with 13000 reviews
Done with 13500 reviews
Done with 14000 reviews
Done with 14500 reviews
Done with 15000 reviews
Done with 15500 reviews
Done with 16000 reviews
Done with 16500 reviews
Done with 17000 reviews
Done with 17500 reviews
Done with 18000 reviews
Done with 18500 reviews
Done with 19000 reviews
Done with 19500 reviews
Done with 20000 reviews
Done with 20500 reviews
Done with 21000 reviews
Done with 21

# Train Model

In [None]:
def predict_sentiment(cleaned_reviews, y=None, n_estimators=50,
                      max_features=2000, random_state=0):
    """Fit a bag-of-words random forest and report train/validation AUC.

    The reviews are split 80/20, a ``CountVectorizer`` is fitted on the
    training part only, and a ``RandomForestClassifier`` is trained on the
    resulting counts. ROC AUC is printed for both parts.

    Parameters
    ----------
    cleaned_reviews : sequence of str
        Pre-cleaned documents, one string per sample.
    y : array-like, optional
        Binary labels aligned with ``cleaned_reviews``. Defaults to
        ``train["toxic"]``, looked up when the function is called.
    n_estimators : int, default 50
        Number of trees in the forest.
    max_features : int, default 2000
        Vocabulary size kept by the vectorizer.
    random_state : int, default 0
        Seed used for both the train/validation split and the forest, so
        repeated runs print the same scores.

    Returns
    -------
    tuple
        ``(forest, vectorizer)`` - the fitted classifier and the fitted
        ``CountVectorizer``.
    """
    if y is None:
        # Resolve the default at call time. A default of `train["toxic"]` in
        # the signature would be frozen when the function is defined and go
        # stale if `train` is reloaded afterwards.
        y = train["toxic"]

    print("Creating the bag of words model!\n")
    # words to vector
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=max_features)

    # The separate test file is not used; a validation part is held out of
    # the training data instead.
    X_train, X_test, y_train, y_test = train_test_split(
        cleaned_reviews, y, random_state=random_state, test_size=.2)

    # Fit the vocabulary on the training part only, then reuse it unchanged
    # for the held-out part.
    train_bag = vectorizer.fit_transform(X_train).toarray()
    test_bag = vectorizer.transform(X_test).toarray()

    print("Training the random forest classifier!\n")
    # Seeded so the reported AUC is reproducible across runs.
    forest = RandomForestClassifier(n_estimators=n_estimators,
                                    random_state=random_state)
    forest = forest.fit(train_bag, y_train)

    train_predictions = forest.predict_proba(train_bag)
    test_predictions = forest.predict_proba(test_bag)

    # Column 1 of predict_proba is used as the score for the positive label.
    train_auc = metrics.roc_auc_score(y_train, train_predictions[:, 1])
    valid_auc = metrics.roc_auc_score(y_test, test_predictions[:, 1])
    print("The training auc is: ", train_auc, "\n", "The validation auc is: ", valid_auc)

    return forest, vectorizer

# Output

In [None]:
# Baseline: fit and score on the cleaned (non-lemmatized) comments, keeping
# the fitted forest and vectorizer.
print('Original Reviews')
forest1,vec1 = predict_sentiment(review_clean_original)

Original Reviews
Creating the bag of words model!

Training the random forest classifier!

The training accuracy is:  0.9989319545427238 
 The validation accuracy is:  0.9200282155291643


In [None]:
# Same pipeline on the lemmatized comments. `vec3` is the vectorizer that the
# validation cells further down reuse.
print('Lemmatizing')
forest3,vec3 = predict_sentiment(review_clean_wnl)

Lemmatizing
Creating the bag of words model!

Training the random forest classifier!

The training accuracy is:  0.9993001152483654 
 The validation accuracy is:  0.9341516540373583


# Get Validation Score

## Preprocess Validation Data

In [None]:
# Load the separate validation set. Later cells read its 'text' column and
# one label column per entry in `classes`.
val_data = pd.read_csv('./drive/My Drive/Data-X: GGWP Toxic Behavior Public Data/data/combined.csv')

In [None]:
# Cleaned text of every validation row, in row order.
val_clean = []

# Deliberately NOT stored in `num_reviews`: that name holds the training-set
# size and is read by the training lemmatization loop. Overwriting it here
# would make a re-run of that loop process the wrong number of rows.
num_val_reviews = len(val_data)

for i in range(0, num_val_reviews):
    if (i + 1) % 500 == 0:
        # print progress
        print("Done with %d reviews" % (i + 1))
    val_clean.append(review_cleaner(val_data['text'][i]))

# Lemmatized version of every cleaned validation row, in row order.
val_clean_wnl = []

wnl = WordNetLemmatizer()

for i in range(0, num_val_reviews):
    if (i + 1) % 500 == 0:
        # print progress
        print("Done with %d reviews" % (i + 1))

    # POS-tag the tokens so each one is lemmatized with a matching WordNet POS.
    wnl_stems = []
    token_tag = pos_tag(val_clean[i].split())
    for pair in token_tag:
        res = wnl.lemmatize(pair[0], pos=get_wordnet_pos(pair[1]))
        wnl_stems.append(res)

    val_clean_wnl.append(' '.join(wnl_stems))

Done with 500 reviews
Done with 1000 reviews
Done with 1500 reviews
Done with 2000 reviews
Done with 2500 reviews
Done with 500 reviews
Done with 1000 reviews
Done with 1500 reviews
Done with 2000 reviews
Done with 2500 reviews


In [None]:
# Vectorize the lemmatized validation text with the already-fitted `vec3`
# (transform only, no refit) and convert the result to a dense array.
val_bag = vec3.transform(val_clean_wnl).toarray()

## Get Validation Score

In [None]:
# Label columns to evaluate; each name is used to index both `train` and
# `val_data`.
classes = ['toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']

In [None]:
def train_validate(train_bag, val_bag, y_train, y_val, class_name,
                   n_estimators=50, random_state=0):
    """Train a random forest for one label and return its validation ROC AUC.

    Parameters
    ----------
    train_bag : array-like or sparse matrix
        Bag-of-words features for the training documents.
    val_bag : array-like or sparse matrix
        Bag-of-words features for the validation documents, built with the
        same vocabulary as ``train_bag``.
    y_train : array-like
        Binary training labels for ``class_name``.
    y_val : array-like
        Binary validation labels for ``class_name``.
    class_name : str
        Label name, used only in the progress messages.
    n_estimators : int, default 50
        Number of trees in the forest.
    random_state : int, default 0
        Seed for the forest, so repeated runs give the same AUC.

    Returns
    -------
    float
        ROC AUC of the fitted forest on the validation data.
    """
    print("Training the random forest classifier for", class_name, "!\n")
    # Seeded so the per-class scores are reproducible across runs.
    forest = RandomForestClassifier(n_estimators=n_estimators,
                                    random_state=random_state)
    forest = forest.fit(train_bag, y_train)

    val_predictions = forest.predict_proba(val_bag)

    # Column 1 of predict_proba is used as the score for the positive label.
    val_auc = metrics.roc_auc_score(y_val, val_predictions[:, 1])
    print("The validation auc for", class_name, "is: ", val_auc, "\n")

    return val_auc

In [None]:
# Features for the full lemmatized training set, built with the vocabulary
# already fitted in `vec3`.
train_bag = vec3.transform(review_clean_wnl)

# One validation AUC per label, in the same order as `classes`.
aucs = [
    train_validate(train_bag, val_bag, train[class_name], val_data[class_name], class_name)
    for class_name in classes
]

## Add Random Forest Average AUC to Model Results CSV

In [None]:
# model_results = pd.read_csv('./drive/My Drive/Data-X: GGWP Toxic Behavior Public Data/models/model_results.csv', index_col='Unnamed: 0')
# model_results = pd.concat([model_results, pd.DataFrame({'Model': ['Random Forest'], 'val_auc_score': [np.mean(aucs)]})])
# model_results.to_csv('./drive/My Drive/Data-X: GGWP Toxic Behavior Public Data/models/model_results.csv')