In [None]:
import os
import pickle
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Location of the dataset CSV. Set the COMPLETE_DATA_CSV environment variable
# to point at your own copy; when it is unset the original author's local path
# is used, so existing runs keep working unchanged.
DATA_PATH = os.environ.get(
    'COMPLETE_DATA_CSV',
    '/Users/shanecooke/Desktop/Official GitLab/CompleteData.csv',
)

df = pd.read_csv(DATA_PATH)
df

In [None]:
# Patterns are compiled once instead of on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_WORD_RE = re.compile(r'[^\w\s]')
_DIGIT_RE = re.compile(r'\d')
_WHITESPACE_RE = re.compile(r'\s+')


def preProcessText(text):
    """Normalise raw text for downstream tokenisation.

    Steps, in order: lower-case, remove ``<...>`` tags, replace ASCII
    punctuation with spaces, delete any remaining non-word/non-space
    characters, replace digits with spaces, collapse whitespace runs to a
    single space and trim both ends.

    Args:
        text: The raw text. ``None`` and float NaN (missing values) are
            treated as empty text; any other non-string is converted with
            ``str()``.

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    if not isinstance(text, str):
        # NaN is the only value that is not equal to itself.
        text = '' if text is None or text != text else str(text)

    text = text.lower().strip()
    text = _HTML_TAG_RE.sub('', text)
    text = _PUNCTUATION_RE.sub(' ', text)
    text = _NON_WORD_RE.sub('', text)
    text = _DIGIT_RE.sub(' ', text)
    # Digits at either end leave a stray space after collapsing, so trim last.
    return _WHITESPACE_RE.sub(' ', text).strip()

# Cache of stop-word sets keyed by language, filled on first use.
_STOPWORD_SETS = {}


def _englishStopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOPWORD_SETS:
        _STOPWORD_SETS['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_SETS['english']


def stopwordRemoval(string):
    """Drop English stop words from whitespace-separated text.

    Args:
        string: Text to filter; it is split on whitespace.

    Returns:
        The remaining tokens joined by single spaces, in their original order.
    """
    # The stop-word list is fetched once and held as a set; previously it was
    # re-read from the corpus and scanned linearly for every single token.
    stop_words = _englishStopwords()
    return ' '.join(token for token in string.split() if token not in stop_words)

# Shared WordNet lemmatizer instance, used by lemmatization() below.
wl = WordNetLemmatizer()

def tagMapping(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Tags beginning with V, N, J or R map to verb, noun, adjective and adverb
    respectively; any other tag is treated as a noun.
    """
    prefix_to_pos = (
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('J', wordnet.ADJ),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return wordnet.NOUN
    
def lemmatization(string):
    """Lemmatize every token of ``string`` and return them joined by spaces.

    The text is tokenized and POS-tagged with NLTK; each token is then
    lemmatized using the WordNet category that tagMapping() gives for its tag.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))
    lemmas = []
    for token, pos_tag in tagged_tokens:
        lemmas.append(wl.lemmatize(token, tagMapping(pos_tag)))
    return " ".join(lemmas)

def finalCleaning(string):
    """Run the full cleaning pipeline on one text.

    Applies preProcessText(), then stopwordRemoval(), then lemmatization(),
    and returns the final string.
    """
    normalised = preProcessText(string)
    without_stopwords = stopwordRemoval(normalised)
    return lemmatization(without_stopwords)

# Clean every entry of the 'Comment' column and keep the result in a new
# 'clean_text' column next to the raw text.
df['clean_text'] = df['Comment'].apply(finalCleaning)
df

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import texthero as hero
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import HashingVectorizer
from gensim.models import Word2Vec
import tensorflow_hub as hub

# --- TF-IDF (limited to 3000 features) ---
df['tfidf'] = hero.tfidf(df['clean_text'], max_features=3000)

# --- Doc2Vec ---
# Each document is tokenised once and the tokens are reused for training and
# inference. Iterating over the column's values (instead of looking rows up
# with df['clean_text'][i]) keeps this correct when the index is not 0..n-1.
tokenised_docs = [doc.split(' ') for doc in df['clean_text']]
card_docs = [TaggedDocument(tokens, [i]) for i, tokens in enumerate(tokenised_docs)]
model = Doc2Vec(vector_size=64, window=2, min_count=1, workers=8, epochs=40)
model.build_vocab(card_docs)
model.train(card_docs, total_examples=model.corpus_count, epochs=model.epochs)
temp = [model.infer_vector(tokens) for tokens in tokenised_docs]
dtv = np.array(temp).tolist()
df['Doc2Vec'] = dtv

# --- Hashing vectorizer (500 features) ---
vectorizer = HashingVectorizer(n_features=500)
hashed = vectorizer.transform(df['clean_text']).toarray()
hashList = np.array(hashed).tolist()
df['hashing'] = hashList

# --- Universal Sentence Encoder (loaded from TF Hub) ---
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
embeddings = embed(df['clean_text'])
use = np.array(embeddings).tolist()
df['USE'] = use

df

In [None]:
# Hyper-parameter search spaces, one per classifier family, for GridSearchCV.
randomForest_grid = dict(
    n_estimators=[200, 400, 600, 800, 1000],
    criterion=['gini', 'entropy'],
)
decisionTree_grid = dict(
    max_depth=[2, 4, 6, 8],
    splitter=['best', 'random'],
    criterion=['gini', 'entropy'],
)
svc_grid = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
)
adaboost_grid = dict(
    n_estimators=[50, 100, 150, 200],
    algorithm=['SAMME', 'SAMME.R'],
)
mlp_grid = dict(
    max_iter=[500, 1000, 1500],
    activation=['identity', 'logistic', 'tanh', 'relu'],
)
linearDis_grid = dict(
    solver=['svd', 'lsqr', 'eigen'],
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from statistics import mean
from sklearn.model_selection import GridSearchCV

# Fixed seed so the train/test split and the random forest give the same
# results on every run; without it the reported metrics change each time.
RANDOM_STATE = 42

# Features are the 'USE' column; the target is the 'Hateful' column.
X, y = df.USE.tolist(), df.Hateful

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# Alternative classifiers are kept below. When switching, also change the
# parameter grid passed to GridSearchCV so that it matches the classifier.
classifier = RandomForestClassifier(random_state=RANDOM_STATE)
#classifier = DecisionTreeClassifier()
#classifier = GaussianNB()
#classifier = SVC()
#classifier = AdaBoostClassifier()
#classifier = GaussianProcessClassifier()
#classifier = KNeighborsClassifier()
#classifier = MLPClassifier()
#classifier = XGBClassifier()
#classifier = LinearDiscriminantAnalysis()

grid = GridSearchCV(classifier, randomForest_grid, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

print(grid.best_params_)
grid_predictions = grid.predict(X_test)

print(classification_report(y_test, grid_predictions))

# average=None reports one score per class rather than a single aggregate.
print("\nPrecision: ", precision_score(y_test, grid_predictions, average=None))
print("Recall: ", recall_score(y_test, grid_predictions, average=None))
print("F1 Score: ", f1_score(y_test, grid_predictions, average=None))
print("Accuracy: ", accuracy_score(y_test, grid_predictions))