# Word features for the `toxic` label only

The idea is to construct word features with tf-idf and word2vec and then train a simple model such as logistic regression. 

## Quick look at the shape of the data

In [None]:
import pandas as pd

# Load only the columns used in this notebook: the comment id, the raw
# comment text, and the `toxic` column that serves as the target below.
data = pd.read_csv("./data/train.csv", usecols=["id", "comment_text", "toxic"])

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Summary statistics as reported by DataFrame.describe().
data.describe()

## Evaluation of the model on a train/test split of the training data

### Train/test split

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified shuffle split that holds out 20% of the rows; the fixed
# random_state keeps the split reproducible across runs.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=543553)

In [None]:
# The splitter is configured with n_splits=1, so the first (and only) pair of
# positional index arrays it yields is the train/test split, stratified on
# the `toxic` column.
trainIndex, testIndex = next(split.split(data, data.toxic))

In [None]:
# Materialize the two subsets; the indices are positional, hence iloc.
train, test = data.iloc[trainIndex], data.iloc[testIndex]

### Word2Vec

In [None]:
def splitSentences(dataset):
    """Tokenize the ``comment_text`` column into lists of lowercase words.

    Every character that is not an ASCII letter or whitespace is removed,
    the text is lowercased, and each comment is split on whitespace.

    Args:
        dataset: Object exposing a ``comment_text`` pandas Series of strings
            (for example a DataFrame with that column).

    Returns:
        A pandas Series holding one list of word strings per comment.
    """
    return (dataset.comment_text
            # regex=True must be explicit: pandas >= 2.0 treats the pattern
            # as a literal string by default, which would strip nothing.
            .str.replace(r"[^A-Za-z\s]", "", regex=True)
            .str.lower()
            .str.split())

In [None]:
%%time
# Tokenize the comments of both subsets with the same cleaning rules.
splitTrain = splitSentences(train)
splitTest = splitSentences(test)

In [None]:
%%time
import gensim

# Load pre-trained word vectors stored in the binary word2vec format.
# NOTE(review): the path is absolute and machine-specific; it has to be
# adjusted before this cell can run anywhere else.
model = gensim.models.KeyedVectors.load_word2vec_format(
    "/home/mariosk/Documents/common-ml-models/GoogleNews-vectors-negative300.bin",
    binary=True)

In [None]:
%%time
# Keep the known words in a set so membership tests in wordsToVector are cheap.
# NOTE(review): `.wv.vocab` looks like the gensim 3.x API; gensim 4 reportedly
# replaced `vocab` with `key_to_index` - confirm against the installed version.
vocabulary = set(model.wv.vocab.keys())

In [None]:
%%time
def wordsToVector(words):
    """Average the word vectors of the known words in ``words``.

    Words that are not in the module-level ``vocabulary`` set are ignored.
    When no word is known, the single word "hello" is used as a stand-in so
    the lookup never receives an empty list.

    Args:
        words: Iterable of word strings (one tokenized comment).

    Returns:
        The mean over axis 0 of the ``model.wv`` lookup for the kept words.
    """
    known = [token for token in words if token in vocabulary] or ["hello"]
    vectors = model.wv[known]
    return vectors.mean(axis=0)

In [None]:
import numpy as np

In [None]:
%%time
# One averaged word vector per training comment, collected into one array.
# NOTE: `trainFeatures` is reassigned by the Tf-idf cell further down, so
# these word2vec features only reach the classifier if that cell is skipped.
trainFeatures = np.array(splitTrain.apply(wordsToVector).tolist())

In [None]:
%%time
# Same embedding step for the held-out comments.
# NOTE: `testFeatures` is reassigned by the Tf-idf cell further down.
testFeatures = np.array(splitTest.apply(wordsToVector).tolist())

### Tf-idf features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the training comments only, so nothing is learned
# from the held-out comments.
tfidf = TfidfVectorizer().fit(train.comment_text)

In [None]:
# Tf-idf features for the training comments. This rebinds `trainFeatures`,
# replacing the word2vec features computed in the Word2Vec section.
trainFeatures = tfidf.transform(train.comment_text)

In [None]:
# Tf-idf features for the held-out comments, using the vectorizer fitted on
# the training comments. This rebinds `testFeatures` as well.
testFeatures = tfidf.transform(test.comment_text)

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression with default settings on the current features.
# NOTE(review): this rebinds `model`, which previously held the word2vec
# vectors; wordsToVector reads `model.wv`, so it cannot be re-run after this
# cell without reloading the vectors.
model = LogisticRegression().fit(trainFeatures, train.toxic)

In [None]:
# Hard class predictions for the held-out comments.
testPredictions = model.predict(testFeatures)

In [None]:
# Class probabilities for the held-out comments (used for the log loss).
testProbPredictions = model.predict_proba(testFeatures)

In [None]:
import numpy as np
import scipy.stats as stats

def cramersV(contmat):
    """Compute Cramér's V and a chi-square p-value for a contingency table.

    Args:
        contmat: 2-D contingency table of observed counts; it must expose a
            two-element ``shape`` and a ``sum`` method (e.g. a NumPy array).

    Returns:
        ``np.array([v, pvalue])`` where ``v`` is Cramér's V and ``pvalue`` is
        the p-value reported by ``scipy.stats.chi2_contingency`` with its
        default settings.

    Raises:
        ValueError: Propagated from ``chi2_contingency``, e.g. when an
            expected frequency is zero.
    """
    nrow, ncol = contmat.shape
    nobs = np.sum(contmat.sum())
    # Cramér's V is defined on the uncorrected chi-square statistic. The
    # default Yates correction (applied when there is one degree of freedom,
    # i.e. 2x2 tables) shrinks the statistic, so a perfectly associated 2x2
    # table would score below 1.
    chi2 = stats.chi2_contingency(contmat, correction=False)[0]
    # The p-value keeps SciPy's default (continuity-corrected) test.
    pvalue = stats.chi2_contingency(contmat)[1]
    n = min(nrow - 1, ncol - 1)
    v = np.sqrt(chi2 / (nobs * n))
    return np.array([v, pvalue])

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, log_loss

def getMetrics(label, predictions, probPredictions):
    """Collect evaluation metrics for a set of binary predictions.

    Args:
        label: True class labels.
        predictions: Predicted class labels, aligned with ``label``.
        probPredictions: Predicted probabilities, passed to ``log_loss``.

    Returns:
        dict keyed by metric name, in this order: the confusion matrix as a
        labelled DataFrame, the same matrix divided by the number of
        predictions, the F1 score (3 decimals), the logarithmic loss
        (4 decimals), and the ``cramersV`` result for the confusion matrix.
    """
    # Computed once and shared by the three entries derived from it.
    confusion = confusion_matrix(label, predictions)
    rowNames = ["T Neutral", "T Toxic"]
    colNames = ["P Neutral", "P Toxic"]

    return {
        "Confusion Matrix": pd.DataFrame(
            data=confusion, index=rowNames, columns=colNames),
        "Relativized Confusion Matrix": pd.DataFrame(
            data=confusion / float(len(predictions)),
            index=rowNames,
            columns=colNames),
        "F1 score": round(f1_score(label, predictions, pos_label=1.0), 3),
        "Logarithmic loss": round(log_loss(label, probPredictions), 4),
        "Cramer's V": cramersV(confusion),
    }

In [None]:
# Metrics of the classifier on the held-out comments.
metrics = getMetrics(test.toxic, testPredictions, testProbPredictions)

In [None]:
# Print NumPy arrays with 4 decimals and without scientific notation, which
# affects how array-valued metrics are shown below.
np.set_printoptions(precision=4, suppress=True)

In [None]:
from IPython.display import HTML, display

def printMetrics(metrics):
    """Show every metric under a bold HTML heading.

    Args:
        metrics: Mapping from metric name to its value, such as the dict
            returned by ``getMetrics``.
    """
    headingTemplate = "<div style='font-weight:bold'>{} :</div>"
    for name, value in metrics.items():
        heading = HTML(headingTemplate.format(name))
        display(heading)
        print(value)

### Print metrics for test dataset

In [None]:
# `metrics` holds the results computed on the held-out comments.
printMetrics(metrics)

### Print metrics for train dataset

In [None]:
# Score the model on the data it was trained on; comparing these numbers with
# the held-out metrics shows how much the model overfits.
trainPredictions = model.predict(trainFeatures)
trainProbPredictions = model.predict_proba(trainFeatures)

printMetrics(getMetrics(train.toxic, trainPredictions, trainProbPredictions))

### Play with the model

In [None]:
def predict(sentence):
    """Score a single sentence with the fitted tf-idf vectorizer and model.

    Args:
        sentence: Raw text to score.

    Returns:
        The second column of ``model.predict_proba`` for the sentence,
        rounded to 3 decimals (presumably the probability of the toxic
        class - confirm against ``model.classes_``).
    """
    features = tfidf.transform([sentence])
    probabilities = model.predict_proba(features)
    return round(probabilities[0][1], 3)

In [None]:
# Quick smoke test on a single word.
predict("test")