# TF-IDF

The idea is to use the simple approach of tf-idf using pandas and sklearn. If training is slow, launch a large instance in AWS to run an extensive grid search.

## Quick look at the shape of the data

In [None]:
import pandas as pd

data = pd.read_csv("./data/train.csv")

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.describe()

## Evaluation of model on split train dataset

### Train/test split

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=543553)

In [None]:
# Take the first (train, test) index pair yielded by the splitter; the split
# is stratified on the "toxic" column.
trainIndex, testIndex = next(split.split(data, data.toxic))

In [None]:
train, test = data.iloc[trainIndex], data.iloc[testIndex]

### Word2Vec

In [None]:
def splitSentences(dataset):
    """Tokenize the ``comment_text`` column into lists of lowercase words.

    Every character that is neither an ASCII letter nor whitespace is
    removed, the remaining text is lowercased and then split on whitespace.

    dataset: object exposing a pandas string Series as ``comment_text``.
    Returns a Series holding one list of tokens per comment.
    """
    # regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave punctuation and digits in
    # place. The raw string avoids the invalid "\s" escape of a plain literal.
    return (dataset.comment_text
            .str.replace(r"[^A-Za-z\s]", "", regex=True)
            .str.lower()
            .str.split())

In [None]:
%%time
splitTrain = splitSentences(train)
splitTest = splitSentences(test)

In [None]:
%%time
import os

import gensim

# Location of the pre-trained GoogleNews word2vec binary. The default is the
# original machine-specific absolute path; set the WORD2VEC_PATH environment
# variable to point at a local copy instead of editing this cell.
word2vecPath = os.environ.get(
    "WORD2VEC_PATH",
    "/home/mariosk/Documents/common-ml-models/GoogleNews-vectors-negative300.bin")

# binary=True: the file is read in the binary word2vec format.
model = gensim.models.KeyedVectors.load_word2vec_format(word2vecPath, binary=True)

In [None]:
%%time
# Set of every word known to the loaded embedding, used for membership tests
# in wordsToVector.
vocabulary = set(model.wv.vocab)

In [None]:
%%time
def wordsToVector(words):
    """Look up the embedding vectors for the in-vocabulary words of a comment.

    words: iterable of tokens.
    Returns ``model.wv[...]`` indexed with the list of tokens found in
    ``vocabulary`` (presumably one vector per kept token -- confirm against
    the gensim version in use).
    """
    knownWords = [token for token in words if token in vocabulary]
    if not knownWords:
        # No token is in the vocabulary: fall back to the single word "hello"
        # so the lookup is never made with an empty list.
        knownWords = ["hello"]
    return model.wv[knownWords]

In [None]:
%%time
# Word2Vec features for the training split: every tokenized comment is mapped
# through wordsToVector.
# NOTE(review): trainFeatures is reassigned in the "Tf-idf features" section
# below, so when the notebook runs top to bottom the models are fit on tf-idf
# features, not on these vectors.
trainFeatures = splitTrain.apply(wordsToVector)

In [None]:
%%time
# Word2Vec features for the held-out split: every tokenized comment is mapped
# through wordsToVector.
# NOTE(review): testFeatures is reassigned in the "Tf-idf features" section
# below, so when the notebook runs top to bottom the evaluation uses tf-idf
# features, not these vectors.
testFeatures = splitTest.apply(wordsToVector)

### Tf-idf features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the training split only; the held-out split is merely
# transformed with this fitted vectorizer in a later cell.
tfidf = TfidfVectorizer().fit(train.comment_text)

In [None]:
trainFeatures = tfidf.transform(train.comment_text)

In [None]:
testFeatures = tfidf.transform(test.comment_text)

## Models for each category

In [None]:
def getLabels(dataset, categories):
    """Return a dict mapping each name in ``categories`` to ``dataset[name]``.

    dataset: any object indexable by category name (e.g. a DataFrame).
    categories: iterable of category names to extract.
    """
    labelColumns = {}
    for name in categories:
        labelColumns[name] = dataset[name]
    return labelColumns

In [None]:
from sklearn.linear_model import LogisticRegression

def getModels(datasetFeatures, labelColumns):
    """Fit one default LogisticRegression per category.

    datasetFeatures: feature matrix shared by every category.
    labelColumns: dict mapping category name to its label column.
    Returns a dict mapping category name to the value returned by ``fit``.
    """
    fittedModels = {}
    for (category, column) in labelColumns.items():
        classifier = LogisticRegression()
        fittedModels[category] = classifier.fit(datasetFeatures, column)
    return fittedModels

In [None]:
def getPredictions(models, datasetFeatures):
    """Run ``predict`` of every per-category model on the same features.

    models: dict mapping category name to a fitted model.
    Returns a dict mapping category name to that model's predictions.
    """
    predictedLabels = {}
    for (category, classifier) in models.items():
        predictedLabels[category] = classifier.predict(datasetFeatures)
    return predictedLabels

In [None]:
def getProbabilityPredictions(models, datasetFeatures):
    """Run ``predict_proba`` of every per-category model on the same features.

    models: dict mapping category name to a fitted model.
    Returns a dict mapping category name to that model's probability output.
    """
    probabilities = {}
    for (category, classifier) in models.items():
        probabilities[category] = classifier.predict_proba(datasetFeatures)
    return probabilities

In [None]:
# Label columns of the dataset; each one gets its own model below.
categories = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Label column per category, taken from the training split.
trainLabels = getLabels(train, categories)

# One fitted model per category, trained on whatever trainFeatures currently
# holds.
models = getModels(trainFeatures, trainLabels)

In [None]:
predictions = getPredictions(models, testFeatures)

In [None]:
predictionProbabilities = getProbabilityPredictions(models, testFeatures)

In [None]:
testLabels = getLabels(test, categories)

In [None]:
import numpy as np
import scipy.stats as stats

def cramersV(contmat):
    '''Compute Cramer's V and the chi-square p-value of a contingency table.

    contmat: 2-D table of observed counts (numpy array or array-like).
    Returns np.array([v, pvalue]) where v = sqrt(chi2 / (nobs * n)), nobs is
    the total count and n = min(rows - 1, columns - 1).

    Errors raised by scipy's chi2_contingency are not caught here; per the
    scipy documentation it raises ValueError when an expected frequency is
    zero (e.g. an all-zero row or column).
    '''
    table = np.asarray(contmat)
    nrow, ncol = table.shape
    nobs = table.sum()
    # correction=False: scipy applies Yates' continuity correction to 2x2
    # tables by default, which shrinks chi2 so that even a perfectly
    # associated table yields V < 1.
    chi2, pvalue, _, _ = stats.chi2_contingency(table, correction=False)
    n = min(nrow - 1, ncol - 1)
    v = np.sqrt(chi2 / (nobs * n))
    return np.array([v, pvalue])

In [None]:
def getMetrics(datasetLabels, predictions, predictionProbabilities):
    """Compute the evaluation metrics of every category.

    datasetLabels: dict mapping category name to the true label column.
    predictions: dict mapping category name to predicted labels.
    predictionProbabilities: dict mapping category name to the
        ``predict_proba`` output used for the logarithmic loss.

    Returns a dict mapping category name to a dict holding the confusion
    matrix, the confusion matrix divided by the number of predictions, the
    F1 score (3 decimals), the logarithmic loss (4 decimals) and the
    cramersV result for the confusion matrix.
    """
    metricsByCategory = {}
    # Iterate the categories actually passed in rather than the module-level
    # ``categories`` list, so the function depends only on its arguments.
    for category in datasetLabels:
        actual = datasetLabels[category]
        predicted = predictions[category]
        # Computed once and shared by the three entries derived from it.
        confusion = confusion_matrix(actual, predicted)
        metricsByCategory[category] = {
            "Confusion Matrix": confusion,
            "Relativized Confusion Matrix": confusion / float(len(predicted)),
            "F1 score": round(f1_score(actual, predicted, pos_label=1.0), 3),
            "Logarithmic loss": round(log_loss(actual, predictionProbabilities[category]), 4),
            "Cramer's V": cramersV(confusion)}
    return metricsByCategory

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, log_loss

metrics = getMetrics(testLabels, predictions, predictionProbabilities)

In [None]:
np.set_printoptions(precision=3, suppress=True)

In [None]:
def printMetrics(metrics):
    """Print every metric of every category, then the per-metric average.

    metrics: dict mapping category name to a dict of metric name -> value.
        Values must support ``+`` and division by a float (plain numbers or
        numpy arrays). The metric names of the first category are used for
        the averages, so every category is expected to share them.
    """
    for (category, metricValues) in metrics.items():
        print("")
        print("Category: {}".format(category))
        print("-"*50)
        for (name, value) in metricValues.items():
            print(name + ":")
            print(value)

    print("-"*50 + "\n" + "-"*50 + "\n")
    print("Average")
    print("-"*50)
    if not metrics:
        # Nothing to average over.
        return
    # next(iter(...)) instead of metrics.items()[0]: dictionary views cannot
    # be indexed on Python 3, while this form works on Python 2 and 3 alike.
    metricNames = list(next(iter(metrics.values())))
    for metric in metricNames:
        print(metric + ":")
        print(sum([value[metric] for value in metrics.values()]) / float(len(metrics)))

### Print metrics for test dataset

In [None]:
printMetrics(metrics)

### Print metrics for train dataset

In [None]:
printMetrics(getMetrics(
    trainLabels, 
    getPredictions(models, trainFeatures), 
    getProbabilityPredictions(models, trainFeatures)))

### Play with the model

In [None]:
def predict(sentence):
    """Score a single sentence with every per-category model.

    sentence: raw text, vectorized with the fitted ``tfidf``.
    Returns a dict mapping category name to element [0][1] of that model's
    ``predict_proba`` output, rounded to 3 decimals (presumably the
    probability of the positive class -- confirm against the label values).
    """
    # Vectorize once: the tf-idf features do not depend on the category.
    sentenceFeatures = tfidf.transform([sentence])
    return {category: round(model.predict_proba(sentenceFeatures)[0][1], 3)
            for (category, model) in models.items()}

In [None]:
predict("dick")

## Train on all data and evaluate on the contest test data

### Tf-idf

In [None]:
import pandas as pd

contestTrain = pd.read_csv("./data/train.csv")

In [None]:
contentTest = pd.read_csv("./data/test.csv")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

contestTfidf = TfidfVectorizer().fit(contestTrain.comment_text)

In [None]:
contestTrainFeatures = contestTfidf.transform(contestTrain.comment_text)

In [None]:
contestTestFeatures = contestTfidf.transform(contentTest.comment_text)

### Predictions

In [None]:
from sklearn.linear_model import LogisticRegression

categories = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Label column of the full training set for each category.
contestCategoryColumns = dict(
    (category, contestTrain[category]) for category in categories)

# One default LogisticRegression per category, fit on the full training set.
contestModels = dict(
    (category, LogisticRegression().fit(contestTrainFeatures, column))
    for (category, column) in contestCategoryColumns.items())

In [None]:
contestPredictionProbabilities = {category: model.predict_proba(contestTestFeatures) for (category, model) in contestModels.items()}

## Export result

In [None]:
# Assemble the submission frame: the test ids plus, per category, column 1 of
# the predict_proba output (presumably the positive-class probability).
suffledResult = pd.DataFrame(dict(
    {"id": contentTest.id},
    **{name: preds[:, 1] for (name, preds) in contestPredictionProbabilities.items()}))

# Select the columns explicitly so the output order is "id" followed by the
# categories in their declared order.
result = suffledResult[["id"] + categories]

In [None]:
contentTest.head()

In [None]:
result.head()

In [None]:
pd.read_csv("./data/sample_submission.csv").head()

In [None]:
result.to_csv("./submissions/simple-tf-idf-without-exponents.csv", index=False)