# TF-IDF

The idea is to use a simple TF-IDF approach with pandas and scikit-learn. If training is slow, launch a large AWS instance to run an extensive grid search.

## Quick look at the shape of the data

In [None]:
import pandas as pd

# Load the training CSV from the local data directory into a DataFrame.
data = pd.read_csv("./data/train.csv")

In [None]:
# Preview the first rows to check column names and value formats.
data.head()

In [None]:
# (number of rows, number of columns) of the training set.
data.shape

In [None]:
# Summary statistics (count, mean, quartiles, ...) for the columns pandas
# treats as numeric.
data.describe()

## Evaluation of model on split train dataset

### Train/test split

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified split holding out 20% of the rows; the fixed seed keeps
# the split reproducible between runs.
# The seed is a plain int: the Python 2 long-literal suffix (`543553L`) is a
# SyntaxError on Python 3.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=543553)

In [None]:
# Only one split is configured, so take the first (train, test) pair of
# positional row indices; stratification uses the `toxic` column.
trainIndex, testIndex = next(iter(split.split(data, data.toxic)))

In [None]:
# Select both partitions by positional row index.
train = data.iloc[trainIndex]
test = data.iloc[testIndex]

### Tf-idf features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectoriser on the training comments only, so nothing from the
# held-out rows is used when learning the vocabulary and IDF weights.
tfidf = TfidfVectorizer()
tfidf.fit(train.comment_text)

In [None]:
# TF-IDF matrix for the training comments, using the vectoriser fitted above.
trainFeatures = tfidf.transform(train.comment_text)

In [None]:
# TF-IDF matrix for the held-out comments, projected onto the vocabulary
# learned from the training comments.
testFeatures = tfidf.transform(test.comment_text)

## Models for each category

In [None]:
from sklearn.linear_model import LogisticRegression

# Label columns to predict; each one gets its own binary classifier.
categories = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
categoryColumns = {label: train[label] for label in categories}

# One logistic regression per label, all trained on the same TF-IDF features.
models = {
    label: LogisticRegression().fit(trainFeatures, target)
    for label, target in categoryColumns.items()
}

In [None]:
# Hard class predictions on the held-out features, keyed by category.
predictions = {
    label: classifier.predict(testFeatures)
    for label, classifier in models.items()
}

In [None]:
# Per-class probability estimates on the held-out features, keyed by category.
predictionProbabilities = {
    label: classifier.predict_proba(testFeatures)
    for label, classifier in models.items()
}

In [None]:
# Ground-truth label column of the held-out rows, keyed by category.
categoryColumnsTest = {
    label: test[label]
    for label in categories
}

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, log_loss

# Evaluation metrics per category. The confusion matrix and F1 score use the
# hard predictions; log loss uses the predicted probabilities.
metrics = {
    label: {
        "Confusion Matrix": confusion_matrix(categoryColumnsTest[label], predictions[label]),
        "F1 score": f1_score(categoryColumnsTest[label], predictions[label], pos_label=1.0),
        "Logarithmic loss": log_loss(categoryColumnsTest[label], predictionProbabilities[label]),
    }
    for label in categories
}

In [None]:
# Per-category report: print every metric computed for each category.
for (category, metricValues) in metrics.items():
    print("")
    print("Category: {}".format(category))
    print("-"*50)
    for (name, value) in metricValues.items():
        print(name + ":")
        print(value)

print("-"*50 + "\n" + "-"*50 + "\n")
print("Average")
print("-"*50)
# Average each metric over all categories. The metric names are read from the
# first category's dict through an iterator, because dict views cannot be
# indexed on Python 3 (`metrics.items()[0]` raises TypeError there). The `{}`
# default makes the loop a no-op when `metrics` is empty.
for metric in next(iter(metrics.values()), {}):
    print(metric + ":")
    # Confusion matrices are arrays, so this sum and division are element-wise.
    print(sum(value[metric] for value in metrics.values()) / float(len(metrics)))

### Play with the model

In [None]:
def predict(sentence):
    """Score a single sentence with every per-category model.

    Parameters
    ----------
    sentence : str
        Raw text to vectorise with the global ``tfidf`` vectoriser.

    Returns
    -------
    dict
        One entry per key of the global ``models`` dict; each value is
        column 1 of that model's ``predict_proba`` output for the sentence
        (the probability of the positive class, assuming 0/1 labels).
    """
    # Vectorise once: the features are the same for every category model.
    features = tfidf.transform([sentence])
    return {category: model.predict_proba(features)[0][1] for (category, model) in models.items()}

In [None]:
# Manual sanity check: score a single word with every category model.
predict("dick")

## Train on all data and evaluate on the contest test data

### Tf-idf

In [None]:
# Reload the full training CSV (the same file read into `data` earlier) so the
# final models are fitted on every labelled row.
contestTrain = pd.read_csv("./data/train.csv")

In [None]:
# Test CSV to generate predictions for.
# NOTE: the variable is spelled `contentTest` (not `contestTest`); later cells
# use this spelling.
contentTest = pd.read_csv("./data/test.csv")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a fresh vectoriser on the comments of the full training set.
contestTfidf = TfidfVectorizer()
contestTfidf.fit(contestTrain.comment_text)

In [None]:
# TF-IDF matrix for every training comment.
contestTrainFeatures = contestTfidf.transform(contestTrain.comment_text)

In [None]:
# TF-IDF matrix for the test comments, using the vectoriser fitted on the
# training comments.
contestTestFeatures = contestTfidf.transform(contentTest.comment_text)

### Predictions

In [None]:
from sklearn.linear_model import LogisticRegression

# Label columns to predict; each one gets its own binary classifier.
categories = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
contestCategoryColumns = {label: contestTrain[label] for label in categories}

# One logistic regression per label, fitted on the full training features.
contestModels = {
    label: LogisticRegression().fit(contestTrainFeatures, target)
    for label, target in contestCategoryColumns.items()
}

In [None]:
# Per-class probability estimates for the test comments, keyed by category.
contestPredictionProbabilities = {
    label: classifier.predict_proba(contestTestFeatures)
    for label, classifier in contestModels.items()
}

## Export result

In [None]:
# Build the submission table: the `id` column from the test set plus, for
# each category, column 1 of the predicted probabilities.
suffledResult = pd.DataFrame(dict(
    id=contentTest.id,
    **{name: probabilities[:, 1]
       for (name, probabilities) in contestPredictionProbabilities.items()}
))

# Put the columns in a fixed order: `id` first, then the categories as listed.
result = suffledResult[["id"] + categories]

In [None]:
# Preview the first rows of the test set.
contentTest.head()

In [None]:
# Preview the first rows of the submission table.
result.head()

In [None]:
# Show the first rows of the sample submission file, presumably to compare
# its layout with `result` before exporting.
pd.read_csv("./data/sample_submission.csv").head()

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the
# CSV.
result.to_csv("./submissions/simple-tf-idf.csv", index=False)