# TF-IDF

The idea is to use the simple approach of TF-IDF features with pandas and scikit-learn. If training is slow, launch a large AWS instance to run an extensive grid search.

## Quick look at the shape of the data

In [1]:
import pandas as pd

# Load the training CSV (comment text plus label columns) from the local data directory.
data = pd.read_csv("./data/train.csv")

In [2]:
# Preview the first rows to see which columns are available.
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
# (number of rows, number of columns)
data.shape

(159571, 8)

In [4]:
# Summary statistics of the numeric label columns; the mean of a 0/1 label is its positive rate.
data.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


## Evaluation of model on split train dataset

### Train/test split

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified shuffle split holding out 20% for testing; the seed is fixed so the split is repeatable.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=543553)

In [6]:
# Stratify on the `toxic` label only and take the first (and only) split.
trainIndex, testIndex = next(split.split(data, data.toxic))

In [7]:
# Materialize both partitions by positional index.
train = data.iloc[trainIndex]
test = data.iloc[testIndex]

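### Word2Vec

Alternative features built from averaged pretrained word2vec embeddings. Note that `trainFeatures` and `testFeatures` computed in this section are overwritten by the Tf-idf section below, so the models that follow are trained on the tf-idf features.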

In [8]:
def splitSentences(dataset):
    """Tokenize the ``comment_text`` column into lists of lowercase words.

    Every character that is neither an ASCII letter nor whitespace is removed,
    the text is lowercased, and the result is split on whitespace.

    Parameters
    ----------
    dataset : object exposing a ``comment_text`` pandas Series of strings.

    Returns
    -------
    pandas.Series
        One list of tokens per comment, aligned with the input index.
    """
    return (dataset.comment_text
            # The pattern must be applied as a regular expression; recent pandas
            # versions treat it as a literal string unless regex=True is given.
            .str.replace(r"[^A-Za-z\s]", "", regex=True)
            .str.lower()
            .str.split())

In [9]:
%%time
# Tokenize both partitions with the same cleaning rules.
splitTrain, splitTest = map(splitSentences, (train, test))

CPU times: user 3.25 s, sys: 152 ms, total: 3.4 s
Wall time: 3.41 s


In [10]:
%%time
import gensim

# Load pretrained word vectors stored in the binary word2vec format.
# NOTE(review): the path is absolute and machine-specific; it has to be
# adjusted before this cell can run on another machine.
model = gensim.models.KeyedVectors.load_word2vec_format(
    "/home/mariosk/Documents/common-ml-models/GoogleNews-vectors-negative300.bin", 
    binary=True)  

CPU times: user 52.4 s, sys: 5.79 s, total: 58.2 s
Wall time: 58.4 s


In [11]:
%%time
# Keep the known words in a set so wordsToVector can do fast membership tests.
# NOTE(review): the `.wv.vocab` accessors are presumably tied to the gensim
# version in use — verify when upgrading gensim.
vocabulary = set(model.wv.vocab.keys())

CPU times: user 364 ms, sys: 28 ms, total: 392 ms
Wall time: 389 ms


In [46]:
%%time
def wordsToVector(words):
    """Average the embeddings of the tokens in ``words`` that are in the vocabulary.

    Tokens missing from the global ``vocabulary`` are ignored. When no token
    remains, the embedding of "hello" is used as a stand-in so that a vector
    is always returned.
    """
    known = []
    for token in words:
        if token in vocabulary:
            known.append(token)

    lookup = known or ["hello"]
    return model.wv[lookup].mean(axis=0)

CPU times: user 25.8 s, sys: 84 ms, total: 25.9 s
Wall time: 25.9 s


In [None]:
import numpy as np

In [None]:
%%time
# One averaged embedding per training comment, stacked into a single NumPy array.
trainFeatures = np.array(splitTrain.apply(wordsToVector).tolist())

In [49]:
%%time
# Same embedding averaging for the held-out comments.
testFeatures = np.array(splitTest.apply(wordsToVector).tolist())

CPU times: user 6.48 s, sys: 0 ns, total: 6.48 s
Wall time: 6.48 s


### Tf-idf features

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the training split only, so the held-out split does not influence it.
tfidf = TfidfVectorizer().fit(train.comment_text)

In [9]:
# NOTE: reuses the name `trainFeatures`, replacing the value assigned in the Word2Vec section.
trainFeatures = tfidf.transform(train.comment_text)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [10]:
# Transform the held-out comments with the vectorizer fitted on the training split.
testFeatures = tfidf.transform(test.comment_text)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


## Models for each category

In [11]:
def getLabels(dataset, categories):
    """Collect the label column of every category into a dict keyed by category name.

    ``dataset`` is indexed with each entry of ``categories`` in order; the
    returned dict preserves that order.
    """
    labels = {}
    for name in categories:
        labels[name] = dataset[name]
    return labels

In [12]:
from sklearn.linear_model import LogisticRegression

def getModels(datasetFeatures, labelColumns):
    """Fit one default ``LogisticRegression`` per category on the same features.

    ``labelColumns`` maps category name to its label column; the result maps
    category name to the fitted classifier.
    """
    fitted = {}
    for category, column in labelColumns.items():
        classifier = LogisticRegression()
        fitted[category] = classifier.fit(datasetFeatures, column)
    return fitted

In [13]:
def getPredictions(models, datasetFeatures):
    """Return ``{category: model.predict(datasetFeatures)}`` for every model in ``models``."""
    predicted = {}
    for category, classifier in models.items():
        predicted[category] = classifier.predict(datasetFeatures)
    return predicted

In [14]:
def getProbabilityPredictions(models, datasetFeatures):
    """Return ``{category: model.predict_proba(datasetFeatures)}`` for every model in ``models``."""
    probabilities = {}
    for category, classifier in models.items():
        probabilities[category] = classifier.predict_proba(datasetFeatures)
    return probabilities

In [15]:
# The label columns to model; each one gets its own classifier.
categories = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Label column per category, taken from the training split.
trainLabels = getLabels(train, categories)

# One fitted model per category, all trained on the same feature matrix.
models = getModels(trainFeatures, trainLabels)

In [16]:
# Hard class predictions on the held-out split, per category.
predictions = getPredictions(models, testFeatures)

In [17]:
# Class probabilities on the held-out split, per category (used for the log loss).
predictionProbabilities = getProbabilityPredictions(models, testFeatures)

In [18]:
# Ground-truth label column per category for the held-out split.
testLabels = getLabels(test, categories)

In [19]:
import numpy as np
import scipy.stats as stats

def cramersV(contmat):
    """Compute Cramér's V and the chi-squared p-value of a contingency matrix.

    The chi-squared statistic comes from ``scipy.stats.chi2_contingency``
    called with its default options.

    Parameters
    ----------
    contmat : 2-D table exposing ``.shape`` and ``.sum()``.

    Returns
    -------
    numpy.ndarray
        Two elements: ``[v, pvalue]``.
    """
    rowCount, colCount = contmat.shape
    total = np.sum(contmat.sum())
    chi2, pvalue, _dof, _expected = stats.chi2_contingency(contmat)

    # Normalize by the sample size and the smaller table dimension minus one.
    smallerDim = min(rowCount - 1, colCount - 1)
    strength = np.sqrt(chi2 / (total * smallerDim))
    return np.array([strength, pvalue])

In [20]:
def getMetrics(datasetLabels, predictions, predictionProbabilities):
    """Compute the evaluation metrics of every category.

    Parameters
    ----------
    datasetLabels : mapping of category name to its true labels.
    predictions : mapping of category name to the predicted labels.
    predictionProbabilities : mapping of category name to the predicted
        class probabilities (passed to ``log_loss``).

    Returns
    -------
    dict
        ``{category: {metric name: value}}`` with the confusion matrix, the
        confusion matrix divided by the number of predictions, the F1 score
        (rounded to 3 decimals), the logarithmic loss (rounded to 4 decimals)
        and the result of ``cramersV`` on the confusion matrix.
    """
    metrics = {}
    # Iterate over the categories actually passed in rather than a notebook
    # global, so the function only depends on its arguments.
    for category in datasetLabels:
        expected = datasetLabels[category]
        predicted = predictions[category]

        # Computed once and reused by the three metrics derived from it.
        confusion = confusion_matrix(expected, predicted)

        metrics[category] = {
            "Confusion Matrix": confusion,
            "Relativized Confusion Matrix": confusion / float(len(predicted)),
            "F1 score": round(f1_score(expected, predicted, pos_label=1.0), 3),
            "Logarithmic loss": round(log_loss(expected, predictionProbabilities[category]), 4),
            "Cramer's V": cramersV(confusion)}
    return metrics

In [21]:
from sklearn.metrics import confusion_matrix, f1_score, log_loss

# Evaluate the per-category models on the held-out split.
metrics = getMetrics(testLabels, predictions, predictionProbabilities)

In [22]:
# Print arrays with 4 decimals and without scientific notation so the matrices below stay readable.
np.set_printoptions(precision=4, suppress=True)

In [23]:
def printMetrics(metrics):
    """Print the metrics of every category, then the average of each metric.

    Parameters
    ----------
    metrics : mapping of category name to a ``{metric name: value}`` dict.
        The metric names of the first category decide which averages are
        printed; values must support ``+`` and division by a float.

    An empty ``metrics`` mapping prints only the "Average" header, because
    there is nothing to average.
    """
    separator = "-" * 50

    for category, metricValues in metrics.items():
        print("")
        print("Category: {}".format(category))
        print(separator)
        for name, value in metricValues.items():
            print(name + ":")
            print(value)

    print(separator + "\n" + separator + "\n")
    print("Average")
    print(separator)

    # Guard against an empty mapping: there is no first category to take the
    # metric names from.
    if not metrics:
        return

    perCategory = list(metrics.values())
    for metric in perCategory[0]:
        print(metric + ":")
        print(sum(values[metric] for values in perCategory) / float(len(perCategory)))

### Print metrics for test dataset

In [24]:
# Report for the held-out split.
printMetrics(metrics)


Category: toxic
--------------------------------------------------
Confusion Matrix:
[[28690   166]
 [ 1219  1840]]
Relativized Confusion Matrix:
[[0.899  0.0052]
 [0.0382 0.0577]]
F1 score:
0.727
Logarithmic loss:
0.117
Cramer's V:
[0.7224 0.    ]

Category: severe_toxic
--------------------------------------------------
Confusion Matrix:
[[31532    68]
 [  229    86]]
Relativized Confusion Matrix:
[[0.988  0.0021]
 [0.0072 0.0027]]
F1 score:
0.367
Logarithmic loss:
0.0274
Cramer's V:
[0.3841 0.    ]

Category: obscene
--------------------------------------------------
Confusion Matrix:
[[30139    88]
 [  664  1024]]
Relativized Confusion Matrix:
[[0.9444 0.0028]
 [0.0208 0.0321]]
F1 score:
0.731
Logarithmic loss:
0.0679
Cramer's V:
[0.7365 0.    ]

Category: threat
--------------------------------------------------
Confusion Matrix:
[[31803     8]
 [   98     6]]
Relativized Confusion Matrix:
[[0.9965 0.0003]
 [0.0031 0.0002]]
F1 score:
0.102
Logarithmic loss:
0.0129
Cramer's V:
[0.

### Print metrics for train dataset

In [25]:
# Same report on the data the models were fitted on, for comparison with the held-out numbers above.
printMetrics(getMetrics(
    trainLabels, 
    getPredictions(models, trainFeatures), 
    getProbabilityPredictions(models, trainFeatures)))


Category: toxic
--------------------------------------------------
Confusion Matrix:
[[114962    459]
 [  4435   7800]]
Relativized Confusion Matrix:
[[0.9006 0.0036]
 [0.0347 0.0611]]
F1 score:
0.761
Logarithmic loss:
0.1022
Cramer's V:
[0.7581 0.    ]

Category: severe_toxic
--------------------------------------------------
Confusion Matrix:
[[126206    170]
 [   943    337]]
Relativized Confusion Matrix:
[[0.9886 0.0013]
 [0.0074 0.0026]]
F1 score:
0.377
Logarithmic loss:
0.0235
Cramer's V:
[0.4143 0.    ]

Category: obscene
--------------------------------------------------
Confusion Matrix:
[[120614    281]
 [  2386   4375]]
Relativized Confusion Matrix:
[[0.9448 0.0022]
 [0.0187 0.0343]]
F1 score:
0.766
Logarithmic loss:
0.0574
Cramer's V:
[0.7702 0.    ]

Category: threat
--------------------------------------------------
Confusion Matrix:
[[127267     15]
 [   318     56]]
Relativized Confusion Matrix:
[[0.997  0.0001]
 [0.0025 0.0004]]
F1 score:
0.252
Logarithmic loss:
0.008

### Play with the model

In [26]:
def predict(sentence):
    """Score a single sentence with every per-category model.

    Uses the notebook-level ``tfidf`` vectorizer and ``models`` dict.

    Returns a dict mapping each category to the value in column 1 of that
    model's ``predict_proba`` output, rounded to 3 decimals.
    """
    # Vectorize once; the features are the same for every category model.
    features = tfidf.transform([sentence])
    return {category: round(model.predict_proba(features)[0][1], 3)
            for (category, model) in models.items()}

In [27]:
# Sanity check: score a single-word input with every category model.
predict("dick")

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


{'identity_hate': 0.031,
 'insult': 0.843,
 'obscene': 0.995,
 'severe_toxic': 0.441,
 'threat': 0.005,
 'toxic': 0.995}

## Train on all data and evaluate on the contest test data

### Tf-idf

In [28]:
import pandas as pd

# Reload the full training file; this section fits on all of it, with no held-out split.
contestTrain = pd.read_csv("./data/train.csv")

In [29]:
# Contest test file.
# NOTE(review): the variable is spelled `contentTest` (not `contestTest`); later cells rely on this spelling.
contentTest = pd.read_csv("./data/test.csv")

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a fresh vectorizer on all training comments.
contestTfidf = TfidfVectorizer().fit(contestTrain.comment_text)

In [31]:
# Tf-idf features of the full training set.
contestTrainFeatures = contestTfidf.transform(contestTrain.comment_text)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [32]:
# Tf-idf features of the contest test set, using the vectorizer fitted on the training comments.
contestTestFeatures = contestTfidf.transform(contentTest.comment_text)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


### Predictions

In [33]:
from sklearn.linear_model import LogisticRegression

# The label columns to model; each one gets its own classifier.
categories = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
contestCategoryColumns = dict((name, contestTrain[name]) for name in categories)

# One default logistic regression per category, fitted on the full training features.
contestModels = dict(
    (name, LogisticRegression().fit(contestTrainFeatures, labels))
    for (name, labels) in contestCategoryColumns.items())

In [34]:
# Class probabilities for every contest test comment, per category.
contestPredictionProbabilities = {category: model.predict_proba(contestTestFeatures) for (category, model) in contestModels.items()}

## Export result

In [35]:
# Build the submission table: the id column plus column 1 of each category's
# predict_proba output (presumably the positive-class probability — confirm
# against the fitted models' class order).
suffledResult = pd.DataFrame({
    "id": contentTest.id,
    **{name: preds[:, 1] for (name, preds) in contestPredictionProbabilities.items()}})

# Put the columns in a fixed order: id first, then the categories as listed.
result = suffledResult[["id"] + categories]

In [36]:
# Preview the contest test rows to compare their ids with the result table below.
contentTest.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [37]:
# Preview the submission table.
result.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.998337,0.17939,0.994815,0.046207,0.961407,0.280959
1,0000247867823ef7,0.0066,0.001563,0.003997,0.000433,0.005721,0.003058
2,00013b17ad220c46,0.04241,0.006402,0.020804,0.001987,0.023446,0.006136
3,00017563c3f7919a,0.003176,0.001637,0.003099,0.001038,0.003707,0.00093
4,00017695ad8997eb,0.034391,0.004016,0.01118,0.001724,0.011583,0.00361


In [None]:
# Preview the sample submission file for comparison with `result`.
pd.read_csv("./data/sample_submission.csv").head()

In [None]:
# Write the submission without the DataFrame index column.
# NOTE(review): assumes the ./submissions directory already exists — confirm before running.
result.to_csv("./submissions/simple-tf-idf-without-exponents.csv", index=False)