In [1]:
import warnings
warnings.simplefilter("ignore")
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
import matplotlib.pyplot as plt
%matplotlib inline
import gensim
import nltk
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression

In [2]:
# Read both competition splits, then run every comment through
# gensim.utils.simple_preprocess so each document becomes a sequence of words
# (the later cells iterate over these per-document sequences).
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train_text, test_text = train['comment_text'], test['comment_text']
train_corpus = [gensim.utils.simple_preprocess(comment) for comment in train_text]
test_corpus = [gensim.utils.simple_preprocess(comment) for comment in test_text]

In [3]:
def get_coefs(word, *arr):
    """Pair a token with its coefficients converted to a float32 array.

    Parameters
    ----------
    word : the token; returned unchanged.
    *arr : the remaining fields (numbers or numeric strings).

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a 1-D ``float32`` numpy array
        holding ``arr`` in order (empty when no coefficients are given).
    """
    vector = np.array(arr, dtype=np.float32)
    return word, vector


def _read_text_vectors(path):
    """Parse a space-delimited text embedding file into ``{token: float32 vector}``.

    Each line is split on single spaces and handed to ``get_coefs``; when a
    token occurs more than once the last occurrence wins.
    """
    vectors = {}
    # Encoding is pinned so parsing does not depend on the platform locale
    # (assumes the pretrained files are UTF-8, as the published downloads are).
    # The context manager closes the handle once the file is consumed.
    with open(path, encoding="utf-8") as handle:
        for line in handle:
            fields = line.strip().split(" ")
            # word2vec-style text files (fastText ``.vec``) begin with a
            # "<vocab size> <dimension>" header; it is not a word vector.
            if len(fields) == 2:
                continue
            word, vector = get_coefs(*fields)
            vectors[word] = vector
    return vectors


def get_embedding(pretrained_word_vectors):
    """Load one of the supported pretrained embedding tables.

    Parameters
    ----------
    pretrained_word_vectors : str
        ``"google"``, ``"glove"`` or ``"fasttext"``.

    Returns
    -------
    For ``"google"`` the object returned by
    ``gensim.models.KeyedVectors.load_word2vec_format`` (binary file);
    for ``"glove"`` / ``"fasttext"`` a ``dict`` mapping token -> float32 vector.

    Raises
    ------
    ValueError
        If ``pretrained_word_vectors`` is not one of the supported names
        (previously this surfaced as an ``UnboundLocalError``).
    """
    if pretrained_word_vectors == "google":
        EMBEDDING_PATH = "../embeddings/GoogleNews-vectors-negative300.bin"
        return gensim.models.KeyedVectors.load_word2vec_format(EMBEDDING_PATH, binary=True)
    if pretrained_word_vectors == "glove":
        return _read_text_vectors("../embeddings/glove.840B.300d.txt")
    if pretrained_word_vectors == "fasttext":
        return _read_text_vectors("../embeddings/crawl-300d-2M.vec")
    raise ValueError(
        "unknown pretrained_word_vectors %r; expected 'google', 'glove' or 'fasttext'"
        % (pretrained_word_vectors,)
    )

In [4]:
def calc_vec(corpus, pretrained_word_vectors, embedding_index, dim=300):
    """Average the word vectors of every document in ``corpus``.

    For each word the embedding is looked up as-is; if it is missing, the
    English Snowball stem of the word is tried; if that is missing too the
    word is skipped. A document with no known words yields an all-zero row.

    Parameters
    ----------
    corpus : sequence of word sequences (one per document); must support ``len``.
    pretrained_word_vectors : str
        ``"google"`` means ``embedding_index`` is queried with
        ``get_vector(word)`` and a ``KeyError`` signals a missing word; any
        other value means ``embedding_index.get(word)`` is used and ``None``
        signals a missing word.
    embedding_index : the lookup table matching ``pretrained_word_vectors``.
    dim : int, default 300
        Length of each embedding vector (and of each output row).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(corpus), dim)`` (numpy's default float dtype),
        row ``i`` being the mean vector of document ``i``.
    """
    english_stemmer = nltk.stem.SnowballStemmer("english")
    keyed_vectors = pretrained_word_vectors == "google"

    def lookup(token):
        # Return the vector for ``token`` or None when it is not in the table.
        if keyed_vectors:
            try:
                return embedding_index.get_vector(token)
            except KeyError:
                # Only "word not in vocabulary" is expected here; any other
                # error is a real failure and must not be silently dropped.
                return None
        return embedding_index.get(token)

    result = np.zeros((len(corpus), dim))
    for row, tokens in enumerate(corpus):
        total = np.zeros(dim)
        found = 0
        for word in tokens:
            vector = lookup(word)
            if vector is None:
                # Fall back to the stemmed form before giving up on the word.
                vector = lookup(english_stemmer.stem(word))
                if vector is None:
                    continue
            total = total + vector
            found += 1
        if found:
            total /= found
        result[row] = total
    return result

In [5]:
# GoogleNews word2vec features -> one logistic regression per label.
embedding_index = get_embedding("google")
X_train = calc_vec(train_corpus, "google", embedding_index)
X_test = calc_vec(test_corpus, "google", embedding_index)

classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
predictions = {'id': test['id']}
cross_val_scores = []
for class_name in classes:
    y_train = train[class_name]
    clf = LogisticRegression(random_state=0)
    # Cross-validated ROC AUC for this label, averaged over the folds.
    fold_scores = cross_val_score(clf, X_train, y_train, scoring='roc_auc')
    cross_val_scores.append(np.mean(fold_scores))
    # Refit on the full training set and keep column 1 of predict_proba.
    clf.fit(X_train, y_train)
    test_proba = clf.predict_proba(X_test)
    predictions[class_name] = test_proba[:, 1]

# Report the mean of the per-label scores.
mean_cv_score = np.mean(cross_val_scores)
print("cross validation score :", mean_cv_score)

submission = pd.DataFrame(predictions, columns=["id"] + classes)
submission.to_csv(
    'submission/v2_Word2Vec_google_submission.csv.gz',
    compression="gzip",
    index=False,
)

cross validation score : 0.963616013836455


In [6]:
# GloVe features -> one logistic regression per label.
embedding_index = get_embedding("glove")
X_train = calc_vec(train_corpus, "glove", embedding_index)
X_test = calc_vec(test_corpus, "glove", embedding_index)

classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
predictions = {'id': test['id']}
cross_val_scores = []
for class_name in classes:
    y_train = train[class_name]
    clf = LogisticRegression(random_state=0)
    # Cross-validated ROC AUC for this label, averaged over the folds.
    fold_scores = cross_val_score(clf, X_train, y_train, scoring='roc_auc')
    cross_val_scores.append(np.mean(fold_scores))
    # Refit on the full training set and keep column 1 of predict_proba.
    clf.fit(X_train, y_train)
    test_proba = clf.predict_proba(X_test)
    predictions[class_name] = test_proba[:, 1]

# Report the mean of the per-label scores.
mean_cv_score = np.mean(cross_val_scores)
print("cross validation score :", mean_cv_score)

submission = pd.DataFrame(predictions, columns=["id"] + classes)
submission.to_csv(
    'submission/v2_Word2Vec_glove_submission.csv.gz',
    compression="gzip",
    index=False,
)

cross validation score : 0.9674673820856302


In [7]:
# fastText features -> one logistic regression per label.
embedding_index = get_embedding("fasttext")
X_train = calc_vec(train_corpus, "fasttext", embedding_index)
X_test = calc_vec(test_corpus, "fasttext", embedding_index)

classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
predictions = {'id': test['id']}
cross_val_scores = []
for class_name in classes:
    y_train = train[class_name]
    clf = LogisticRegression(random_state=0)
    # Cross-validated ROC AUC for this label, averaged over the folds.
    fold_scores = cross_val_score(clf, X_train, y_train, scoring='roc_auc')
    cross_val_scores.append(np.mean(fold_scores))
    # Refit on the full training set and keep column 1 of predict_proba.
    clf.fit(X_train, y_train)
    test_proba = clf.predict_proba(X_test)
    predictions[class_name] = test_proba[:, 1]

# Report the mean of the per-label scores.
mean_cv_score = np.mean(cross_val_scores)
print("cross validation score :", mean_cv_score)

submission = pd.DataFrame(predictions, columns=["id"] + classes)
submission.to_csv(
    'submission/v2_Word2Vec_fasttext_submission.csv.gz',
    compression="gzip",
    index=False,
)

cross validation score : 0.969067032764253
