In [1]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
import numpy as np
import pandas as pd
from collections import defaultdict
import re
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import (
    roc_auc_score,
    mean_squared_error,
    accuracy_score,
    mean_absolute_error,
    recall_score,
    precision_score,
    f1_score,
)

In [2]:
with open("glove/glove.6B.100d.txt", "r") as f:
    w2v = {}
    for line in f:
        split_line = line.split()
        word = split_line[0]
        embedding = np.array(split_line[1:], dtype=np.float64)
        w2v[word] = list(embedding)


In [3]:
def clean_text(text):
    text = text.lower()
    ## remove \n \t and non-alphanumeric
    text = re.sub("(\\t|\\n)", " ", text)
    text = re.sub("[^a-zA-Z']", " ", text)
    text = text.strip()
    ## leave 1 space between each token
    text = " ".join([x for x in text.split(" ") if len(x.strip()) > 0])
    return text.strip()


def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))


toxic_labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]


In [4]:
raw = pd.read_csv("train_evenly_distributed.csv")
raw["comment_text"] = raw["comment_text"].apply(lambda x: clean_text(x))
raw["BINARY"] = raw[toxic_labels].apply(
    lambda x: int("".join(x.values.astype(str)), 2), axis=1
)
for k, v in dict(raw.BINARY.value_counts()).items():
    if v < 10:
        raw = raw[raw.BINARY != k]
_train, _test = train_test_split(
    raw,
    test_size=0.2,
    stratify=raw["BINARY"],
)

In [5]:
X_train = []
for x in _train["comment_text"]:
    tmp = [0 for i in range(100)]
    for word in x.split(" "):
        if w2v.get(word, None) != None:
            tmp = np.add(w2v.get(word), tmp)
    X_train.append(tmp)

X_test = []
for x in _test["comment_text"]:
    tmp = [0 for i in range(100)]
    for word in x.split(" "):
        if w2v.get(word, None) != None:
            tmp = np.add(w2v.get(word), tmp)
    X_test.append(tmp)

In [6]:
label_encoder = {k:i for i, k in enumerate(dict(_train.BINARY.value_counts()).keys())}
label_encoder_reversed = {i:k for i, k in enumerate(dict(_train.BINARY.value_counts()).keys())}
y_test = [label_encoder[x] for x in _test["BINARY"]]
y_train = [label_encoder[x] for x in _train["BINARY"]]

In [7]:
dtrain = xgb.DMatrix(pd.DataFrame.from_records(X_train), label=y_train)
dtest = xgb.DMatrix(pd.DataFrame.from_records(X_test))

In [8]:
param = {
    "max_depth": 10,
    "eta": 0.3,
    "objective": "multi:softprob",
    "num_class": len(label_encoder),
    "eval_metric": "mlogloss",
}
num_round = 5
bst = xgb.train(param, dtrain, num_round)
y_pred_arr = bst.predict(dtest)
y_pred = []
for row in y_pred_arr:
    val, idx = min((val, idx) for (idx, val) in enumerate(row))
    y_pred.append(idx)

print("Binary format with word2vec (Glove)")
print("precision: %s" % precision_score(y_pred, y_test, average="micro"))
print("recall: %s" % recall_score(y_pred, y_test, average="micro"))
print("f1 score: %s" % f1_score(y_pred, y_test, average="micro"))
print("accuracy: %s" % accuracy_score(y_pred, y_test))
print()

Binary format with word2vec (Glove)
precision: 0.001352265043948614
recall: 0.001352265043948614
f1 score: 0.001352265043948614
accuracy: 0.001352265043948614



In [None]:
p_score = precision_score(y_true=y_test, y_pred=y_pred, average='micro')
print(p_score)
print(y_test[-10:])
print(y_pred[-10:])

0.000901510029299076
[1, 8, 1, 0, 6, 1, 1, 1, 0, 2]
[24, 28, 24, 28, 24, 24, 28, 24, 24, 24]
