In [1]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score
from collections import defaultdict
import re
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [2]:
with open("glove/glove.6B.100d.txt", "r") as f:
    w2v = {}
    for line in f:
        split_line = line.split()
        word = split_line[0]
        embedding = np.array(split_line[1:], dtype=np.float64)
        w2v[word] = list(embedding)


In [3]:
def clean_text(text):
    text = text.lower()
    ## remove \n \t and non-alphanumeric
    text = re.sub("(\\t|\\n)", " ", text)
    text = re.sub("[^a-zA-Z']", " ", text)
    text = text.strip()
    ## leave 1 space between each token
    text = " ".join([x for x in text.split(" ") if len(x.strip()) > 0])
    return text.strip()


def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))


toxic_labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]


In [4]:
raw = pd.read_csv("train_evenly_distributed.csv")
raw["comment_text"] = raw["comment_text"].apply(lambda x: clean_text(x))
raw["BINARY"] = raw[toxic_labels].apply(
    lambda x: int("".join(x.values.astype(str)), 2), axis=1
)
for k, v in dict(raw.BINARY.value_counts()).items():
    if v < 10:
        raw = raw[raw.BINARY != k]
_train, _test = train_test_split(
    raw,
    test_size=0.2,
    stratify=raw["BINARY"],
)

In [5]:
X_train = []
for x in _train["comment_text"]:
    tmp = [0 for i in range(100)]
    for word in x.split(" "):
        if w2v.get(word, None) != None:
            tmp = np.add(w2v.get(word), tmp)
    X_train.append(tmp)

X_test = []
for x in _test["comment_text"]:
    tmp = [0 for i in range(100)]
    for word in x.split(" "):
        if w2v.get(word, None) != None:
            tmp = np.add(w2v.get(word), tmp)
    X_test.append(tmp)

In [8]:
X_train[:1]

[array([-7.65773251e+00,  2.84830300e+01,  3.57137089e+01, -2.38270171e+01,
        -6.17253180e+00,  1.73495545e+01, -1.12215974e+01,  5.00960600e+00,
        -6.41634800e+00, -5.02261240e+00,  5.15635440e+00, -1.63496700e-01,
         1.98279730e+01, -1.94891711e+00,  1.61480159e+01, -1.82805221e+01,
         8.33629100e+00,  8.24901000e+00, -3.09463427e+01,  2.57246820e+01,
         2.56663600e+01,  9.94889900e-01,  1.21663380e+01, -8.94080710e+00,
         1.40689749e+01, -2.21818106e+01,  8.34299880e-01, -4.23119670e+01,
         3.15361300e-01, -1.32622510e+01,  4.65184400e+00,  3.54178755e+01,
        -2.72349030e+00,  1.85863534e+00,  1.04003858e+01,  2.13956508e+01,
        -1.05955975e-01,  2.15246613e+01, -6.58269850e+00, -6.27271090e+00,
        -4.08633270e+01, -1.63612684e+01,  1.12660258e+01, -2.16741892e+01,
        -8.49140320e+00, -9.80283830e+00,  2.74055650e+01, -2.54774790e+01,
        -1.40261678e+01, -5.95627160e+01,  1.36754610e+01, -9.74088329e+00,
         7.6

In [9]:
label_encoder = {k:i for i, k in enumerate(dict(_train.BINARY.value_counts()).keys())}
label_encoder_reversed = {i:k for i, k in enumerate(dict(_train.BINARY.value_counts()).keys())}
y_test = [label_encoder[x] for x in _test["BINARY"]]
y_train = [label_encoder[x] for x in _train["BINARY"]]

In [None]:
dtrain = xgb.DMatrix(pd.DataFrame.from_records(X_train), label=y_train)
dtest = xgb.DMatrix(pd.DataFrame.from_records(X_test))

In [None]:
param = {
    "max_depth": 10,
    "eta": 0.3,
    "objective": "multi:softprob",
    "num_class": len(label_encoder),
    "eval_metric": "mlogloss",
}
num_round = 5
bst = xgb.train(param, dtrain, num_round)
y_pred_arr = bst.predict(dtest)
y_pred = []
for row in y_pred_arr:
    val, idx = min((val, idx) for (idx, val) in enumerate(row))
    y_pred.append(idx)

p_score = precision_score(y_true=y_test, y_pred=y_pred)

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [None]:
p_score = precision_score(y_true=y_test, y_pred=y_pred, average='micro')
print(p_score)
print(y_test[-10:])
print(y_pred[-10:])

0.000901510029299076
[1, 8, 1, 0, 6, 1, 1, 1, 0, 2]
[24, 28, 24, 28, 24, 24, 28, 24, 24, 24]
