In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import (
    roc_auc_score,
    mean_squared_error,
    accuracy_score,
    mean_absolute_error,
    precision_score,
)
from sklearn.model_selection import StratifiedKFold
from bayes_opt import BayesianOptimization
from nltk.corpus import stopwords
import spacy
from sklearn.decomposition import TruncatedSVD
import pickle
import numpy as np
import gc


In [17]:
sw = set(stopwords.words("english"))
spacy_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [18]:
toxic_labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

train = pd.read_csv("train_evenly_distributed.csv")
evaluation = pd.read_csv("test.csv")
evaluation_labels = pd.read_csv("test_labels.csv")

model = XGBClassifier(random_state=69, seed=2, colsample_bytree=0.6, subsample=0.7)

param_grid = {
    "clf__n_estimators": [50, 100, 300],
    "clf__colsample_bytree": [0.6, 0.8, 1],
    "clf__subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
}


# Treating the combination of classes as a binary number

e.g. 100001 --> 33  
int("100001", 2)  

In [19]:
"""
Manual
remove non-letters --> count vectorizer --> TFIDF --> model
"""
def clean_text(text):
    text = text.lower()
    ## remove \n \t and non-alphanumeric
    text = re.sub("(\\t|\\n)", " ", text)
    text = re.sub("[^a-zA-Z]", " ", text)
    text = text.strip()
    ## leave 1 space between each token
    text = " ".join([x for x in text.split(" ") if len(x.strip()) > 0])
    return text.strip()


manual_train = train.copy()
manual_train["comment_text"] = manual_train["comment_text"].apply(
    lambda x: clean_text(x)
)
manual_train["BINARY"] = train[toxic_labels].apply(
    lambda x: int("".join(x.values.astype(str)), 2), axis=1
)
target_df = manual_train["BINARY"]

In [20]:
tfidf_vec = TfidfVectorizer(analyzer="word", stop_words="english")
vectors = tfidf_vec.fit_transform(manual_train["comment_text"])
manual_train = pd.DataFrame(data=vectors.toarray(), columns=tfidf_vec.get_feature_names())
manual_train["BINARY"] = target_df

In [21]:
## drop classes with less than 10 occurences
for k, v in dict(manual_train.BINARY.value_counts()).items():
    if v < 10:
        manual_train = manual_train[manual_train.BINARY != k]

_train, _test = train_test_split(
    manual_train,
    test_size=0.2,
    stratify=manual_train["BINARY"],
)

In [22]:
X_train = _train.drop(["BINARY"], axis=1)
y_train = _train["BINARY"]
X_test = _test.drop(["BINARY"], axis=1)
y_test = _test["BINARY"]

In [23]:
X_train.iloc[:, 300:400]

Unnamed: 0,accuses,accusin,accusing,accussed,accustation,accustomed,ace,acedemics,aces,aceshowbiz,...,activating,active,actively,activision,activisionvalue,activism,activist,activists,activistsinlasvegas,activite
2956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
19534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
20236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14083,0.0,0.0
7820,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
4896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
21983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
16276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
2605,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0


In [25]:
np.array(sorted(y_train.unique()))
len(y_train.unique())


29

In [28]:
label_encoder = {k:i for i, k in enumerate(dict(_train.BINARY.value_counts()).keys())}
label_encoder_reversed = {i:k for i, k in enumerate(dict(_train.BINARY.value_counts()).keys())}

In [29]:
y_test = [label_encoder[x] for x in y_test]
y_train = [label_encoder[x] for x in y_train]

In [35]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

In [42]:
param = {
    'max_depth': 10,
    'eta': 0.3,
    'objective': 'multi:softprob',
    'num_class': len(label_encoder)}
num_round = 5
bst = xgb.train(param, dtrain, num_round)
y_pred = bst.predict(dtest)



ValueError: Classification metrics can't handle a mix of multiclass and continuous-multioutput targets

In [45]:
y_predd = []
for row in y_pred:
    val, idx = min((val, idx) for (idx, val) in enumerate(row))
    y_predd.append(idx)
p_score = precision_score(y_true=y_test, y_pred=y_predd, average="micro")
print(p_score)

0.000901510029299076


In [11]:
## takes forever on my com lol

## hyper param tuning
# def xgb_evaluation(
#     max_depth,
#     min_child_weight,
#     gamma,
#     subsample,
#     colsample_bytree,
#     colsample_bylevel,
#     colsample_bynode,
#     reg_alpha,
#     reg_lambda,
# ):

#     params = {
#         "learning_rate": 0.01,
#         "n_estimators": 10000,
#         "max_depth": int(round(max_depth)),
#         "min_child_weight": int(round(min_child_weight)),
#         "subsample": subsample,
#         "gamma": gamma,
#         "colsample_bytree": colsample_bytree,
#         "colsample_bylevel": colsample_bylevel,
#         "colsample_bynode": colsample_bynode,
#         "reg_alpha": reg_alpha,
#         "reg_lambda": reg_lambda,
#         "random_state": 51412,
#     }

#     xgbc = XGBClassifier(**params)
#     xgbc.fit(X_train, y_train)
#     preds = xgbc.predict_proba(X_test, ntree_limit=xgbc.get_booster().best_ntree_limit)[
#         :, 1
#     ]
#     gc.collect()
#     return mean_absolute_error(y_test, preds)


# bopt_xgb = BayesianOptimization(
#     xgb_evaluation,
#     {
#         "max_depth": (5, 15),
#         "min_child_weight": (5, 80),
#         "gamma": (0.2, 1),
#         "subsample": (0.5, 1),
#         "colsample_bytree": (0.5, 1),
#         "colsample_bylevel": (0.3, 1),
#         "colsample_bynode": (0.3, 1),
#         "reg_alpha": (0.001, 0.3),
#         "reg_lambda": (0.001, 0.3),
#     },
#     random_state=55,
# )
# bopt_xgb.maximize(n_iter=6, init_points=4)


  return f(**kwargs)


|   iter    |  target   | colsam... | colsam... | colsam... |   gamma   | max_depth | min_ch... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
