In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import roc_auc_score, mean_squared_error, accuracy_score, mean_absolute_error
from sklearn.model_selection import StratifiedKFold
from bayes_opt import BayesianOptimization
from nltk.corpus import stopwords
import spacy
from sklearn.decomposition import TruncatedSVD
import pickle
import numpy as np

In [3]:
sw = set(stopwords.words("english"))
spacy_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [4]:
toxic_labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

train = pd.read_csv("train_evenly_distributed.csv")
evaluation = pd.read_csv("test.csv")
evaluation_labels = pd.read_csv("test_labels.csv")

model = XGBClassifier(random_state=69, seed=2, colsample_bytree=0.6, subsample=0.7)

param_grid = {
    "clf__n_estimators": [50, 100, 300],
    "clf__colsample_bytree": [0.6, 0.8, 1],
    "clf__subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
}


In [36]:
"""
Manual
remove non-letters --> count vectorizer --> TFIDF --> model
"""


def clean_text(text):
    text = text.lower()
    ## remove \n \t and non-alphanumeric
    text = re.sub("(\\t|\\n)", " ", text)
    text = re.sub("[^a-zA-Z]", " ", text)
    text = text.strip()
    ## leave 1 space between each token
    text = " ".join([x for x in text.split(" ") if len(x.strip()) > 0])
    return text.strip()


manual_train = train.copy()
manual_train["comment_text"] = manual_train["comment_text"].apply(
    lambda x: clean_text(x)
)
manual_train["binary"] = train[toxic_labels].apply(
    lambda x: int("".join(x.values.astype(str)), 2), axis=1
)

tfidf_vec = TfidfVectorizer(analyzer="word", stop_words="english")
vectors = tfidf_vec.fit_transform(manual_train["comment_text"].iloc[:100])
train_df = pd.DataFrame(data=vectors.toarray(), columns=tfidf_vec.get_feature_names())
target_df = manual_train["binary"].iloc[:100]

## manually split cuz of insuffient values
X_train, X_test, y_train, y_test = train_test_split(
    train_df, target_df, test_size=0.20, random_state=69
)


def xgb_evaluation(
    max_depth,
    min_child_weight,
    gamma,
    subsample,
    colsample_bytree,
    colsample_bylevel,
    colsample_bynode,
    reg_alpha,
    reg_lambda,
):

    params = {
        "learning_rate": 0.01,
        "n_estimators": 10000,
        "max_depth": int(round(max_depth)),
        "min_child_weight": int(round(min_child_weight)),
        "subsample": subsample,
        "gamma": gamma,
        "colsample_bytree": colsample_bytree,
        "colsample_bylevel": colsample_bylevel,
        "colsample_bynode": colsample_bynode,
        "reg_alpha": reg_alpha,
        "reg_lambda": reg_lambda,
        "random_state": 51412,
    }

    xgbc = XGBClassifier(**params)
    xgbc.fit(
        X_train,
        y_train,
        eval_set=[(train_df, target_df)],
        eval_metric="auc",
        verbose=False,
        early_stopping_rounds=200,
    )
    preds = xgbc.predict_proba(X_test, ntree_limit=xgbc.get_booster().best_ntree_limit)[
        :, 1
    ]

    return (y_test, preds)


bopt_xgb = BayesianOptimization(
    xgb_evaluation,
    {
        "max_depth": (5, 15),
        "min_child_weight": (5, 80),
        "gamma": (0.2, 1),
        "subsample": (0.5, 1),
        "colsample_bytree": (0.5, 1),
        "colsample_bylevel": (0.3, 1),
        "colsample_bynode": (0.3, 1),
        "reg_alpha": (0.001, 0.3),
        "reg_lambda": (0.001, 0.3),
    },
    random_state=55,
)
bopt_xgb.maximize(n_iter=6, init_points=4)


|   iter    |  target   | colsam... | colsam... | colsam... |   gamma   | max_depth | min_ch... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------




ValueError: y contains previously unseen labels: [59]

In [5]:
"""
Cleaned and ran it once, stored in "clean.csv" 
Takes about 15 min on my com
"""

## clean, stem and generate word set
# def clean(text):
#     text = text.lower()
#     ## remove \n \t and non-alphanumeric
#     text = re.sub("(\\t|\\n)", " ", text)
#     text = re.sub("[^a-zA-Z]", " ", text)
#     ## remove empty tokens
#     text = " ".join([x.strip() for x in text.split(" ") if len(x.strip()) > 0])
#     ## lemmatise
#     doc = spacy_nlp(text)
#     text = " ".join([x.lemma_ for x in doc if not x.is_stop])
#     return text.strip()


# train["comment_text"] = train["comment_text"].apply(lambda x: clean(x))

# with open("clean.csv", "w+") as f:
#     train.to_csv(f)


'\nCleaned and ran it once, stored in "clean.csv" \nTakes about 15 min on my com\n'

# Training a new model for each category 

In [37]:
clean = pd.read_csv("train_evenly_distributed.csv")
clean.dropna(inplace=True)
clean["comment_text"] = clean["comment_text"].str.replace(",", "")
clean = clean.iloc[:1000]

In [38]:
## without glove and basic tfidf
pipe = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("classifier", LogisticRegression()),
    ]
)

param_grid = [
    {
        "classifier": [LogisticRegression()],
        "classifier__penalty": ["l1", "l2"],
        "classifier__C": np.logspace(-4, 4, 20),
        "classifier__solver": ["liblinear"],
        "classifier__max_iter": [1000, 5000],
    },
]

log_reg_models = {}
for tox in toxic_labels:
    X_train, X_test, y_train, y_test = train_test_split(
        clean["comment_text"], clean[tox], test_size=0.20, random_state=69
    )
    model = GridSearchCV(pipe, param_grid=param_grid, cv=3, verbose=False, n_jobs=1)
    model.fit(X_train, y_train)
    print(model.score(X_test, y_test))
    log_reg_models[tox] = model.best_params_['classifier']



0.955




0.915




0.78




0.98




0.7




0.91


In [41]:
log_reg_models

{'toxic': LogisticRegression(C=1.623776739188721, max_iter=1000, penalty='l1',
                    solver='liblinear'),
 'severe_toxic': LogisticRegression(C=1.623776739188721, max_iter=1000, penalty='l1',
                    solver='liblinear'),
 'obscene': LogisticRegression(C=1.623776739188721, max_iter=1000, penalty='l1',
                    solver='liblinear'),
 'threat': LogisticRegression(C=1.623776739188721, max_iter=1000, penalty='l1',
                    solver='liblinear'),
 'insult': LogisticRegression(C=1.623776739188721, max_iter=1000, penalty='l1',
                    solver='liblinear'),
 'identity_hate': LogisticRegression(C=1.623776739188721, max_iter=1000, penalty='l1',
                    solver='liblinear')}

In [47]:
unseen = pd.DataFrame.from_dict(
    {"comment_text": ["go and fuck yourself and your mom too"]}
)
for tox in toxic_labels:
    pipe = Pipeline(
        [
            ("vect", CountVectorizer()),
            ("tfidf", TfidfTransformer()),
            ("classifier", log_reg_models[tox]),
        ]
    )
    pipe.fit(clean["comment_text"], clean[tox])
    print(f"{tox}: {pipe.predict(unseen)}")


toxic: [1]
severe_toxic: [0]
obscene: [0]
threat: [0]
insult: [0]
identity_hate: [0]
