In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import roc_auc_score, mean_squared_error, accuracy_score, mean_absolute_error
from bayes_opt import BayesianOptimization
from nltk.corpus import stopwords
import spacy
from sklearn.decomposition import TruncatedSVD
import pickle
import numpy as np

In [3]:
# spaCy English pipeline, loaded with the "parser" and "ner" components disabled.
spacy_nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# NLTK English stop-word list, held as a set.
sw = set(stopwords.words("english"))

In [4]:
# Names of the label columns that are modelled one at a time further down.
toxic_labels = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Raw CSV inputs, read from the working directory.
train, evaluation, evaluation_labels = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "test_labels.csv")
)

# XGBoost classifier with column and row subsampling.
# NOTE(review): random_state and seed are both passed with different values —
# confirm which one is meant to control the RNG.
model = XGBClassifier(
    random_state=69,
    seed=2,
    colsample_bytree=0.6,
    subsample=0.7,
)

# Search space keyed for a pipeline step called "clf".
# NOTE(review): no pipeline with a "clf" step is visible in this notebook —
# confirm this grid is still used before relying on it.
param_grid = dict(
    clf__n_estimators=[50, 100, 300],
    clf__colsample_bytree=[0.6, 0.8, 1],
    clf__subsample=[0.5, 0.6, 0.7, 0.8, 0.9, 1],
)


In [5]:
"""
Cleaned and ran it once, stored in "clean.csv" 
Takes about 15 min on my com
"""

## clean, stem and generate word set
# NOTE(review): everything below is disabled. It records how "clean.csv" was
# produced from train["comment_text"]; it is not executed by this cell.
# Stop words are dropped via spaCy's token.is_stop flag, not the NLTK set `sw`.
# The helper is named `clean`, the same name the next cell binds to the
# DataFrame read from "clean.csv" — re-enabling it as-is would be shadowed.
# def clean(text):
#     text = text.lower()
#     ## remove \n \t and non-alphanumeric
#     text = re.sub("(\\t|\\n)", " ", text)
#     text = re.sub("[^a-zA-Z]", " ", text)
#     ## remove empty tokens
#     text = " ".join([x.strip() for x in text.split(" ") if len(x.strip()) > 0])
#     ## lemmatise
#     doc = spacy_nlp(text)
#     text = " ".join([x.lemma_ for x in doc if not x.is_stop])
#     return text.strip()


# train["comment_text"] = train["comment_text"].apply(lambda x: clean(x))

# to_csv is called without index=False, so the row index is written as an
# extra leading column of "clean.csv".
# with open("clean.csv", "w+") as f:
#     train.to_csv(f)


'\nCleaned and ran it once, stored in "clean.csv" \nTakes about 15 min on my com\n'

In [6]:
# Row cap for quick experiments; set to None to use every cleaned comment.
N_SAMPLES = 1000

# Load the pre-cleaned comments in one idempotent chain, so re-running the cell
# always gives the same frame:
#   1. drop rows with any missing value,
#   2. strip literal commas from the text (regex=False pins the literal match;
#      the default for `regex` differs between pandas versions),
#   3. keep only the first N_SAMPLES rows.
clean = (
    pd.read_csv("clean.csv")
    .dropna()
    .assign(
        comment_text=lambda df: df["comment_text"].str.replace(",", "", regex=False)
    )
    .iloc[:N_SAMPLES]
)

In [24]:
## without glove and basic tfidf
pipe = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("classifier", LogisticRegression()),
    ]
)

param_grid = [
    {
        "classifier": [LogisticRegression()],
        "classifier__penalty": ["l1", "l2"],
        "classifier__C": np.logspace(-4, 4, 20),
        "classifier__solver": ["liblinear"],
        "classifier__max_iter": [500, 1000],
    },
]

log_reg_models = {}
for tox in toxic_labels:
    X_train, X_test, y_train, y_test = train_test_split(
        clean["comment_text"], clean[tox], test_size=0.20, random_state=69
    )
    model = GridSearchCV(pipe, param_grid=param_grid, cv=3, verbose=False, n_jobs=1)
    model.fit(X_train, y_train)
    print(model.score(X_test, y_test))
    log_reg_models[tox] = model.best_params_['classifier']



0.9608076754248448




0.9898413494701198
0.9798394682385402




0.9966137831567066
0.9708722643757447




0.9915344578917665


In [25]:
log_reg_models

{'toxic': LogisticRegression(C=4.281332398719396, solver='liblinear'),
 'severe_toxic': LogisticRegression(C=4.281332398719396, solver='liblinear'),
 'obscene': LogisticRegression(C=4.281332398719396, solver='liblinear'),
 'threat': LogisticRegression(C=4.281332398719396, solver='liblinear'),
 'insult': LogisticRegression(C=4.281332398719396, solver='liblinear'),
 'identity_hate': LogisticRegression(C=4.281332398719396, solver='liblinear')}

In [26]:
# One hand-written comment used to sanity-check the tuned models.
# NOTE(review): the training text comes from "clean.csv" (pre-processed); this
# raw sentence is not passed through the same cleaning — confirm that is intended.
unseen = pd.DataFrame.from_dict(
    {"comment_text": ["you are a piece of shit go kill yourself"]}
)

for tox in toxic_labels:
    # Same 80/20 split as used during the grid search.
    X_train, X_test, y_train, y_test = train_test_split(
        clean["comment_text"], clean[tox], test_size=0.20, random_state=69
    )
    # Rebuild the pipeline around this label's stored classifier and refit it.
    pipe = Pipeline(
        [
            ("vect", CountVectorizer()),
            ("tfidf", TfidfTransformer()),
            ("classifier", log_reg_models[tox]),
        ]
    )
    pipe.fit(X_train, y_train)
    # Pass the text column, not the whole DataFrame: iterating a DataFrame
    # yields its column labels, so the vectoriser would otherwise score the
    # literal string "comment_text" instead of the comment itself.
    prediction = pipe.predict(unseen["comment_text"])
    print(f"{tox}: {prediction}")


toxic: [0]
severe_toxic: [0]
obscene: [0]
threat: [0]
insult: [0]
identity_hate: [0]


In [9]:
# Single split whose targets keep all label columns together.
X_train, X_test, y_train, y_test = train_test_split(
    clean["comment_text"],
    clean[toxic_labels],
    test_size=0.20,
    random_state=69,
)

## build the tokeniser from the training comments only
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

## encode both splits as lists of integer word ids
X_train_vectors, X_test_vectors = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

## longest encoded comment in either split, plus 10 spare positions in case
## future vectors are longer
max_length = max(map(len, X_train_vectors + X_test_vectors), default=0) + 10

## pad at the end with zeros so every row has exactly max_length entries
X_train_vectors = pad_sequences(X_train_vectors, maxlen=max_length, padding="post")
X_test_vectors = pad_sequences(X_test_vectors, maxlen=max_length, padding="post")
print(X_train_vectors[:10])
# NOTE(review): the disabled snippet below predicts on X_test (raw text) rather
# than X_test_vectors, and scores against the full multi-column y_test — fix
# both before re-enabling.
# for tox in toxic_labels:
#     bst = XGBClassifier()
#     bst.fit(X_train_vectors, y_train[tox])
#     y_pred = bst.predict(X_test)
#     print(f"{accuracy_score(y_test, y_pred)}")

[[ 445  630  643 ...    0    0    0]
 [4026  219    6 ...    0    0    0]
 [  61 1895   16 ...    0    0    0]
 ...
 [1903  820 3799 ...    0    0    0]
 [1734  719  719 ...    0    0    0]
 [2316  496  165 ...    0    0    0]]
