In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import (
    TfidfVectorizer,
    CountVectorizer,
    TfidfTransformer,
)
from sklearn.metrics import (
    roc_auc_score,
    mean_squared_error,
    accuracy_score,
    mean_absolute_error,
)
from sklearn.model_selection import StratifiedKFold
from bayes_opt import BayesianOptimization
from nltk.corpus import stopwords
import spacy
from sklearn.decomposition import TruncatedSVD
import pickle
import numpy as np
import pickle
from gensim.models import Word2Vec
import gensim.downloader as gensim_api


In [12]:
sw = set(stopwords.words("english"))
spacy_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [13]:
toxic_labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

train = pd.read_csv("train_evenly_distributed.csv")
evaluation = pd.read_csv("test.csv")
evaluation_labels = pd.read_csv("test_labels.csv")

model = XGBClassifier(random_state=69, seed=2, colsample_bytree=0.6, subsample=0.7)

param_grid = {
    "clf__n_estimators": [50, 100, 300],
    "clf__colsample_bytree": [0.6, 0.8, 1],
    "clf__subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
}


In [14]:
"""
Cleaned and ran it once, stored in "clean.csv" 
Takes about 15 min on my com
"""

## clean, stem and generate word set
# def clean(text):
#     text = text.lower()
#     ## remove \n \t and non-alphanumeric
#     text = re.sub("(\\t|\\n)", " ", text)
#     text = re.sub("[^a-zA-Z]", " ", text)
#     ## remove empty tokens
#     text = " ".join([x.strip() for x in text.split(" ") if len(x.strip()) > 0])
#     ## lemmatise
#     doc = spacy_nlp(text)
#     text = " ".join([x.lemma_ for x in doc if not x.is_stop])
#     return text.strip()


# train["comment_text"] = train["comment_text"].apply(lambda x: clean(x))

# with open("clean.csv", "w+") as f:
#     train.to_csv(f)


'\nCleaned and ran it once, stored in "clean.csv" \nTakes about 15 min on my com\n'

# Training a new model for each category 

In [15]:
clean = pd.read_csv("train_evenly_distributed.csv")
clean.dropna(inplace=True)
clean["comment_text"] = clean["comment_text"].str.replace(",", "")


# Boring TFIDF

In [22]:
## without glove and basic tfidf

## old pipeline
# pipe = Pipeline(
#     [
#         ("vect", CountVectorizer()),
#         ("tfidf", TfidfTransformer()),
#         ("classifier", LogisticRegression()),
#     ]
# )

## new pipeline but super slow
pipe = Pipeline([("classifier", LogisticRegression())])

vectoriser = TfidfVectorizer(
    analyzer="word",
    max_features=100000,
    ngram_range=(1, 2),
    lowercase=True,
    stop_words="english",
)
vectors = vectoriser.fit_transform(clean["comment_text"])
manual_train = pd.DataFrame(
    data=vectors.toarray(), columns=vectoriser.get_feature_names()
)
X_train, X_test, y_train, y_test = train_test_split(
    manual_train, clean[tox], test_size=0.20, random_state=69
)

param_grid = [
    {
        "classifier": [LogisticRegression()],
        "classifier__penalty": ["l1", "l2"],
        "classifier__C": np.logspace(-4, 4, 20),
        "classifier__solver": ["liblinear"],
        "classifier__max_iter": [1000, 5000],
    },
]

log_reg_models = {}
for tox in toxic_labels:
    # X_train, X_test, y_train, y_test = train_test_split(
    #     clean["comment_text"], clean[tox], test_size=0.20, random_state=69
    # )
    # model = GridSearchCV(pipe, param_grid=param_grid, cv=3, verbose=False, n_jobs=1)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    print(model.score(X_test, y_test))
    with open(f"./models/{tox}.sav", "wb+") as f:
        pickle.dump(model, f)
    # log_reg_models[tox] = model.best_params_["classifier"]


0.8429696287964005
0.8429696287964005
0.8429696287964005
0.8429696287964005
0.8429696287964005
0.8429696287964005


In [None]:
log_reg_models

{'toxic': LogisticRegression(C=1.623776739188721, max_iter=1000, penalty='l1',
                    solver='liblinear'),
 'severe_toxic': LogisticRegression(C=1.623776739188721, max_iter=1000, penalty='l1',
                    solver='liblinear'),
 'obscene': LogisticRegression(C=1.623776739188721, max_iter=1000, penalty='l1',
                    solver='liblinear'),
 'threat': LogisticRegression(C=1.623776739188721, max_iter=1000, penalty='l1',
                    solver='liblinear'),
 'insult': LogisticRegression(C=1.623776739188721, max_iter=1000, penalty='l1',
                    solver='liblinear'),
 'identity_hate': LogisticRegression(C=1.623776739188721, max_iter=1000, penalty='l1',
                    solver='liblinear')}

# Word2Vec

In [31]:
def clean_text(text):
    text = text.lower()
    ## remove \n \t and non-alphanumeric
    text = re.sub("(\\t|\\n)", " ", text)
    text = re.sub("[^a-zA-Z]", " ", text)
    ## remove empty tokens
    return [x.strip() for x in text.split(" ") if len(x.strip()) > 0]

nlp = gensim_api.load("word2vec-google-news-300")
unigram_corpus = []
for s in clean["comment_text"]:
    unigram_corpus.append(clean_text(s))

unigram_corpus[:10]




[['cocksucker', 'before', 'you', 'piss', 'around', 'on', 'my', 'work'],
 ['hey',
  'what',
  'is',
  'it',
  'talk',
  'what',
  'is',
  'it',
  'an',
  'exclusive',
  'group',
  'of',
  'some',
  'wp',
  'talibans',
  'who',
  'are',
  'good',
  'at',
  'destroying',
  'self',
  'appointed',
  'purist',
  'who',
  'gang',
  'up',
  'any',
  'one',
  'who',
  'asks',
  'them',
  'questions',
  'abt',
  'their',
  'anti',
  'social',
  'and',
  'destructive',
  'non',
  'contribution',
  'at',
  'wp',
  'ask',
  'sityush',
  'to',
  'clean',
  'up',
  'his',
  'behavior',
  'than',
  'issue',
  'me',
  'nonsensical',
 ['bye',
  'don',
  't',
  'look',
  'come',
  'or',
  'think',
  'of',
  'comming',
  'back',
  'tosser'],
 ['you',
  'are',
  'gay',
  'or',
  'antisemmitian',
  'archangel',
  'white',
  'tiger',
  'meow',
  'greetingshhh',
  'uh',
  'there',
  'are',
  'two',
  'ways',
  'why',
  'you',
  'do',
  'erased',
  'my',
  'comment',
  'about',
  'ww',
  'that',
  'holocaust',

In [None]:
unseen = pd.DataFrame.from_dict(
    {"comment_text": ["go and fuck yourself and your mom too"]}
)
for tox in toxic_labels:
    pipe = Pipeline(
        [
            ("vect", CountVectorizer()),
            ("tfidf", TfidfTransformer()),
            ("classifier", log_reg_models[tox]),
        ]
    )
    pipe.fit(clean["comment_text"], clean[tox])
    print(f"{tox}: {pipe.predict(unseen)}")


toxic: [1]
severe_toxic: [0]
obscene: [0]
threat: [0]
insult: [0]
identity_hate: [0]
