In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import roc_auc_score, mean_squared_error, accuracy_score, mean_absolute_error
from bayes_opt import BayesianOptimization
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import spacy
from sklearn.decomposition import TruncatedSVD

In [22]:
# English stop words from NLTK, held in a set for fast membership checks.
sw = {word for word in stopwords.words("english")}

# Small English spaCy pipeline; the parser and NER components are switched off
# via `disable`, so they are not run on each document.
spacy_nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)

In [129]:
# Binary target columns; one classifier is trained per label.
toxic_labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Raw inputs, read from the notebook's working directory.
train = pd.read_csv("train.csv")
evaluation = pd.read_csv("test.csv")
evaluation_labels = pd.read_csv("test_labels.csv")

# Only `random_state` is passed. The original call also passed `seed=2`, which
# is the legacy spelling of the same setting, so the two values conflicted and
# the effective seed depended on the installed xgboost version.
# NOTE(review): if earlier results were produced on an xgboost release where
# `seed` took precedence, use random_state=2 to reproduce them.
model = XGBClassifier(random_state=69, colsample_bytree=0.6, subsample=0.7)

# Search space keyed with the "clf__" prefix, i.e. it targets a pipeline step
# named "clf" when used in a grid search.
param_grid = {
    "clf__n_estimators": [50, 100, 300],
    "clf__colsample_bytree": [0.6, 0.8, 1],
    "clf__subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
}


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB
None


In [74]:
"""
The cleaning step below was run once and its output was saved to "clean.csv".
It takes about 15 minutes on my machine, so it is left commented out.
"""

## clean, lemmatise and drop stop words
# def clean(text):
#     text = text.lower()
#     ## remove \n \t and non-alphanumeric
#     text = re.sub("(\\t|\\n)", " ", text)
#     text = re.sub("[^a-zA-Z]", " ", text)
#     ## remove empty tokens
#     text = " ".join([x.strip() for x in text.split(" ") if len(x.strip()) > 0])
#     ## lemmatise
#     doc = spacy_nlp(text)
#     text = " ".join([x.lemma_ for x in doc if not x.is_stop])
#     return text.strip()


# train["comment_text"] = train["comment_text"].apply(lambda x: clean(x))

# with open("clean.csv", "w+") as f:
#     train.to_csv(f)


In [130]:
# Load the cached, pre-cleaned comments, drop every row that has a missing
# value, and strip commas from the comment text, all in one chain so the
# cell gives the same result each time it is re-run.
clean = (
    pd.read_csv("clean.csv")
    .dropna()
    .assign(comment_text=lambda frame: frame["comment_text"].str.replace(",", ""))
)

In [134]:
# Baseline text classifier: token counts -> TF-IDF weighting -> logistic
# regression (liblinear solver).
logreg = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", LogisticRegression(solver='liblinear')),
    ]
)

# Split once, carrying every label column along. With a fixed random_state and
# no stratification the row partition does not depend on which label is passed,
# so repeating the split inside the loop only redid identical work.
X_train, X_test, y_train_all, y_test_all = train_test_split(
    clean["comment_text"], clean[toxic_labels], test_size=0.20, random_state=42
)

# Fit and score the same pipeline once per label; prints one accuracy value per
# label, in the order of `toxic_labels`.
# NOTE(review): accuracy can look flattering if positive labels are rare;
# roc_auc_score is already imported and may be more informative. Confirm.
for tox in toxic_labels:
    # Keep the per-label names so later cells that read y_train / y_test still
    # see the last label's targets, as before.
    y_train, y_test = y_train_all[tox], y_test_all[tox]
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    print(f"{accuracy_score(y_test, y_pred)}")


0.9564808427917476
0.9904684266633222
0.979525929641939
0.9973976296482097
0.9707782027967643
0.9924750736815702
