# Setup

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

# Load data

In [2]:
# Read the pre-cleaned tweets from a CSV in the working directory and show
# the resulting frame as the cell output.
tweets_csv_path = "cleaned_tweets.csv"
cleaned_tweets_df = pd.read_csv(tweets_csv_path)
cleaned_tweets_df

Unnamed: 0,clean_text,is_cyberbullying
0,logic nigger smell like shit dumb nigger name ...,1
1,female always used replacement bitch whenever ...,1
2,frost still learning play bgs seems work well,0
3,end first #mkr tony abbott pm,0
4,ouch,1
...,...,...
15932,least understand woman conventionally feminine...,0
15933,lot ball liking nigger tygers dumb as white fr...,1
15934,let get one thing straight free thinking mean ...,1
15935,hope get two working bbc back,0


# Data preprocessing

In [3]:
# Separate the text feature from the binary target.
text_column = cleaned_tweets_df["clean_text"]
label_column = cleaned_tweets_df["is_cyberbullying"]

# Cast the texts to a NumPy Unicode array ("U") so the vectorizer gets strings.
# NOTE(review): missing texts, if any, presumably become the literal string
# "nan" with this cast — confirm whether the CSV contains empty rows.
X = text_column.values.astype("U")
y = label_column.values

## train-test split

In [4]:
# Hold out 30% of the samples for the final evaluation; the fixed seed keeps
# the split reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Sanity check on the sizes of the two partitions.
print(X_train.shape, X_test.shape)

(11155,) (4782,)


## Build transformation pipelines

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Text -> token counts -> TF-IDF weights, applied in that order.
tfidf_steps = [
    ("count_vect", CountVectorizer()),
    ("tfidf_trans", TfidfTransformer()),
]
transform_pipeline = Pipeline(steps=tfidf_steps)

In [6]:
# Fit the vectorizer/TF-IDF steps on the training texts only, then reuse the
# fitted pipeline to transform the held-out texts.
X_train_tfidf = transform_pipeline.fit_transform(X_train)
X_test_tfidf = transform_pipeline.transform(X_test)

# Both matrices should have the same number of columns.
print(*(matrix.shape for matrix in (X_train_tfidf, X_test_tfidf)))

(11155, 16616) (4782, 16616)


# Build model

In [7]:
import time 
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [8]:
# Candidate classifiers. Estimators that accept a seed get random_state=42.
xgb_clf = xgb.XGBClassifier(random_state=42)
log_reg = LogisticRegression(solver="lbfgs", random_state=42)
# NOTE(review): lin_svc is instantiated but is not part of the comparison
# below — confirm whether LinearSVC was meant to be evaluated as well.
lin_svc = LinearSVC()
# Variable name kept as-is; the estimator is multinomial naive Bayes.
gnb = MultinomialNB()
dtc = DecisionTreeClassifier(random_state=42)
rfc = RandomForestClassifier(random_state=42)

# (label, estimator) pairs to compare. Stored as a list rather than a bare
# zip() iterator: a zip object is exhausted after one pass, so re-running the
# comparison cell would silently iterate over nothing.
classifiers = [
    ("XGBClassifier", xgb_clf),
    ("LogisticRegression", log_reg),
    ("MultinomialNB", gnb),
    ("DecisionTreeClassifier", dtc),
    ("RandomForestClassifier", rfc),
]

In [9]:
# Compare every candidate with cross-validated accuracy (cv=5) on the TF-IDF
# training matrix, timing each evaluation.
end = 0  # running total of the per-model evaluation times, in seconds

for label, model in classifiers:
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    start = time.perf_counter()
    scores = cross_val_score(model, X_train_tfidf, y_train, scoring="accuracy", cv=5)
    elapsed_time = time.perf_counter() - start
    # Reported as mean accuracy (+/- standard deviation across the folds).
    print("[%s] accuracy: %0.3f (+/- %0.3f) - %f sec" % (label, scores.mean(), scores.std(), elapsed_time))
    end += elapsed_time

print("Elapsed time : %f sec" % end)

[XGBClassifier] accuracy: 0.846 (+/- 0.009) - 10.811857 sec
[LogisticRegression] accuracy: 0.829 (+/- 0.007) - 0.993662 sec
[MultinomialNB] accuracy: 0.749 (+/- 0.008) - 0.046873 sec
[DecisionTreeClassifier] accuracy: 0.807 (+/- 0.013) - 45.830092 sec
[RandomForestClassifier] accuracy: 0.831 (+/- 0.011) - 721.819416 sec
Elasped time : 779.501899 sec


# Fine-Tune Model

## Grid Search

In [10]:
# Hyper-parameter grid for the XGBoost classifier:
# 2 * 3 * 3 * 3 * 2 * 2 = 216 parameter combinations.
parameters = dict(
    learning_rate=[0.1, 0.3],
    max_depth=[3, 6, 9],
    min_child_weight=[1, 2, 4],
    gamma=[0, 2, 4],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Exhaustive search scored by accuracy with cv=5; n_jobs=-1 asks for all
# available cores.
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=parameters,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_tfidf, y_train)

In [11]:
# Pull the winning configuration and its mean cross-validated score out of
# the fitted search object.
best_params = grid_search.best_params_
best_score = grid_search.best_score_

for caption, value in (("best_score", best_score), ("best_params", best_params)):
    print(caption, value)

best_score 0.8527117884356791
best_params {'colsample_bytree': 1.0, 'gamma': 2, 'learning_rate': 0.1, 'max_depth': 6, 'min_child_weight': 2, 'subsample': 1.0}


In [12]:
# Tabulate all grid-search results and print the five best-ranked candidates.
df = pd.DataFrame(grid_search.cv_results_)
summary_columns = ["params", "mean_test_score", "rank_test_score"]
top_candidates = df[summary_columns].sort_values("rank_test_score").head()
print(top_candidates.to_string())

                                                                                                                   params  mean_test_score  rank_test_score
153  {'colsample_bytree': 1.0, 'gamma': 2, 'learning_rate': 0.1, 'max_depth': 6, 'min_child_weight': 2, 'subsample': 1.0}         0.852712                1
159  {'colsample_bytree': 1.0, 'gamma': 2, 'learning_rate': 0.1, 'max_depth': 9, 'min_child_weight': 2, 'subsample': 1.0}         0.852174                2
117  {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'min_child_weight': 2, 'subsample': 1.0}         0.851995                3
189  {'colsample_bytree': 1.0, 'gamma': 4, 'learning_rate': 0.1, 'max_depth': 6, 'min_child_weight': 2, 'subsample': 1.0}         0.851636                4
85   {'colsample_bytree': 0.8, 'gamma': 4, 'learning_rate': 0.1, 'max_depth': 9, 'min_child_weight': 1, 'subsample': 1.0}         0.851457                5


In [13]:
# Evaluate the best estimator found by the grid search on the held-out
# TF-IDF test matrix.
final_model = grid_search.best_estimator_
y_pred = final_model.predict(X_test_tfidf)

# Confusion matrix first, then the per-class precision/recall/F1 report.
cm = confusion_matrix(y_test, y_pred)
print(cm, classification_report(y_test, y_pred), sep="\n")

[[2284   65]
 [ 640 1793]]
              precision    recall  f1-score   support

           0       0.78      0.97      0.87      2349
           1       0.97      0.74      0.84      2433

    accuracy                           0.85      4782
   macro avg       0.87      0.85      0.85      4782
weighted avg       0.87      0.85      0.85      4782



# Build full pipeline with the estimator

In [14]:
# Final model: an XGBoost classifier rebuilt from hard-coded hyper-parameters
# (the values match the best_params printed by the grid search above), so
# this section does not need the grid search to be re-run.
params = {
    'learning_rate': 0.1,
    'max_depth': 6,
    'gamma': 2,
    'min_child_weight': 2,
    'subsample': 1.0,
    'colsample_bytree': 1.0
}
# random_state=42 matches the seed of the estimator that was grid-searched
# (xgb_clf), keeping the rebuilt model configured consistently with it.
final_model = xgb.XGBClassifier(random_state=42, **params)

In [15]:
# Fresh (unfitted) text-transformation pipeline for the end-to-end model:
# token counts followed by TF-IDF weighting.
count_vect_step = ("count_vect", CountVectorizer())
tfidf_trans_step = ("tfidf_trans", TfidfTransformer())
transform_pipeline = Pipeline([count_vect_step, tfidf_trans_step])

In [16]:
# End-to-end pipeline: raw text goes in, the text transformation runs first
# and its output is fed to the classifier.
end_to_end_steps = [
    ("preprocessing", transform_pipeline),
    ("final_model", final_model),
]
full_pipeline_with_estimator = Pipeline(steps=end_to_end_steps)

# Train on the raw training texts (not the precomputed TF-IDF matrix).
full_pipeline_with_estimator.fit(X_train, y_train)

In [17]:
# Score the end-to-end pipeline on the raw held-out texts.
y_pred = full_pipeline_with_estimator.predict(X_test)

# Confusion matrix first, then the per-class precision/recall/F1 report.
cm = confusion_matrix(y_test, y_pred)
print(cm, classification_report(y_test, y_pred), sep="\n")

[[2284   65]
 [ 640 1793]]
              precision    recall  f1-score   support

           0       0.78      0.97      0.87      2349
           1       0.97      0.74      0.84      2433

    accuracy                           0.85      4782
   macro avg       0.87      0.85      0.85      4782
weighted avg       0.87      0.85      0.85      4782



# Save model

In [18]:
import pickle

# Persist the fitted end-to-end pipeline to disk.
filename = "final_model.pkl"
# The context manager guarantees the file is flushed and closed even if
# pickling fails; the bare open(...) call leaked the file handle.
with open(filename, "wb") as model_file:
    pickle.dump(full_pipeline_with_estimator, model_file)

In [21]:
#load model
# loaded_model = pickle.load(open(filename, "rb"))