In [60]:
#Relevant libraries
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import requests

In [2]:
#importing the data
df = pd.read_csv('../data/train_tweets.csv')

In [3]:
df.duplicated().sum()

0

In [4]:
#splitting into train and test datasets
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='cyberbullying_type'), 
                                                    df.cyberbullying_type, test_size=0.3, 
                                                    random_state=47, stratify = df.cyberbullying_type, shuffle=True)

In [5]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(30387, 1) (13024, 1) (30387,) (13024,)


In [6]:
#Encoding the text data
tv = TfidfVectorizer(max_features=5000, ngram_range = (1,3))


In [7]:
# Feature Extraction
X_train_tfidf = tv.fit_transform(X_train['cleaned_tweets'])  # Creating the vocabulary only from the training set to avoid data leakage from 
X_test_tfidf = tv.transform(X_test['cleaned_tweets'])        # the test set.

In [8]:
X_train_tfidf

<30387x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 392829 stored elements in Compressed Sparse Row format>

In [9]:
X_test_tfidf

<13024x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 165610 stored elements in Compressed Sparse Row format>

In [10]:
#Training a XGBoost classifier

#initialize
xgb_cl = xgb.XGBClassifier()

#fit
xgb_cl.fit(X_train_tfidf, y_train)


#predict
y_preds = xgb_cl.predict(X_test_tfidf)

In [11]:
#Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.69      0.51      0.59      1884
           1       0.92      0.85      0.88      2262
           2       0.96      0.95      0.95      2385
           3       0.98      0.98      0.98      2369
           4       0.99      0.98      0.99      2350
           5       0.58      0.82      0.68      1774

    accuracy                           0.86     13024
   macro avg       0.86      0.85      0.85     13024
weighted avg       0.87      0.86      0.86     13024



In [12]:
#roc_auc score

from sklearn.metrics import roc_auc_score

# Generate class membership probabilities
y_preds_probs = xgb_cl.predict_proba(X_test_tfidf)

roc_auc_score(y_test, y_preds_probs, average="weighted", multi_class="ovr")

0.9806576789537736

In [10]:
#Scaling the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_tfidf_array = X_train_tfidf.toarray()   
X_test_tfidf_array = X_test_tfidf.toarray()     
X_train_tfidf_scaled = scaler.fit_transform(X_train_tfidf_array)  
X_test_tfidf_scaled = scaler.transform(X_test_tfidf_array)

In [None]:
#Training a KNN classifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

param_grid = {'n_neighbors':np.arange(2,6)}
knn = KNeighborsClassifier()
knn_cv= GridSearchCV(knn,param_grid,cv=5)
knn_cv.fit(X_train_tfidf_scaled,y_train)


In [16]:
print("Best Score:" + str(knn_cv.best_score_))
print("Best Parameters: " + str(knn_cv.best_params_))

Best Score:0.3788132750465765
Best Parameters: {'n_neighbors': 2}


In [22]:
#fitting a knn classifiers to the entire training data
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(4)

#fit
knn.fit(X_train_tfidf_scaled, y_train)

#predict
y_preds2 = knn.predict(X_test_tfidf_scaled)

In [23]:
#Classification report for knn
from sklearn.metrics import classification_report
print(classification_report(y_test, y_preds2))


              precision    recall  f1-score   support

           0       0.24      0.57      0.34      1884
           1       0.55      0.44      0.49      2262
           2       0.71      0.27      0.39      2385
           3       0.78      0.41      0.54      2369
           4       0.78      0.53      0.63      2350
           5       0.22      0.37      0.27      1774

    accuracy                           0.43     13024
   macro avg       0.55      0.43      0.45     13024
weighted avg       0.57      0.43      0.46     13024



In [24]:
#roc_auc score for knn

from sklearn.metrics import roc_auc_score

# Generate class membership probabilities
y_preds2_probs = knn.predict_proba(X_test_tfidf_scaled)

roc_auc_score(y_test, y_preds2_probs, average="weighted", multi_class="ovr")

0.726635662368493

In [28]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
#fit
LR.fit(X_train_tfidf, y_train)
#predict
y_preds3 = LR.predict(X_test_tfidf)

print(classification_report(y_test, y_preds3))

# Generate class membership probabilities
y_preds3_probs = LR.predict_proba(X_test_tfidf)
roc_auc_score(y_test, y_preds3_probs, average="weighted", multi_class="ovr")

              precision    recall  f1-score   support

           0       0.66      0.60      0.63      1884
           1       0.92      0.85      0.88      2262
           2       0.95      0.95      0.95      2385
           3       0.95      0.97      0.96      2369
           4       0.97      0.98      0.98      2350
           5       0.64      0.76      0.69      1774

    accuracy                           0.86     13024
   macro avg       0.85      0.85      0.85     13024
weighted avg       0.86      0.86      0.86     13024



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.977826325986286

In [15]:
#Saving the best model

filename = '../models/TweetClassifier.pkl'
pickle.dump(xgb_cl, open(filename, 'wb'))

In [16]:
#Saving the vectorizer

filename = '../models/TFIDFVectorizer.pkl'
pickle.dump(tv, open(filename, 'wb'))

In [62]:
#Testing the REST API for the model

url = 'http://127.0.0.1:5000/'
params ={'query': 'Going to Africa. Hope I don’t get AIDS. Just kidding. I’m white!'}
response = requests.get(url, params)
response.json()

{'prediction': 'other_cyberbullying', 'confidence': '0.375'}

In [61]:

params ={'query': 'Muslims should be punished. We are not doing enough to rid us of those filthy animals.'}
response = requests.get(url, params)
response.json()

{'prediction': 'religion', 'confidence': '0.889'}

In [64]:

params ={'query':'@hotep shut up you nigga!!'}
response = requests.get(url, params)
response.json()

{'prediction': 'ethnicity', 'confidence': '0.849'}