In [1]:
#Relevant libraries
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


In [2]:
# Load the dataset into a DataFrame; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='../data/train_tweets.csv')

In [3]:
# Stratified 70/30 hold-out split with a fixed seed for reproducibility.
# Every column except the label is used as a feature.
#
# NOTE(review): both models below are recorded as scoring ~1.00 on every
# metric, and the label printouts at the end of the notebook appear to show
# labels grouped by row index. Confirm that no feature column encodes the
# label or the original row order (e.g. a saved index column) before trusting
# those scores.
features = df.drop(columns='cyberbullying_type')
target = df['cyberbullying_type']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=47,
    stratify=target,
)

In [8]:
# XGBoost baseline: construct the classifier with library defaults and fit it
# on the training split in one step (fit returns the fitted estimator).
xgb_cl = xgb.XGBClassifier().fit(X_train, y_train)

# Hard class predictions for the held-out split.
y_preds = xgb_cl.predict(X_test)

In [9]:
#Classification report
# Per-class precision / recall / F1 of the XGBoost predictions on the
# held-out split. print() is used because the report is a pre-formatted,
# multi-line string.
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1925
           1       1.00      1.00      1.00      2328
           2       1.00      1.00      1.00      2397
           3       1.00      1.00      1.00      2398
           4       1.00      1.00      1.00      2386
           5       1.00      1.00      1.00      1862

    accuracy                           1.00     13296
   macro avg       1.00      1.00      1.00     13296
weighted avg       1.00      1.00      1.00     13296



In [11]:
#roc_auc score
# Class-membership probabilities from the fitted XGBoost model; multiclass
# AUC is computed from probability scores rather than hard labels.
y_preds_probs = xgb_cl.predict_proba(X_test)

# One-vs-rest AUC, averaged with the "weighted" option. Left as the last
# expression so the notebook displays the score.
roc_auc_score(y_test, y_preds_probs, average="weighted", multi_class="ovr")

0.9999906654904139

In [4]:
#Training a KNN classifier
# Tune the number of neighbours with an exhaustive grid search using 5-fold
# cross-validation on the training split only; the held-out split is never
# seen here.
param_grid = {'n_neighbors': np.arange(10, 50)}  # candidates 10..49 inclusive
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv=5)

# Left as the last expression so the notebook displays the fitted search.
knn_cv.fit(X_train, y_train)


GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
       27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
       44, 45, 46, 47, 48, 49])})

In [5]:
# Report the winning grid-search configuration and its cross-validated score.
# `!s` forces str() conversion, matching plain string concatenation.
print(f"Best Score:{knn_cv.best_score_!s}")
print(f"Best Parameters: {knn_cv.best_params_!s}")

Best Score:0.9998710717163577
Best Parameters: {'n_neighbors': 15}


The grid search selected `n_neighbors = 15` as the best value (mean 5-fold cross-validated score ≈ 0.9999).

In [13]:
#fitting a knn classifier to the entire training data
# The neighbour count is taken from the grid search instead of being typed in
# by hand, so this cell cannot drift out of sync with the search result
# (the recorded run selected n_neighbors=15).
knn = KNeighborsClassifier(**knn_cv.best_params_)

#fit
knn.fit(X_train, y_train)

#predict
# Hard class predictions for the held-out split.
y_preds2 = knn.predict(X_test)

In [14]:
#Classification report for knn
# Per-class precision / recall / F1 of the KNN predictions on the held-out
# split. classification_report is already in scope, so it is not re-imported.
print(classification_report(y_test, y_preds2))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1925
           1       1.00      1.00      1.00      2328
           2       1.00      1.00      1.00      2397
           3       1.00      1.00      1.00      2398
           4       1.00      1.00      1.00      2386
           5       1.00      1.00      1.00      1862

    accuracy                           1.00     13296
   macro avg       1.00      1.00      1.00     13296
weighted avg       1.00      1.00      1.00     13296



In [15]:
#roc_auc score for knn
# Class-membership probabilities from the fitted KNN model. roc_auc_score is
# already in scope, so it is not re-imported.
y_preds2_probs = knn.predict_proba(X_test)

# One-vs-rest AUC with the same averaging as the XGBoost evaluation, so the
# two scores are directly comparable. Left as the last expression so the
# notebook displays the score.
roc_auc_score(y_test, y_preds2_probs, average="weighted", multi_class="ovr")

0.9999999967110988

39053    4
22350    5
41502    4
34556    3
37208    4
        ..
26575    5
40847    4
33214    3
21265    2
24324    5
Name: cyberbullying_type, Length: 13296, dtype: int64


2611     0
6193     0
22975    5
34596    3
35265    3
        ..
2498     0
41236    4
6173     0
6501     1
7200     1
Name: cyberbullying_type, Length: 31024, dtype: int64
