# Cyberbullying model using XGBoost, Random Forest and SVC

In [36]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

## Preprocessing the dataset

In [2]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
stop_words.update(list(string.punctuation))

In [3]:
df = pd.read_csv("anti-bully-data.csv")
df.head()

Unnamed: 0,label_bullying,text_message
0,0,yeah I got 2 backups for all that. I just hate...
1,0,I hate using my BB but love my iPhone. Haven'...
2,1,Get fucking real dude.
3,1,She is as dirty as they come and that crook ...
4,1,why did you fuck it up. I could do it all day...


In [16]:
messages = df['text_message']
y = df['label_bullying']

In [17]:
def get_simple_pos(tag) :
    if tag.startswith('J') :
        return wordnet.ADJ
    elif tag.startswith('V') :
        return wordnet.VERB
    elif tag.startswith('N') :
        return wordnet.NOUN
    elif tag.startswith('R') :
        return wordnet.ADV
    else:
        return wordnet.NOUN

def clean_text(review) :
    global max_len
    words = word_tokenize(review)
    output_words = []
    for word in words :
        if word.lower() not in stop_words :
            pos = pos_tag([word])
            clean_word = lemmatizer.lemmatize(word,pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    max_len = max(max_len, len(output_words))
    return " ".join(output_words)

In [18]:
max_len = 0

In [19]:
messages = [clean_text(message) for message in messages]
print(messages[0])

yeah get 2 backup hate happen strugglin week ... handle tho


In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(messages, y, random_state = 0, test_size = 0.15)
len(X_train)

7494

In [23]:
CountVec = CountVectorizer(max_df = 0.8,min_df = 0.0005)
X_train = CountVec.fit_transform(X_train)
X_test = CountVec.transform(X_test)

## The XGBoost model

In [31]:
xgb = XGBClassifier()
xgb.fit(X_train, Y_train)
print(xgb.score(X_train, Y_train))
print(xgb.score(X_test, Y_test))

0.7556712036295703
0.7135298563869993


## The Random Forest model

In [32]:
rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)
print(rfc.score(X_train, Y_train))
print(rfc.score(X_test, Y_test))



0.9603682946357086
0.6681783824640968


## The Support Vector Classifier model

In [35]:
svc = SVC(kernel = 'rbf')
svc.fit(X_train, Y_train)
print(svc.score(X_train, Y_train))
print(svc.score(X_test, Y_test))



0.7160394982652789
0.7150415721844293


In [38]:
params = {'C' : [1e2,1e3,5e3,1e4,5,5e4,1e5],'gamma':[1e-3,5e-4,1e-4,5e-3]}
grid  = GridSearchCV(svc,params)
grid.fit(X_train, Y_train)
print(grid.score(X_train, Y_train))
print(grid.score(X_test, Y_test))



0.7227115025353616
0.7142857142857143


# Accuracy of each model
### 1. XGBoost: 71.35%
### 2. Random Forest:66.82%
### 3. SVC: 71.50
