In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Read the labelled training CSV from the dphi-official Datasets GitHub repository.
train = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/dphi-official/Datasets/master/cyberbullying_data/Training_set_label.csv",
)

In [3]:
# Preview the first five rows of the freshly loaded training frame.
train.head(n=5)

Unnamed: 0,id,Text,oh_label
0,5.75e+17,@urgedharry @nyazpolitics @greenlinerzjm Then ...,0
1,5.62e+17,RT @RudawEnglish: Dozens of vehicles belonging...,0
2,5.75e+17,@biebervalue @greenlinerzjm Here is the Quran ...,1
3,5.55e+17,@Ceff00 @JosephIsVegan @SumbelinaZ @IronmanL1 ...,1
4,5.76e+17,I would literally kill someone for Jac and Sha...,0


In [4]:
# Drop the tweet id column; it is not used as a feature further down.
# errors='ignore' keeps this cell re-runnable: `train` is rebound in place, so a
# second execution would otherwise raise KeyError because 'id' is already gone.
train = train.drop(columns='id', errors='ignore')

In [5]:
# Check the frame again now that the 'id' column has been removed.
train.head(5)

Unnamed: 0,Text,oh_label
0,@urgedharry @nyazpolitics @greenlinerzjm Then ...,0
1,RT @RudawEnglish: Dozens of vehicles belonging...,0
2,@biebervalue @greenlinerzjm Here is the Quran ...,1
3,@Ceff00 @JosephIsVegan @SumbelinaZ @IronmanL1 ...,1
4,I would literally kill someone for Jac and Sha...,0


In [6]:
# Discard every row that contains at least one missing value.
train = train.dropna(axis=0)

In [7]:
# Optional (currently disabled): replace the numeric labels with readable class names.
# train['oh_label'] = train.oh_label.map({1:'bullying', 0:'not bullying'})

In [8]:
from sklearn.model_selection import train_test_split

# Keep 25% of the rows as a hold-out set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train['Text'],
    train['oh_label'],
    test_size=0.25,
    random_state=42,
)

# Report how many rows ended up on each side of the split.
print('Number of rows in the total set: {}'.format(len(train)))
print('Number of rows in the training set: {}'.format(len(X_train)))
print('Number of rows in the test set: {}'.format(len(X_test)))

Number of rows in the total set: 15274
Number of rows in the training set: 11455
Number of rows in the test set: 3819


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary on the training text only, then reuse that fitted
# vectorizer for the hold-out text so both matrices share the same columns.
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(raw_documents=X_train)
X_test_vect = vectorizer.transform(raw_documents=X_test)

In [10]:
# Show the (rows, columns) shape of the training TF-IDF matrix.
X_train_vect.shape

(11455, 17674)

In [11]:
# Baseline model: multinomial Naive Bayes trained on the TF-IDF features.
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(X=X_train_vect, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
# Score the Naive Bayes model on the hold-out split.
y_pred = mnb.predict(X=X_test_vect)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8133019114951558

In [13]:
# Per-class precision, recall and F1 for the predictions currently held in y_pred.
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.79      0.98      0.88      2578
           1       0.91      0.47      0.62      1241

    accuracy                           0.81      3819
   macro avg       0.85      0.72      0.75      3819
weighted avg       0.83      0.81      0.79      3819



### RandomForestClassifier

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the fitted model (and the accuracy computed from it) is
# reproducible across kernel restarts; without random_state every run draws
# different bootstrap samples and feature subsets. 42 matches the seed already
# used for the train/test split.
rfr = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train_vect,y_train)

In [24]:
# Hold-out accuracy of the random forest.
y_pred = rfr.predict(X=X_test_vect)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8837391987431265

### LGBM

In [27]:
from lightgbm import LGBMClassifier

# LightGBM classifier with 500 estimators, fitted on the TF-IDF training features.
lgb = LGBMClassifier(n_estimators=500).fit(
    X=X_train_vect,
    y=y_train,
)

In [28]:
# Hold-out accuracy of the LightGBM model.
y_pred = lgb.predict(X=X_test_vect)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8766692851531814

### SVC

In [30]:
from sklearn.svm import SVC

# Support-vector classifier left on scikit-learn's default settings.
svc = SVC()
svc.fit(X=X_train_vect, y=y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [31]:
# Hold-out accuracy of the support-vector classifier.
y_pred = svc.predict(X=X_test_vect)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8929039015449071

## Test Section

In [14]:
# Load the competition test set. This line has to be active: the cells below read
# `test`, and with it commented out a fresh kernel fails with NameError.
test = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/cyberbullying_data/Testing_set_label.csv')

In [15]:
# Preview the first five rows of the test frame.
test.head()

Unnamed: 0,id,Text
0,5.68e+17,I can't explain this. http://t.co/GY1rcVZgbO
1,5.76e+17,“@WomensWeeklyMag: UPDATE: @healthgovau launch...
2,5.76e+17,It would be really funny if they kicked out th...
3,5.72e+17,RT @GrumpyPigeon: I'm thinking #MKR will go pa...
4,5.72e+17,#MKR praying these two bloody bimbos will leav...


In [16]:
# Drop the tweet id column so the test frame mirrors the training frame.
# errors='ignore' keeps this cell re-runnable: `test` is rebound in place, so a
# second execution would otherwise raise KeyError because 'id' is already gone.
test = test.drop(columns='id', errors='ignore')
test.head()

Unnamed: 0,Text
0,I can't explain this. http://t.co/GY1rcVZgbO
1,“@WomensWeeklyMag: UPDATE: @healthgovau launch...
2,It would be really funny if they kicked out th...
3,RT @GrumpyPigeon: I'm thinking #MKR will go pa...
4,#MKR praying these two bloody bimbos will leav...


In [17]:
# Encode the test text with the vectorizer already fitted on the training split
# (transform only, no refitting), so the columns match what the models were trained on.
testing_data = vectorizer.transform(raw_documents=test['Text'])

In [18]:
# Show the (rows, columns) shape of the vectorized test matrix.
testing_data.shape

(5056, 17674)

In [21]:
def download_preds(preds_test, file_name = 'sub.csv'):
    """Write predictions to a CSV file with a single ``prediction`` column.

    Args:
        preds_test: Predicted values to store, one per output row.
        file_name: Destination path of the CSV file (default ``'sub.csv'``).

    Returns:
        None. The file is written without the row index.
    """
    # Attach the predictions as the only column of an otherwise empty frame.
    submission = pd.DataFrame().assign(prediction=preds_test)
    # index=False keeps the row index out of the saved file.
    submission.to_csv(file_name, index=False)

In [29]:
# Predict on the vectorized test set with the LightGBM model and save the result.
preds_test = lgb.predict(X=testing_data)
download_preds(preds_test=preds_test, file_name='nlp3.csv')