In [26]:
import pickle
import numpy as np 
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
import nltk
from nltk import sent_tokenize
import spacy 
import string
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score ,  confusion_matrix , classification_report

In [32]:
df = pd.read_csv("cyberbullying_tweets.csv")
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47692 entries, 0 to 47691
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   tweet_text          47692 non-null  object
 1   cyberbullying_type  47692 non-null  object
dtypes: object(2)
memory usage: 745.3+ KB


In [4]:
df.shape

(47692, 2)

In [29]:
df.isnull().sum()

tweet_text            0
cyberbullying_type    0
dtype: int64

In [18]:
mapping_dict = {
                 'religion':1,
                 'age':2,
                 'gender':3,
                 'ethnicity':4,
                 'other_cyberbullying':5,
                 'not_cyberbullying':0
            }

In [19]:
df['cyberbullying_type'] = df['cyberbullying_type'].map(mapping_dict)

In [5]:
story = df['tweet_text'].to_list()
corpus = []
for i in story:
    raw_sent = sent_tokenize(i)
    for sent in raw_sent:
        corpus.append(simple_preprocess(sent))

In [6]:
corpus[0]

['in', 'other', 'words', 'katandandre', 'your', 'food', 'was', 'crapilicious']

In [7]:
model = Word2Vec(
    window = 5,
    min_count = 2,
    vector_size=300
)

In [8]:
model.build_vocab(corpus)

In [9]:
model.train(corpus , total_examples=model.corpus_count , epochs = model.epochs)

(4229848, 5423970)

In [10]:
model.save('model.kv')

In [11]:
new_df = df

In [12]:
!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
punctuations = string.punctuation

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [13]:
def spacy_tokenizer(sent):
    doc = nlp(sent)
    mytokens = [ word.lemma_.lower().strip() for word in doc]
    mytokens = [word for word in mytokens if word not in stop_words and word not in punctuations]
    return mytokens

def sent_vec(sent,model):
    vector_size = model.vector_size
    model_res = np.zeros(vector_size)
    ctr = 1
    for i in sent:
        if i in model.wv:
            model_res += model.wv[i]
            ctr+=1
    model_res = model_res/ctr
    return model_res

In [14]:
new_df['tweet_text'] = new_df['tweet_text'].apply(spacy_tokenizer)
new_df['tweet_text'] = new_df['tweet_text'].apply(lambda x: sent_vec(x,model))

In [21]:
X = new_df['tweet_text'].to_list()
y = df['cyberbullying_type'].to_list()

In [25]:
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size = 0.20 , random_state =2)
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.78

Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.46      0.49      1589
           1       0.90      0.93      0.91      1575
           2       0.97      0.97      0.97      1598
           3       0.89      0.77      0.83      1606
           4       0.95      0.93      0.94      1634
           5       0.47      0.60      0.53      1537

    accuracy                           0.78      9539
   macro avg       0.79      0.78      0.78      9539
weighted avg       0.79      0.78      0.78      9539


Confusion Matrix:
[[ 731   58   28   55   17  700]
 [  26 1460    2   15   17   55]
 [  16    2 1558    2    2   18]
 [ 125   22    4 1241   15  199]
 [  13   44    4    7 1512   54]
 [ 459   38   18   74   24  924]]


In [38]:
with open('randomForest.pkl','wb') as f:
    pickle.dump(rf_classifier,f)