<a href="https://colab.research.google.com/github/rootdrew27/cyberbullying-ml/blob/main/CatBoost_w_Top2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

CatBoost ImplementationC

In [None]:
!pip install catboost
!pip install tensorflow
!pip install top2vec

In [2]:
# data management
import pandas as pd
import numpy as np

# preprocessing
import re

import sklearn
import tensorflow as tf

from top2vec import Top2Vec

from catboost import CatBoostClassifier, Pool, metrics, cv

In [3]:
df = pd.read_csv("./clean_data_2.csv", usecols=[1,2])

In [4]:
df.dropna(axis=0, inplace=True) #drop rows that contain any null values
df.reset_index(drop=True)

Unnamed: 0,tweet_text,cyberbullying_type
0,in other words your food was crapilicious,0
1,why is so white,0
2,<@> a classy whore or more red velvet cupcakes,0
3,<@> gio meh p thanks for the heads up but not ...,0
4,<@> this is an isis account pretending to be a...,0
...,...,...
43843,black people are not expected to do anything d...,4
43844,turner did not withhold his disappointment tur...,4
43845,i swear to god this dumb nigger bitch i have g...,4
43846,yea fuck you rt <@> if you are a nigger fuckin...,4


In [5]:
def remove_mentions(text):
  return re.sub(r'<@>', r'', text)

# add a mentions count column, and a tweet length column
df['mentions_count'] = [tweet.split().count('<@>') for tweet in df.tweet_text] #number of mentions in tweet
df['tweet_text'] = df['tweet_text'].apply(remove_mentions)
df['tweet_len'] = [len(tweet.split()) for tweet in df.tweet_text] #count words in tweet
df = df[df['tweet_len'] >= 2]
df.head()

Unnamed: 0,tweet_text,cyberbullying_type,mentions_count,tweet_len
0,in other words your food was crapilicious,0,0,7
1,why is so white,0,0,4
2,a classy whore or more red velvet cupcakes,0,1,8
3,gio meh p thanks for the heads up but not too...,0,1,18
4,this is an isis account pretending to be a ku...,0,1,17


# Generate topics using Top2Vec

In [6]:
t2v = Top2Vec(documents=df['tweet_text'].to_list(), embedding_model='doc2vec', speed='deep-learn')

2024-04-01 00:28:20,278 - top2vec - INFO - Pre-processing documents for training
INFO:top2vec:Pre-processing documents for training
2024-04-01 00:28:23,484 - top2vec - INFO - Creating joint document/word embedding
INFO:top2vec:Creating joint document/word embedding
2024-04-01 00:56:49,761 - top2vec - INFO - Creating lower dimension embedding of documents
INFO:top2vec:Creating lower dimension embedding of documents
2024-04-01 00:58:04,378 - top2vec - INFO - Finding dense areas of documents
INFO:top2vec:Finding dense areas of documents
2024-04-01 00:58:12,252 - top2vec - INFO - Finding topics
INFO:top2vec:Finding topics


In [7]:
t2v.save("Top2Vec") # save the model
# t2v = Top2Vec.load("Top2Vec") # load the model

In [24]:
# Add the topic index to the dataframe
doc_ids = np.arange(0, df['tweet_text'].count(), 1).tolist()

topic_nums, topic_score, topic_words, word_scores = t2v.get_documents_topics(doc_ids, num_topics=1)

df['topic'] = topic_nums

In [25]:
df.head()

Unnamed: 0,tweet_text,cyberbullying_type,mentions_count,tweet_len,topic
0,in other words your food was crapilicious,0,0,7,59
1,why is so white,0,0,4,21
2,a classy whore or more red velvet cupcakes,0,1,8,35
3,gio meh p thanks for the heads up but not too...,0,1,18,25
4,this is an isis account pretending to be a ku...,0,1,17,9


In [None]:
print(t2v.get_topics(10)) # NOTE: the cosine similarity score of words for each topic is fairly low (below 30%)

# Split the Data

In [26]:
X = df.drop('cyberbullying_type', axis=1)
Y = df['cyberbullying_type']

In [27]:
from sklearn.model_selection import train_test_split

#Split 1 - 80/20
x_Train_1, x_Test_1, y_Train_1, y_Test_1 = train_test_split(
    X,
    Y,
    test_size=.2,
    random_state=100,
    shuffle=True
)

#Split 2 - 70/30
x_Train_2, x_Test_2, y_Train_2, y_Test_2 = train_test_split(
    X,
    Y,
    test_size=.3,
    random_state=100,
    shuffle=True
)

#Split 3 - 60/40
x_Train_3, x_Test_3, y_Train_3, y_Test_3 = train_test_split(
    X,
    Y,
    test_size=.4,
    random_state=100,
    shuffle=True
)

# Training and Testing

---



In [28]:
#Try pooling

train_pool_1 = Pool(data=x_Train_1,
                  label=y_Train_1,
                  text_features=['tweet_text']
                  )
test_pool_1 = Pool(data=x_Test_1,
                 label=y_Test_1,
                 text_features=['tweet_text'])

train_pool_2 = Pool(data=x_Train_2,
                  label=y_Train_2,
                  text_features=['tweet_text']
                  )
test_pool_2 = Pool(data=x_Test_2,
                 label=y_Test_2,
                 text_features=['tweet_text'])

train_pool_3 = Pool(data=x_Train_3,
                  label=y_Train_3,
                  text_features=['tweet_text']
                  )
test_pool_3 = Pool(data=x_Test_3,
                 label=y_Test_3,
                 text_features=['tweet_text'])

In [29]:
def fit_model(train_pool, **kwargs):
    model = CatBoostClassifier(
        iterations=100,
        learning_rate=0.1,
        eval_metric='MultiClass',
        **kwargs
    )

    return model.fit(train_pool)

model1 = fit_model(train_pool_1)
model2 = fit_model(train_pool_2)
model3 = fit_model(train_pool_3)

0:	learn: 1.5000945	total: 2.47s	remaining: 4m 4s
1:	learn: 1.3286903	total: 4.24s	remaining: 3m 27s
2:	learn: 1.2035688	total: 6.14s	remaining: 3m 18s
3:	learn: 1.1018824	total: 8.48s	remaining: 3m 23s
4:	learn: 1.0210627	total: 11.1s	remaining: 3m 30s
5:	learn: 0.9593787	total: 12.9s	remaining: 3m 21s
6:	learn: 0.9022285	total: 14.7s	remaining: 3m 15s
7:	learn: 0.8566447	total: 16.6s	remaining: 3m 10s
8:	learn: 0.8170475	total: 18.4s	remaining: 3m 6s
9:	learn: 0.7844974	total: 20.4s	remaining: 3m 3s
10:	learn: 0.7521221	total: 23.4s	remaining: 3m 9s
11:	learn: 0.7256577	total: 25.3s	remaining: 3m 5s
12:	learn: 0.7006520	total: 27.2s	remaining: 3m 1s
13:	learn: 0.6790634	total: 29.1s	remaining: 2m 58s
14:	learn: 0.6576060	total: 31.1s	remaining: 2m 56s
15:	learn: 0.6393843	total: 33s	remaining: 2m 53s
16:	learn: 0.6252342	total: 35.8s	remaining: 2m 54s
17:	learn: 0.6106070	total: 38s	remaining: 2m 52s
18:	learn: 0.5975538	total: 39.9s	remaining: 2m 50s
19:	learn: 0.5851628	total: 41.9

In [61]:
y_pred_1 = [pred[0] for pred in model1.predict(test_pool_1)]
y_pred_2 = [pred[0] for pred in model2.predict(test_pool_2)]
y_pred_3 = [pred[0] for pred in model3.predict(test_pool_3)]

In [66]:
from catboost.utils import eval_metric


y_Test_1 = np.array(y_Test_1)
y_Test_2 = np.array(y_Test_2)
y_Test_3 = np.array(y_Test_3)

y_pred_1 = np.array(y_pred_1)
y_pred_2 = np.array(y_pred_2)
y_pred_3 = np.array(y_pred_3)

acc1 = eval_metric(y_Test_1, y_pred_1, 'Accuracy')
acc2 = eval_metric(y_Test_2, y_pred_2, 'Accuracy')
acc3 = eval_metric(y_Test_3, y_pred_3, 'Accuracy')

prec1 = eval_metric(y_Test_1, y_pred_1, 'Precision')
prec2 = eval_metric(y_Test_2, y_pred_2, 'Precision')
prec3 = eval_metric(y_Test_3, y_pred_3, 'Precision')


F1_1 = eval_metric(y_Test_1, y_pred_1, 'F1')
F1_2 = eval_metric(y_Test_2, y_pred_2, 'F1')
F1_3 = eval_metric(y_Test_3, y_pred_3, 'F1')


recall_1 = eval_metric(y_Test_1, y_pred_1, 'Recall')
recall_2 = eval_metric(y_Test_2, y_pred_2, 'Recall')
recall_3 = eval_metric(y_Test_3, y_pred_3, 'Recall')

In [63]:
print(acc1)
print(acc2)
print(acc3)

[0.8673761299919899]
[0.8673329264571255]
[0.8692567374263318]


In [67]:
print(prec1)
print(prec2)
print(prec3)

[0.91208360548661]
[0.9123724823437092]
[0.9126823206226699]


In [68]:
print(F1_1)
print(F1_2)
print(F1_3)

[0.9233617668452027]
[0.9232805399920591]
[0.9243202066704204]


In [69]:
print(recall_1)
print(recall_2)
print(recall_3)

[0.9349223352972683]
[0.9344525808180032]
[0.9362587224906066]
