# This experiment tests the effectiveness of using synonyms to avoid OOV words

In [19]:
import pandas as pd
import numpy as np
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from xgboost import XGBClassifier
import random


In [4]:
random.seed(115)


In [5]:
def get_synonyms(word):
    a = set([w for syn in wordnet.synsets(word) for w in syn.lemma_names()])
    if word in a: a.remove(word)
    return list(a)


In [34]:
get_synonyms('deplorable')

['sorry',
 'execrable',
 'condemnable',
 'vicious',
 'sad',
 'distressing',
 'reprehensible',
 'lamentable',
 'miserable',
 'criminal',
 'pitiful',
 'wretched',
 'woeful']

In [43]:
TRAIN_DATA_PATH = r'C:\Users\rooty\UWEC\Research\CyberBullyingML\cyberbullyingml\cyberbullying-ml\data\en_only\48000_cyberbullying_tweets_basic_clean.csv'
TEST_DATA_PATH = r'C:\Users\rooty\UWEC\Research\CyberBullyingML\cyberbullyingml\cyberbullying-ml\data\en_only\hatespeech_tweets_basic_clean.csv'

train_df = pd.read_csv(TRAIN_DATA_PATH)
test_df = pd.read_csv(TEST_DATA_PATH)

In [44]:
train_df.dropna(inplace=True)
train_df.drop_duplicates(inplace=True)
train_df.reset_index(inplace=True, drop=True)

test_df.dropna(inplace=True)
test_df.drop_duplicates(inplace=True)
test_df.reset_index(inplace=True, drop=True)

In [45]:
# convert labels to 0 and 1

train_df['label'] = train_df['label'].apply(lambda label: 0 if label == 'notcb' else 1)
test_df['class'] = test_df['class'].apply(lambda label: 1 if label == 0 else 0)

In [48]:
x_train = train_df['tweet']
y_train = train_df['label']

x_test = test_df['tweet']
y_test = test_df['class']

In [92]:
vect = CountVectorizer()
vect.fit_transform([x_train.iloc[2]])



<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 39 stored elements and shape (1, 39)>

In [33]:
vect = CountVectorizer(tokenizer=lambda x: x.split()) 
x_train = vect.fit_transform(x_train)
vocab = [w for w, i in vect.vocabulary_.items()]




In [34]:
'as' in list(vocab)

True

In [35]:
def replace_oov_words(text):
    words = text.split()
    replacements = []
    for word in words:
        if word not in vect.vocabulary_:
            synonyms = get_synonyms(word)
            syns = [1 if synonym in vect.vocabulary_ else 0 for synonym in synonyms]
            if 1 in syns:
                indexes = [i for i, val in enumerate(syns) if val == 1]
                random_index = random.choice(indexes)
                replacements.append((word, synonyms[random_index]))
    for word, replacement in replacements:
        print(f'Replaced {word} with {replacement}')
        text = text.replace(word, replacement)
    return text

x_test = x_test.apply(lambda text: ' '.join([w.lower() for w in text.split()])).apply(replace_oov_words)

Replaced overdosing with overdose
Replaced photographed with snap
Replaced edges with abut
Replaced fosters with further
Replaced dancers with dancer
Replaced dancers with dancer
Replaced beers with beer
Replaced drakes with drake
Replaced scooting with scoot
Replaced adios with bye
Replaced slushes with splash
Replaced slushes with splash
Replaced stark with bleak
Replaced smuggler with runner
Replaced boosting with hike
Replaced loco with nutty
Replaced beefing with beef
Replaced mush with slop
Replaced rentals with rental
Replaced sniff with sniffle
Replaced rollers with roll
Replaced gimped with limp
Replaced violets with violet
Replaced angelique with angelica
Replaced angelique with angelica
Replaced dips with duck
Replaced sniff with sniffle
Replaced oreos with oreo
Replaced tryout with audition
Replaced brownies with imp
Replaced goods with goodness
Replaced trys with taste
Replaced snort with snicker
Replaced fresher with sweet
Replaced batter with hitter
Replaced brownies wit

In [36]:
x_test

0        as a woman you should not complain about clean...
1        boy dats coldtyga dwn bad for cuffin dat hoe i...
2        dawg you ever fuck a bitch and she start to cr...
3                                   she look like a tranny
4        the shit you hear about me might be true or it...
                               ...                        
22503    you are such a retard i hope you get type 2 di...
22504    yous a muthafin lie right his tl is trash now ...
22505    you have gone and broke the wrong heart baby a...
22506                youu got wild bitches tellin you lies
22507    flick ntac eileen dahlia beautiful color combi...
Name: tweet, Length: 22508, dtype: object

In [37]:
x_test = vect.transform(x_test)

In [38]:
pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

In [39]:
y_train.value_counts()

label
1    37298
0     6377
Name: count, dtype: int64

In [49]:
def get_OOV_feats(train_data:pd.Series, test_data:pd.Series, print_oov_feats:bool=False):
    v1 = CountVectorizer()
    v2 = CountVectorizer()

    v1.fit(train_data)
    v2.fit(test_data)

    feats_train = v1.get_feature_names_out()
    feats_test = v2.get_feature_names_out()

    oov_feats = np.setdiff1d(feats_test, feats_train)
    if print_oov_feats: print(f"OOV features: {oov_feats}")

    return oov_feats

In [51]:
get_OOV_feats(x_train, x_test)

array(['02', '0300', '08', ..., 'zulu', 'zzzquil', 'zzzzzz'], dtype=object)

In [40]:
m = XGBClassifier(scale_pos_weight=pos_weight)
m.fit(x_train, y_train)
preds = m.predict(x_test)

In [41]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.94      0.48      0.64     21262
           1       0.05      0.44      0.09      1246

    accuracy                           0.48     22508
   macro avg       0.49      0.46      0.36     22508
weighted avg       0.89      0.48      0.61     22508



In [28]:
vect2 = CountVectorizer(tokenizer=lambda x: x.split())
x_train2 = vect2.fit_transform(x_train)
x_test2 = vect2.transform(x_test)
pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
y_train.value_counts()
m2 = XGBClassifier(scale_pos_weight=pos_weight)
m2.fit(x_train2, y_train)
preds2 = m2.predict(x_test2)
print(classification_report(y_test, preds2))




              precision    recall  f1-score   support

           0       0.94      0.48      0.64     21262
           1       0.05      0.44      0.09      1246

    accuracy                           0.48     22508
   macro avg       0.49      0.46      0.36     22508
weighted avg       0.89      0.48      0.61     22508

