### **Word2Vec Encoding Method**

In [1]:
import pandas as pd

# Load the dataset
# Replace with the actual path to your CSV file
df = pd.read_csv('/content/cleaned_dataset.csv')

# Display the first few rows to verify
print("Original DataFrame:")
print(df.head())




Original DataFrame:
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  \
0  retweet as a woman you should not complain abo...   
1  retweet boy dats coldtyga down bad for cuffin ...   
2  retweet dawg retweet you ever fuck a bitch and...   
3                     retweet she look like a tranny   
4  retweet the shit you hear about me might be tr...   

                                        tweet_tokens  
0  ['retweet', 'woman', 'complain', 'cleaning', '...  
1  ['retweet', 'boy', 'dat', 'coldtyga', 'bad', '...  
2  ['retweet', 'dawg', 'retweet', 'ever

In [2]:
import ast

# Convert string representation of list to actual list
df['tweet_tokens'] = df['tweet_tokens'].apply(ast.literal_eval)

# Display the first few rows to verify
print("\nDataFrame after converting tweet_tokens to lists:")
print(df.head())



DataFrame after converting tweet_tokens to lists:
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  \
0  retweet as a woman you should not complain abo...   
1  retweet boy dats coldtyga down bad for cuffin ...   
2  retweet dawg retweet you ever fuck a bitch and...   
3                     retweet she look like a tranny   
4  retweet the shit you hear about me might be tr...   

                                        tweet_tokens  
0  [retweet, woman, complain, cleaning, house, an...  
1  [retweet, boy, dat, coldtyga, bad, cuffin, dat...  
2  [retw

In [3]:
from gensim.models import Word2Vec

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=df['tweet_tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Get the vector size
vector_size = word2vec_model.wv.vector_size



In [4]:
import numpy as np

def document_vector(doc):
    # Remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv.key_to_index]
    # If the document is empty, return a zero vector
    if not doc:
        return np.zeros(vector_size)
    # Calculate the mean vector
    return np.mean(word2vec_model.wv[doc], axis=0)

# Apply the function to transform tweet_tokens
X_vectors = df['tweet_tokens'].apply(document_vector).tolist()
X_vectors = np.array(X_vectors)

# Display the first few rows to verify
print("\nVectorized DataFrame:")
print(X_vectors)
print(X_vectors.shape)




Vectorized DataFrame:
[[-0.1281698   0.42272925  0.04509426 ... -0.60607636  0.1591309
  -0.57312489]
 [-0.0891434   0.36997801 -0.00723159 ... -0.51402587  0.16330306
  -0.51962775]
 [-0.12536888  0.43876806 -0.01209868 ... -0.58585697  0.18756148
  -0.58029068]
 ...
 [ 0.03422692  0.30399019 -0.01226119 ... -0.56724137  0.31995553
  -0.85040873]
 [-0.05223104  0.2863552   0.00428885 ... -0.38134488  0.10953305
  -0.36331376]
 [-0.06535207  0.18472365  0.02876179 ... -0.25548741  0.05174784
  -0.21488836]]
(24783, 100)


In [6]:
# Combine Word2Vec features with other features
other_features = df[['tweet_tokens']]
X_combined = np.hstack((other_features, X_vectors))

# Define the target variable
y = df['class']

# Display the first few rows to verify
print("\nCombined DataFrame:")
print(X_combined)
print(X_combined.shape)


Combined DataFrame:
[[list(['retweet', 'woman', 'complain', 'cleaning', 'house', 'andamp', 'man', 'always', 'take', 'trash'])
  -0.12816980481147766 0.4227292537689209 ... -0.6060763597488403
  0.15913090109825134 -0.573124885559082]
 [list(['retweet', 'boy', 'dat', 'coldtyga', 'bad', 'cuffin', 'dat', 'hoe', 'st', 'place'])
  -0.08914340287446976 0.36997801065444946 ... -0.514025866985321
  0.16330306231975555 -0.5196277499198914]
 [list(['retweet', 'dawg', 'retweet', 'ever', 'fuck', 'bitch', 'start', 'cry', 'confused', 'shit'])
  -0.1253688782453537 0.43876805901527405 ... -0.5858569741249084
  0.18756148219108582 -0.580290675163269]
 ...
 [list(['young', 'buck', 'wan', 'na', 'eat', 'dat', 'nigguh', 'like', 'fuckin', 'dis'])
  0.0342269167304039 0.3039901852607727 ... -0.567241370677948
  0.3199555277824402 -0.8504087328910828]
 [list(['youu', 'got', 'wild', 'bitch', 'tellin', 'lie'])
  -0.052231043577194214 0.286355197429657 ... -0.38134488463401794
  0.10953304916620255 -0.36331376