In [1]:
import pandas as pd

In [2]:
pwd

'C:\\Users\\shrey\\Downloads'

In [3]:
df = pd.read_csv("Preprocessed_cleaned_Final_dataset.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9310 entries, 0 to 9309
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Text          9310 non-null   object
 1   IsHatespeech  9310 non-null   int64 
 2   tokens        9310 non-null   object
 3   clean_text    9307 non-null   object
 4   text_length   9310 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 363.8+ KB


### Word Tokenization
#### Word tokenization is the process of splitting a string of text into individual words or tokens. It is a fundamental step in natural language processing (NLP) and text mining tasks. By breaking down text into words, it allows for more detailed and granular analysis.

In [5]:
df['tokens'].head()

0    ['frustratingly', 'small', 'please', 'find', '...
1    ['padding', 'cell', 'padding', 'class', 'mainp...
2    ['scenario', 'present', 'see', 'believe', 'peo...
3    ['go', 'inside', 'tomorrow', 'hate', 'every', ...
4    ['retweet', 'china', 'bird', 'flu', 'outbreak'...
Name: tokens, dtype: object

In [6]:
df.head(3)

Unnamed: 0,Text,IsHatespeech,tokens,clean_text,text_length
0,frustratingly small please find an image at le...,0,"['frustratingly', 'small', 'please', 'find', '...",frustratingly small please find image least pi...,117
1,padding cell padding class mainpagebg solid ve...,0,"['padding', 'cell', 'padding', 'class', 'mainp...",padding cell padding class mainpagebg solid ve...,1192
2,from the scenarios you present i see you belie...,0,"['scenario', 'present', 'see', 'believe', 'peo...",scenario present see believe people robot abra...,560


### Word2Vec Embeddings
#### Pros:

#### Captures semantic relationships between words.
#### Creates dense, low-dimensional representations.
#### Effective for deep learning models and tasks requiring word similarity and analogy.

In [7]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np


word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=1, workers=4)

def average_word_vectors(tokens, model, num_features):
    feature_vec = np.zeros((num_features,), dtype="float32")
    n_words = 0
    for word in tokens:
        if word in model.wv:
            n_words += 1
            feature_vec = np.add(feature_vec, model.wv[word])
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

df['word2vec'] = df['tokens'].apply(lambda tokens: average_word_vectors(tokens, word2vec_model, 100))

print("Word2Vec Embeddings:\n", df[['clean_text', 'word2vec']])


Word2Vec Embeddings:
                                              clean_text  \
0     frustratingly small please find image least pi...   
1     padding cell padding class mainpagebg solid ve...   
2     scenario present see believe people robot abra...   
3     go inside tomorrow hate every time feel cheap ...   
4     retweet china bird flu outbreak good sign bad ...   
...                                                 ...   
9305  chastity really quality girl woman grow associ...   
9306  wow like folk riding subway today woman like m...   
9307                                        mean nigger   
9308  let know girl run shit round sexist believe sh...   
9309  gavin williamson still still work department e...   

                                               word2vec  
0     [-0.119547114, 0.017391607, -0.13664296, 0.096...  
1     [-0.11867098, 0.028431285, -0.1392278, 0.09403...  
2     [-0.12058537, 0.03158889, -0.13401146, 0.11639...  
3     [-0.16566682, 0.04605666, -0.15

In [11]:
print(df['word2vec'].iloc[1])

[-0.11867098  0.02843129 -0.1392278   0.09403382 -0.22485183 -0.07969432
  0.4322987   0.30697566  0.03423383  0.21524046  0.19655149  0.3932956
  0.05345165 -0.17323622 -0.3242236   0.09726483 -0.28174773 -0.22195601
  0.00302462  0.68216306  0.08207386  0.11479293  0.18207687  0.14174277
 -0.13433623 -0.3283925   0.31476483 -0.08079311  0.01802469 -0.18855911
  0.04345749 -0.29866287  0.2115429  -0.19996466 -0.0247303   0.0459272
  0.50971526 -0.02311499  0.05786419 -0.21057622 -0.08453827  0.33512622
 -0.13532074 -0.17212331  0.27236104  0.00782552 -0.29949874  0.35092297
  0.27169043 -0.03685095 -0.27259758  0.29573256  0.03451292 -0.3256595
  0.10168194 -0.15174487  0.04713284 -0.24815488  0.098003   -0.10764467
  0.10670893 -0.00379638  0.21655838 -0.1734427  -0.10341261  0.04410021
  0.03060368  0.50089914 -0.2817654  -0.0493636   0.12134843 -0.00117335
  0.17823125 -0.04143201 -0.20624149 -0.252618   -0.33880746 -0.03060926
 -0.17954807 -0.15151249  0.15772684 -0.12859192  0.50

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9310 entries, 0 to 9309
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Text          9310 non-null   object
 1   IsHatespeech  9310 non-null   int64 
 2   tokens        9310 non-null   object
 3   clean_text    9307 non-null   object
 4   text_length   9310 non-null   int64 
 5   word2vec      9310 non-null   object
dtypes: int64(2), object(4)
memory usage: 436.5+ KB


In [12]:
df.to_csv("Embedded_df.csv",index=False)