In [29]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

In [23]:
df=pd.read_csv('cleaned_text.csv')

In [24]:
df.sample(10)

Unnamed: 0,label,text,word_count,num_stop_words,num_chars,num_punctuation_chars,word_count_after_preprocessing,num_chars_after_preprocessing
13279,spam,husband stayed hotel monaco day october cann...,98,49,512,10,48,318
9840,spam,subject investor communiqup get abzt first th...,1248,362,6760,172,593,4636
851,ham,ill text drop x,8,4,31,1,4,15
8877,spam,subject hard even cuumm not leave hanging toni...,32,11,156,8,15,100
6828,ham,subject union gas thamm tom thamm well came...,143,42,677,15,65,438
12336,ham,url url date not supplied mark two many wonder...,73,25,412,0,48,319
12817,spam,free cd rom lesson url choose number title lea...,215,72,1264,0,144,953
2065,ham,table occupied im waiting tree,8,3,45,3,5,30
9638,ham,subject fw piss side road original message tc...,162,19,631,56,65,408
13597,spam,making obligatory trip chicago area visit fami...,69,30,380,11,38,253


In [30]:
# Preprocessing: Tokenize text and remove unnecessary characters
def preprocess_text(text):
    if isinstance(text, str):  # Ensure text is a string
        # Tokenize the sentence into words
        tokens = word_tokenize(text.lower())  # Lowercase the text
        return tokens
    return []  # Return empty list for None or non-string values

In [31]:
# Apply the preprocessing function to the text column
df['tokenized_text'] = df['text'].apply(preprocess_text)

In [32]:
df.sample(5)

Unnamed: 0,label,text,word_count,num_stop_words,num_chars,num_punctuation_chars,word_count_after_preprocessing,num_chars_after_preprocessing,tokenized_text
12108,ham,url url date not supplied british scientist vo...,25,9,145,0,17,110,"[url, url, date, not, supplied, british, scien..."
6600,spam,subject snap explosion immoderate massive rock...,191,8,1357,23,158,1267,"[subject, snap, explosion, immoderate, massive..."
14528,ham,consistently disappointed throughout stay jame...,299,152,1607,49,149,1025,"[consistently, disappointed, throughout, stay,..."
1215,ham,plan pongal,5,3,26,1,2,11,"[plan, pongal]"
6088,ham,subject tenaska iv please change demand fee ...,50,10,221,10,16,128,"[subject, tenaska, iv, please, change, demand,..."


In [33]:
sentences=df['tokenized_text'].to_list()

In [34]:
len(sentences)

14617

In [35]:
sentences

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'u', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'questionstd',
  'txt',
  'ratetcs',
  'apply',
  'overs'],
 ['u', 'dun', 'say', 'early', 'hor', 'u', 'c', 'already', 'say'],
 ['nah', 'dont', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'no',
  'word',
  'back',
  'id',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even',
  'brother',
  'not',
  'like',
  'speak',
  'me',
  'treat',
  'like',
  'aid',
  'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'callers',
  'press',
  'copy',
  'friend',

In [36]:
# Initialize and train the Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)  # vector_size is the length of the word vectors


In [37]:
#Convert text data to Word2Vec embeddings
def get_sentence_embedding(tokens, model):
    """
    Get the average Word2Vec vector for a sentence (list of tokens).
    If a token is not in the model's vocabulary, it is ignored.
    """
    # Get vectors for words in the model's vocabulary
    valid_words = [model.wv[word] for word in tokens if word in model.wv]
    
    if not valid_words:
        # If no valid words, return a zero vector
        return [0] * model.vector_size
    
    # Calculate the average vector for the sentence
    return list(sum(valid_words) / len(valid_words))



In [38]:
# Apply the function to get Word2Vec embeddings for each row
df['word2vec_embedded_vector'] = df['tokenized_text'].apply(lambda tokens: get_sentence_embedding(tokens, model))



In [40]:
df.sample(4)

Unnamed: 0,label,text,word_count,num_stop_words,num_chars,num_punctuation_chars,word_count_after_preprocessing,num_chars_after_preprocessing,tokenized_text,word2vec_embedded_vector
11287,ham,begin pgp signed message hash shanumber conten...,232,62,1519,0,169,1219,"[begin, pgp, signed, message, hash, shanumber,...","[-0.52670753, 0.043331273, -1.0631641, 0.13854..."
1177,ham,yo game almost over want go walmart soon,11,3,52,2,8,40,"[yo, game, almost, over, want, go, walmart, soon]","[-0.12471256, -0.11388658, -1.1051327, -0.3513..."
13665,spam,traveled chicago husband romantic weekend away...,67,27,392,10,41,281,"[traveled, chicago, husband, romantic, weekend...","[-0.21170694, 0.01254996, -1.3142825, -0.61478..."
4330,ham,hide anythiing keeping distance,9,5,49,0,4,31,"[hide, anythiing, keeping, distance]","[-0.16022897, 0.06158752, -0.4214239, -0.21287..."


In [41]:
df1=df[['label','word_count','num_stop_words','num_chars','num_punctuation_chars','word_count_after_preprocessing','num_chars_after_preprocessing','word2vec_embedded_vector']]

In [42]:
df1

Unnamed: 0,label,word_count,num_stop_words,num_chars,num_punctuation_chars,word_count_after_preprocessing,num_chars_after_preprocessing,word2vec_embedded_vector
0,ham,20,4,111,9,16,82,"[-0.44536713, -0.101509035, -1.050856, -0.2630..."
1,ham,6,0,29,6,6,23,"[-0.33632967, -0.17212276, -0.87967306, -0.000..."
2,spam,28,5,155,5,20,110,"[-0.23400958, -0.0057888003, -0.75428146, 0.01..."
3,ham,11,2,49,6,9,35,"[-0.3609581, -0.032422144, -1.5170497, -0.1118..."
4,ham,13,6,61,2,8,40,"[-0.10801181, -0.021496635, -1.3440485, -0.605..."
...,...,...,...,...,...,...,...,...
14612,ham,131,59,745,24,74,517,"[-0.25193012, -0.063167185, -1.1948055, -0.387..."
14613,ham,121,62,670,18,63,430,"[-0.1882756, -0.060200736, -1.2349558, -0.3945..."
14614,ham,141,72,770,9,69,490,"[-0.237662, -0.07663142, -1.2476673, -0.374296..."
14615,ham,86,47,448,16,41,266,"[-0.34076858, -0.05880893, -1.1384875, -0.3845..."


In [43]:
df1.to_csv('word2vec_embedded.csv',index=False)