### Text Classification With Word2vec And AvgWord2vec

In [1]:
import pandas as pd
df = pd.read_csv("spam.csv",encoding="latin1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [2]:
# Rename 'v1' to 'label' and 'v2' to 'Message'
df.rename(columns={'v1': 'label', 'v2': 'Message'}, inplace=True)

# Drop the unwanted columns
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

# Display the updated DataFrame
print(df.head())

  label                                            Message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


### Data Preprocessing and Cleaning

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lz = WordNetLemmatizer()
from nltk.stem import SnowballStemmer
sb = SnowballStemmer('english')
from nltk.stem import PorterStemmer
pt = PorterStemmer()

In [4]:
corpus = []
for i in range(0,len(df)):
    review = re.sub('[^a-zA-Z]',' ',df['Message'][i])
    review = review.lower()
    review =review.split()
    review = [lz.lemmatize(word)for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [5]:
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash pound txt csh send cost p day day tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw'

### gensim model used for word to vector conversion 

In [6]:
import gensim
print(gensim.__version__)

4.3.3


In [7]:
import gensim
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

##### simple_preprocess()
###### Convert a document into a list of lowercase tokens, ignoring tokens that are too short or too long.

In [8]:
words = []
for sent in corpus:
    sent_token = sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [9]:
words

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  'receivea',
 

In [10]:
## Train our Word2Vec model
import gensim
model = gensim.models.Word2Vec(sentences=words, vector_size=100, window=5, min_count=1, workers=4)

In [11]:
print("Vocabulary size:", len(model.wv.index_to_key))

Vocabulary size: 7000


In [12]:
# Check the corpus count (number of sentences used in training)
print("Number of sentences used for training:", model.corpus_count)

Number of sentences used for training: 5564


In [13]:
model.epochs

5

In [14]:
model.wv.similar_by_word('good')

[('get', 0.9996623992919922),
 ('day', 0.9996502995491028),
 ('going', 0.9996358752250671),
 ('go', 0.9996338486671448),
 ('need', 0.999631404876709),
 ('much', 0.9996204972267151),
 ('got', 0.9996193647384644),
 ('want', 0.9996174573898315),
 ('last', 0.9996129274368286),
 ('love', 0.9995866417884827)]

In [15]:
model.wv['good'].shape

(100,)

In [16]:
vocabulary = set(model.wv.index_to_key)
def avg_word2vec(doc):
    # Convert generator to a list before computing mean
    vectors = [model.wv[word] for word in doc if word in vocabulary]
    if vectors:  # Ensure there are valid vectors to compute mean
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)  # Return a zero vector if no words are in the vocabulary

In [17]:
import numpy as np
X = []
for i in range(len(words)):
    X.append(avg_word2vec(words[i]))

In [18]:
X

[array([-0.09921931,  0.18664375,  0.02273461, -0.00734869, -0.009332  ,
        -0.24551101,  0.05735633,  0.37138054, -0.1881552 , -0.07047191,
        -0.11365795, -0.31849512,  0.00401605,  0.09237561,  0.08308922,
        -0.12875429,  0.00487332, -0.2557651 , -0.01167506, -0.29734755,
         0.12172519,  0.09048425,  0.04277093, -0.07871895, -0.10768671,
         0.02599582, -0.11684384, -0.11285295, -0.17783244, -0.006338  ,
         0.15730672,  0.02510151,  0.00371873, -0.11982765, -0.07131241,
         0.17965353,  0.01781662, -0.1436274 , -0.09651916, -0.35865885,
         0.02896593, -0.15491556, -0.08288794,  0.00891151,  0.16288865,
        -0.07734079, -0.147913  ,  0.00147562,  0.08367565,  0.12892683,
         0.08670188, -0.17420624, -0.04672887, -0.00653089, -0.14222674,
         0.10217095,  0.13701563, -0.02815722, -0.19021773,  0.06352683,
         0.03124337,  0.09515809, -0.07496464,  0.01442155, -0.18588556,
         0.16619763,  0.07806679,  0.11052433, -0.2

In [19]:
len(X)

5564

In [20]:
## Independent Feature
X_new = np.array(X)

In [21]:
X_new.shape

(5564, 100)

In [22]:
X_new[0].shape

(100,)

In [23]:
### Output Feature
y = pd.get_dummies(df['label'])
y = y.iloc[:,0].values

In [24]:
y.shape

(5572,)

In [25]:
df.shape

(5572, 2)

In [26]:
print("Feature matrix shape:", X_new.shape)  # Should be (5564, 100)
print("Label shape:", y.shape)

Feature matrix shape: (5564, 100)
Label shape: (5572,)


In [27]:
# Ensure the number of labels matches the number of samples in X_new
y = y[:len(X_new)]  # Trim the labels to match the number of samples in X_new

# Check the shapes again
print("Feature matrix shape:", X_new.shape)  # (5564, 100)
print("Label shape:", y.shape)  # (5564,)

Feature matrix shape: (5564, 100)
Label shape: (5564,)


In [29]:
# Step 1: Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

# Step 2: Initialize and train the model
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Step 3: Evaluate the model
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 84.64%
              precision    recall  f1-score   support

       False       0.07      0.01      0.01       158
        True       0.86      0.99      0.92       955

    accuracy                           0.85      1113
   macro avg       0.46      0.50      0.46      1113
weighted avg       0.74      0.85      0.79      1113

Confusion Matrix:
 [[  1 157]
 [ 14 941]]
