In [310]:
# Installs the gensim library for word embedding.
!pip install gensim



In [311]:
# Imports necessary libraries for data manipulation (pandas, numpy), plotting (matplotlib, seaborn), and enables inline plotting.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import gensim

In [312]:
# Reads the SMS spam collection dataset into a pandas DataFrame.
msg = pd.read_csv('/content/SMSSpamCollection.txt', sep='\t', names =[ "label" ,"message"  ])

In [313]:
# Imports libraries for regular expressions and natural language processing.
import re
import nltk

In [314]:
# Imports necessary components from nltk for text preprocessing and initializes the lemmatizer.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [315]:
# Downloads the wordnet corpus from nltk data.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [316]:
# Preprocesses the text data by cleaning, tokenizing, and lemmatizing.
corpus =[]
for i in range(0 ,len(msg)):
  review = re.sub('[^a-zA-Z]', ' ',msg['message'][i])
  review = review.lower()
  review = review.split()
  review =[lemmatizer.lemmatize(word) for word in review ]
  review = ' '.join(review)
  corpus.append(review)

In [317]:
# This cell appears to have incomplete or incorrect code for filtering or inspecting the corpus.
#[for i,j,k in zip(list(map(len(corpus)),corpus,msg['message'])  if i<1])

In [318]:
# Imports sentence tokenization and simple preprocessing utilities from gensim and nltk.
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [319]:
# Tokenizes the sentences in the corpus into words for Word2Vec training.
words = []
for sent in corpus:
  sent_token = sent_tokenize(sent)
  for sent in sent_token:
    words.append(simple_preprocess(sent))

In [320]:
# Downloads the punkt tokenizer models from nltk data.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [321]:
# Downloads the punkt_tab tokenizer models from nltk data.
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [322]:
# Trains a Word2Vec model on the tokenized words.
model = gensim.models.Word2Vec(words)

In [323]:
# Displays the vocabulary (indexed words) of the trained Word2Vec model.
model.wv.index_to_key

['to',
 'you',
 'the',
 'it',
 'and',
 'in',
 'is',
 'me',
 'my',
 'for',
 'your',
 'call',
 'of',
 'that',
 'have',
 'on',
 'now',
 'are',
 'can',
 'so',
 'but',
 'not',
 'or',
 'we',
 'do',
 'get',
 'at',
 'ur',
 'will',
 'if',
 'be',
 'with',
 'no',
 'just',
 'this',
 'gt',
 'lt',
 'go',
 'how',
 'up',
 'when',
 'ok',
 'day',
 'what',
 'free',
 'from',
 'all',
 'out',
 'know',
 'll',
 'come',
 'like',
 'good',
 'time',
 'am',
 'then',
 'got',
 'wa',
 'there',
 'he',
 'love',
 'text',
 'only',
 'want',
 'send',
 'one',
 'need',
 'txt',
 'today',
 'by',
 'going',
 'don',
 'stop',
 'home',
 'she',
 'about',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'our',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'they',
 'new',
 'please',
 'later',
 'pls',
 'any',
 'her',
 'ha',
 'co',
 'did',
 'been',
 'msg',
 'min',
 'some',
 'an',
 'night',
 'make',
 'dear',
 'who',
 'here',
 'message',
 'say',
 'well',
 'where',
 're',
 'thing',
 'much',
 'oh',

In [325]:
# Displays the number of documents (sentences) used to train the Word2Vec model.
model.corpus_count

5569

In [326]:
# Displays the shape of the vector representation for the word 'good'.
model.wv['good'].shape

(100,)

In [327]:
# Defines a function to compute the average Word2Vec vector for a document.
def avg_word2vec(doc):
  return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key] , axis =0)

In [328]:
# Installs the tqdm library for displaying progress bars.
!pip install tqdm



In [329]:
# Imports the tqdm function for creating progress bars.
from tqdm import tqdm

In [330]:
# Applies the avg_word2vec function to each list of words in the 'words' corpus to get document vectors.
x = []
for i in tqdm(range(len(words))):
  x.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5569/5569 [00:00<00:00, 5823.87it/s]


In [331]:
# Displays the list of average Word2Vec vectors (x).
x

[array([-0.185, 0.203, 0.0556, 0.0152, 0.0898, -0.506, 0.171, 0.516, -0.25, -0.16, -0.189, -0.381, -0.137, 0.0515, 0.179, -0.152, 0.111, -0.351, -0.0321, -0.492, 0.163, 0.000641, 0.126, -0.188, 0.00312, 0.0197, -0.246, -0.229, -0.286, 0.0391, 0.343, -0.0244, 0.104, -0.272, -0.114, 0.345, 0.0106, -0.147, -0.11, -0.497, 0.0594, -0.287, -0.151, -0.0505, 0.197, -0.0557, -0.0544, -0.0434, 0.236, 0.148, 0.23, -0.225, 0.0244, 0.0354, -0.14, 0.0892, 0.218, 0.135, -0.372, 0.147, 0.0741, 0.0693, -0.0771, -0.125, -0.246, 0.23, 0.0974, 0.244, -0.316, 0.347, -0.22, 0.172, 0.367, -0.0781, 0.358, 0.0635, -0.00932, -0.0879, -0.135, 0.119, -0.233, 0.0112, -0.225, 0.395, -0.0908, -0.0594, 0.108, 0.231, 0.348, 0.00151, 0.364, 0.11, 0.0487, 0.0259, 0.487, 0.16, 0.12, -0.238, 0.166, -0.0732], dtype=float32),
 array([-0.178, 0.182, 0.0469, 0.016, 0.0884, -0.453, 0.141, 0.463, -0.224, -0.136, -0.174, -0.34, -0.118, 0.0516, 0.166, -0.144, 0.0996, -0.321, -0.0388, -0.447, 0.153, 0.00134, 0.107, -0.162, 0.00516

In [332]:
# Displays the number of average Word2Vec vectors (length of x).
len(x)

5569

In [335]:
# Filters the original messages and creates the corresponding 'y' labels, then one-hot encodes and converts to a NumPy array.
# Apply the same filtering logic to corpus
filtered_corpus = [c for c in corpus if len(c) > 0]
y = msg[list(map(lambda x: len(x)>0 , corpus))]
# Converts the 'label' column to one-hot encoded format.
y = pd.get_dummies(y['label'])
# Selects the first column of the one-hot encoded labels and converts it to a NumPy array.
y=y.iloc[:,0].values

In [336]:
# Displays the shape of the 'y' array.
y.shape

(5569,)

In [337]:
# Converts the list of average Word2Vec vectors (x) into a pandas DataFrame.
df_list = []

for i in range(len(x)):
    df_list.append(pd.DataFrame(x[i].reshape(1, -1)))

df = pd.concat(df_list, ignore_index=True)

  df = pd.concat(df_list, ignore_index=True)


In [338]:
# Displays the shape of the DataFrame (df).
df.shape

(5569, 100)

In [339]:
# Adds the 'y' labels as a new column named 'Output' to the DataFrame.
df['Output'] =y

In [340]:
# Displays the first few rows of the DataFrame with the added 'Output' column.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
0,-0.184909,0.202873,0.055642,0.015243,0.089763,-0.505504,0.171071,0.515719,-0.249655,-0.159801,...,0.109909,0.048741,0.025939,0.486846,0.159876,0.119941,-0.237563,0.166148,-0.073242,True
1,-0.178304,0.182094,0.046857,0.016006,0.088373,-0.452711,0.140803,0.463064,-0.223794,-0.136045,...,0.09147,0.035726,0.018341,0.426309,0.140103,0.107305,-0.223628,0.154536,-0.072771,True
2,-0.200763,0.216508,0.063953,0.019753,0.078959,-0.546401,0.162112,0.512683,-0.257767,-0.186746,...,0.110017,0.041665,-0.00062,0.504116,0.146495,0.074295,-0.274232,0.193693,-0.050919,False
3,-0.259462,0.27119,0.068082,0.028447,0.123401,-0.681323,0.223737,0.698076,-0.341161,-0.208092,...,0.142197,0.058123,0.043515,0.645148,0.220608,0.174349,-0.327361,0.225269,-0.107572,True
4,-0.22776,0.222504,0.066594,0.019642,0.111502,-0.574702,0.186598,0.594999,-0.288707,-0.18169,...,0.115488,0.052481,0.038772,0.549742,0.18801,0.14152,-0.286991,0.185321,-0.088098,True


In [341]:
# Removes rows with any NaN values from the DataFrame.
df.dropna(inplace=True)

In [342]:
# Separates the features (x) and labels (y) from the cleaned DataFrame.
x = df.drop('Output', axis=1)
y = df['Output'].values

In [343]:
# Splits the feature data (x) and labels (y) into training and testing sets.
from sklearn.model_selection import train_test_split
x_train ,x_test , y_train , y_test = train_test_split(x,y,test_size=0.30 , random_state=420)

In [344]:
# Displays the first few rows of the training features (x_train).
x_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
141,-0.281167,0.270374,0.078599,0.018371,0.132939,-0.711007,0.226074,0.72445,-0.356009,-0.231963,...,0.524104,0.144398,0.060356,0.029891,0.663185,0.232441,0.162704,-0.346806,0.228201,-0.099332
456,-0.198449,0.198675,0.058497,0.014649,0.096319,-0.499214,0.159408,0.521317,-0.246473,-0.157171,...,0.366687,0.104711,0.038994,0.030008,0.4802,0.169256,0.122121,-0.243307,0.15999,-0.080201
3126,-0.222391,0.217987,0.063677,0.025099,0.117805,-0.580595,0.184703,0.595444,-0.291105,-0.178024,...,0.426564,0.112516,0.041062,0.040935,0.542282,0.19396,0.131593,-0.292393,0.181641,-0.08927
2841,-0.207017,0.214591,0.05767,0.018636,0.103758,-0.536605,0.174908,0.55027,-0.268239,-0.16576,...,0.394064,0.117289,0.048749,0.025343,0.500391,0.178974,0.133584,-0.269429,0.179583,-0.091686
2359,-0.189555,0.202686,0.059381,0.013712,0.079135,-0.517144,0.151864,0.485926,-0.243674,-0.174759,...,0.341624,0.10679,0.041292,0.003047,0.473049,0.14252,0.078017,-0.254003,0.179355,-0.054452


In [345]:
# Displays the training labels (y_train).
y_train

array([ True,  True,  True,  True, False,  True,  True,  True,  True,  True,  True,  True,  True,  True, False,  True,  True,  True, False,  True,  True,  True,  True,  True,  True,  True, False, False,  True,  True, ...,  True,  True,  True,  True,  True,  True, False,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True, False,  True,  True,  True,  True,  True])

In [346]:
# Initializes and trains a RandomForestClassifier model.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier().fit(x_train , y_train)
rfc

In [347]:
# Makes predictions on the testing features using the trained RandomForestClassifier.
y_pred = rfc.predict(x_test)

In [348]:
# Imports functions for evaluating classification model performance.
from sklearn.metrics import accuracy_score , classification_report

In [349]:
# Calculates and prints the accuracy score of the RandomForestClassifier model.
print(accuracy_score(y_test , y_pred))

0.9694244604316546


In [350]:
# Prints the classification report for the RandomForestClassifier model, showing precision, recall, and f1-score.
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

       False       0.92      0.85      0.88       228
        True       0.98      0.99      0.98      1440

    accuracy                           0.97      1668
   macro avg       0.95      0.92      0.93      1668
weighted avg       0.97      0.97      0.97      1668

