In [None]:
# Interactive upload through the google.colab helper module (presumably only
# importable inside a Colab runtime -- confirm before running elsewhere).
from google.colab import files
# The recorded output of this cell shows spam.csv being saved; a later cell
# reads that file with pd.read_csv.
uploaded = files.upload()


Saving spam.csv to spam.csv


# SMS Spam Detection using Word2Vec and Logistic Regression
This notebook demonstrates how to preprocess SMS messages, use the Google News Word2Vec model to vectorize them, and classify them using Logistic Regression.

In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [None]:
import pandas as pd
import gensim
import numpy as np
import nltk
import gensim.downloader as api
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Download NLTK resources: tokenizer data (punkt / punkt_tab) and the stopword
# lists used by the preprocessing step.
NLTK_RESOURCES = ('punkt', 'stopwords', 'punkt_tab')

# nltk.download reports success through its return value; keep one status per
# resource so the cell result shows every download instead of only the last.
nltk_download_status = {name: nltk.download(name) for name in NLTK_RESOURCES}
nltk_download_status

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Load the Dataset

In [None]:
# Read the uploaded CSV, keep only the label/text columns, and give them
# descriptive names in a single chained expression.
df = (
    pd.read_csv("spam.csv", encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'Label', 'v2': 'Message'})
)
df.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Preprocess Messages

In [None]:
# Stopword sets already loaded, keyed by language, so the corpus is read once
# instead of on every preprocess_text call.
_STOP_WORDS_BY_LANGUAGE = {}


def _stop_words(language='english'):
    """Return the NLTK stopword list for `language` as a cached frozenset."""
    cached = _STOP_WORDS_BY_LANGUAGE.get(language)
    if cached is None:
        cached = frozenset(stopwords.words(language))
        _STOP_WORDS_BY_LANGUAGE[language] = cached
    return cached


def preprocess_text(text):
    """Lowercase and tokenize a message, keeping alphabetic non-stopword tokens.

    Parameters
    ----------
    text : str or None
        Raw message text. Missing values (None, or the float NaN pandas uses
        for empty cells) produce an empty token list instead of raising.

    Returns
    -------
    list of str
        Lowercased tokens that are purely alphabetic and are not English
        stopwords, in their original order.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    stop_words = _stop_words()
    tokens = word_tokenize(text.lower())
    return [word for word in tokens if word.isalpha() and word not in stop_words]

## Load Google News Word2Vec Model

In [None]:
# Pretrained embedding identifier passed to gensim's downloader.
# The first load downloads roughly 1.6GB.
W2V_MODEL_NAME = 'word2vec-google-news-300'
w2v_model = api.load(W2V_MODEL_NAME)



## Vectorize Messages by Averaging Word Embeddings

In [None]:
def vectorize_messages(messages, w2v_model, vector_size=300):
    """Embed each message as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    messages : iterable of str
        Raw messages; each one is tokenized with ``preprocess_text``.
    w2v_model : mapping-like
        Embedding lookup supporting ``word in w2v_model`` and
        ``w2v_model[word]`` (for example gensim ``KeyedVectors``).
    vector_size : int, default 300
        Width of the all-zero fallback rows. If the model exposes its own
        ``vector_size`` attribute that value takes precedence, so fallback
        rows always match the width of the real embeddings.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per message. Messages with no in-vocabulary
        tokens become zero rows; an empty ``messages`` gives shape ``(0, dim)``.
    """
    dim = getattr(w2v_model, 'vector_size', vector_size)
    vectors = []
    for message in messages:
        word_vectors = [
            w2v_model[word]
            for word in preprocess_text(message)
            if word in w2v_model
        ]
        if word_vectors:
            vectors.append(np.mean(word_vectors, axis=0))
        else:
            vectors.append(np.zeros(dim))
    if not vectors:
        # np.array([]) would be 1-D; keep the feature-matrix shape for empty input.
        return np.empty((0, dim))
    return np.array(vectors)

## Prepare Data and Train Logistic Regression Model

In [None]:
# Split settings gathered in one place so they are easy to find and tune.
TEST_SIZE = 0.2
RANDOM_STATE = 42

X = vectorize_messages(df['Message'], w2v_model)

# Series.map yields NaN for any label missing from the mapping; fail loudly
# here instead of passing NaN targets on to the classifier.
labels = df['Label'].map({'ham': 0, 'spam': 1})
if labels.isna().any():
    unexpected = sorted(df.loc[labels.isna(), 'Label'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'Label' column: {unexpected}")
y = labels.values

# stratify=y keeps the ham/spam proportion the same in the train and test
# splits, which matters because the two classes are not balanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy:.4f}")

Accuracy on test set: 0.9417


## Predict New Messages

In [None]:
def predict_message_class(model, w2v_model, message):
    """Classify a single raw message as 'spam' or 'ham'.

    Parameters
    ----------
    model : fitted classifier
        Object with a ``predict`` method accepting a ``(1, dim)`` feature
        matrix and returning labels where 1 means spam.
    w2v_model : mapping-like
        Word-embedding lookup, passed through to ``vectorize_messages``.
    message : str
        Raw message text.

    Returns
    -------
    str
        ``'spam'`` if the predicted label is 1, otherwise ``'ham'``.
    """
    # Take the fallback width from the model when it advertises one, so an
    # all-out-of-vocabulary message still yields a vector of the right size.
    dim = getattr(w2v_model, 'vector_size', 300)
    # Reuse the training-time featurizer so prediction can never drift from
    # how the training matrix was built.
    features = vectorize_messages([message], w2v_model, vector_size=dim)
    prediction = model.predict(features)
    return 'spam' if prediction[0] == 1 else 'ham'

# Try the classifier on one hand-written example message.
sample_message = "Congratulations! You've won a free lottery ticket!"
predict_message_class(clf, w2v_model, sample_message)

'spam'