In [None]:
import pandas as pd
import numpy as np
import gensim.downloader as api
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fetch the NLTK resources used later in the notebook.
# 'punkt' is the tokenizer data older NLTK releases load for word_tokenize;
# newer releases look up 'punkt_tab' instead and raise LookupError without it,
# so both are requested to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')
# English stop-word list consumed by stopwords.words('english').
nltk.download('stopwords')

In [None]:
# Read the raw CSV, keep only the 'v1'/'v2' columns, and give them
# descriptive names ('label' and 'message') in a single chained expression.
df = (
    pd.read_csv("spam.csv", encoding='latin1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)
df.head()

In [None]:
# English stop words as a set for O(1) membership checks in preprocess().
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Tokenize ``text`` and return its cleaned tokens.

    The text is lower-cased and split with NLTK's ``word_tokenize``; a token
    is kept only if it is purely alphabetic and not an English stop word.

    Returns:
        list[str]: the surviving tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Drop punctuation, numbers and mixed tokens such as "u2".
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Tokenize every message once and store the result next to the raw text.
df['tokens'] = df['message'].map(preprocess)
# Preview the raw text beside its tokens.
df.loc[:, ['message', 'tokens']].head()

In [None]:

# Name of the pretrained embedding requested from gensim's downloader API.
W2V_MODEL_NAME = "word2vec-google-news-300"
w2v_model = api.load(W2V_MODEL_NAME)

In [None]:
def get_vector(tokens, model, size=300):
    """Return the mean embedding of the tokens that ``model`` knows.

    Args:
        tokens: iterable of word strings.
        model: mapping-like object supporting ``word in model`` and
            ``model[word]`` (the latter yielding a vector of length ``size``).
        size: dimensionality of the returned vector.

    Returns:
        numpy.ndarray: float64 vector of length ``size``; all zeros when none
        of the tokens is present in ``model``.
    """
    known = [model[token] for token in tokens if token in model]
    total = np.zeros(size)
    if not known:
        # Nothing to average: fall back to the zero vector.
        return total
    for embedding in known:
        total += embedding
    return total / len(known)

# Represent each tokenized message by a single averaged word vector.
df['vector'] = df['tokens'].map(
    lambda message_tokens: get_vector(message_tokens, w2v_model)
)

In [None]:
# Feature matrix: one row per message, stacked from the per-message vectors.
X = np.stack(df['vector'].values)

# Encode labels as integers (ham -> 0, spam -> 1). Series.map turns any label
# missing from the mapping into NaN, so fail here with a clear message rather
# than passing NaN targets on to the classifier.
label_ids = df['label'].map({'ham': 0, 'spam': 1})
if label_ids.isna().any():
    unknown = sorted(set(df.loc[label_ids.isna(), 'label'].astype(str)))
    raise ValueError(f"Unexpected label values: {unknown}")
y = label_ids.values

# stratify=y keeps the ham/spam proportion the same in the train and test
# partitions, so the held-out score is not skewed by an unlucky split.
# (Requires at least two samples of each class.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Fit a logistic-regression classifier on the training vectors
# (fit() returns the estimator itself, so the call can be chained).
clf = LogisticRegression().fit(X_train, y_train)

# Score the held-out split.
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", test_accuracy)

In [None]:
def predict_message_class(model, w2v_model, message):
    """Classify a single raw message as ``'spam'`` or ``'ham'``.

    The message goes through the same ``preprocess`` and ``get_vector`` steps
    as the training data, and the resulting vector is passed to ``model`` as
    a one-row feature matrix.

    Args:
        model: fitted classifier exposing ``predict``.
        w2v_model: word-embedding lookup passed to ``get_vector``.
        message: raw message text.

    Returns:
        str: ``'spam'`` when the predicted label equals 1, otherwise ``'ham'``.
    """
    # reshape(1, -1): predict() expects a 2-D array of shape (n_samples, n_features).
    features = get_vector(preprocess(message), w2v_model).reshape(1, -1)
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return 'spam'
    return 'ham'

In [None]:

# Try the trained classifier on a hand-written example message.
sample_message = "Congratulations! You've won a free iPhone!"
predict_message_class(clf, w2v_model, sample_message)