In [5]:
pip install gensim



In [None]:
import pandas as pd
import numpy as np
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Pre-trained 100-dimensional GloVe Twitter embeddings fetched through gensim's
# downloader. NOTE: this is a download of several hundred MB on first use, not
# ~25MB. Despite the variable name, these are GloVe vectors, not word2vec.
w2v_model = api.load("glove-twitter-100")

# Read only the two columns that are needed and rename them by name. Compared
# with slicing the full frame and assigning `df.columns` positionally, this
# avoids parsing unused columns, yields a standalone frame (no column-slice
# copy to trigger SettingWithCopyWarning when columns are added later), and
# does not depend on column order.
df = (
    pd.read_csv('spam.csv', encoding='latin-1', usecols=['v1', 'v2'])
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

# Lower-case English function words that `preprocess` removes before the
# messages are embedded. Grouped by word class purely for readability.
stopwords = {
    # personal, possessive and reflexive pronouns
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    # interrogatives and demonstratives
    "what", "which", "who", "whom", "this", "that", "these", "those",
    # auxiliary verbs
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    # articles and conjunctions
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
    # prepositions and direction words
    "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under",
    # adverbs, quantifiers and remaining function words
    "again", "further", "then", "once", "here", "there", "when", "where",
    "why", "how", "all", "any", "both", "each", "few", "more", "most",
    "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
    "than", "too", "very", "can", "will", "just", "don", "should", "now",
}

def preprocess(text):
    """Turn a raw message into a list of lower-case, non-stopword tokens.

    Every character that is not alphabetic (digits, punctuation, whitespace)
    is treated as a separator, so only runs of letters survive as tokens.

    Args:
        text: The message string to tokenise.

    Returns:
        A list of tokens, in their original order, excluding any token found
        in the module-level `stopwords` set.
    """
    lowered = text.lower()
    # Blank out non-letters so that split() breaks on them.
    letters_only = ''.join(ch if ch.isalpha() else ' ' for ch in lowered)

    kept = []
    for token in letters_only.split():
        if token not in stopwords:
            kept.append(token)
    return kept

# Tokenise each message; the result is one list of tokens per row.
df['tokens'] = df['message'].map(preprocess)

def vectorize(tokens, model):
    """Average the embedding vectors of the tokens known to `model`.

    Args:
        tokens: Iterable of token strings.
        model: Embedding lookup supporting `token in model`, `model[token]`
            and a `vector_size` attribute.

    Returns:
        The element-wise mean of the vectors of all in-vocabulary tokens, or
        a zero vector of length `model.vector_size` when no token is known
        (including when `tokens` is empty).
    """
    known = [model[token] for token in tokens if token in model]
    if not known:
        # No in-vocabulary token: fall back to an all-zero embedding.
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# Embed every tokenised message as a single averaged vector.
df['vector'] = df['tokens'].apply(vectorize, model=w2v_model)

# Feature matrix with one row per message; target is 1 for spam, 0 otherwise.
X = np.vstack(df['vector'].tolist())
y = df['label'].eq('spam').astype(int)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): if spam is a minority class, consider `stratify=y` and
# reporting precision/recall alongside accuracy. Confirm the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
test_predictions = model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, test_predictions))

def predict_message_class(model, w2v_model, message):
    """Classify a single raw message as 'spam' or 'ham'.

    The message goes through the same `preprocess` and `vectorize` steps
    used to build the training features.

    Args:
        model: Fitted classifier exposing `predict`.
        w2v_model: Embedding lookup passed through to `vectorize`.
        message: Raw message text.

    Returns:
        'spam' if the classifier predicts 1, otherwise 'ham'.
    """
    # predict() expects a 2-D array, so shape the single embedding as one row.
    features = vectorize(preprocess(message), w2v_model).reshape(1, -1)
    predicted = model.predict(features)[0]
    if predicted == 1:
        return 'spam'
    return 'ham'

# Sanity check: classify one hand-written message with the fitted model.
sample_message = "Free entry in 2 a weekly comp to win FA Cup final tickets. Text to enter now!"
print(predict_message_class(model, w2v_model, sample_message))


Test Accuracy: 0.9479820627802691
spam
