In [2]:
import pandas as pd

path = '/content/SMSSpamCollection'
data = pd.read_csv(path, sep='\t', header=None, names=['label', 'text'])

data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

def preprocess(text):
  tokens = word_tokenize(text.lower())
  return tokens

data['tokens'] = data['text'].apply(preprocess)

data.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,label,text,tokens
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, ,, crazy, .., avail..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, ..., joking, wif, u, oni, ...]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, so, early, hor, ..., u, c, alrea..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, i, do, n't, think, he, goes, to, usf, ,,..."


In [4]:
import numpy as np

In [5]:
from gensim.models import Word2Vec

word2vec_model = Word2Vec(sentences=data['tokens'], vector_size=100, window=5, min_count=1, workers=4)

def get_word2vec_embeddings(tokens, model):
  embeddings = [model.wv[token] for token in tokens if token in model.wv]
  if embeddings:
    return np.mean(embeddings, axis=0)
  else:
    return np.zeros(model.vector_size)

data['embeddings'] = data['tokens'].apply(lambda tokens: get_word2vec_embeddings(tokens, word2vec_model))

data.head()

Unnamed: 0,label,text,tokens,embeddings
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, ,, crazy, .., avail...","[-0.11553541, 0.31411794, 0.07579137, -0.11887..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, ..., joking, wif, u, oni, ...]","[-0.17795424, 0.4617129, -0.0015213671, -0.204..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[-0.04274656, 0.12051169, 0.17492056, -0.03891..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, so, early, hor, ..., u, c, alrea...","[-0.1995806, 0.5382734, 0.031858258, -0.222187..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, i, do, n't, think, he, goes, to, usf, ,,...","[-0.19123231, 0.584264, 0.03176028, -0.1928249..."


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X = np.array(data['embeddings'].tolist())
y = data['label']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [7]:
from sklearn.naive_bayes import GaussianNB

gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)

In [8]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = gnb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
confusion_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(confusion_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.7309417040358744
Confusion Matrix:
[[688 278]
 [ 22 127]]
Classification Report:
              precision    recall  f1-score   support

         ham       0.97      0.71      0.82       966
        spam       0.31      0.85      0.46       149

    accuracy                           0.73      1115
   macro avg       0.64      0.78      0.64      1115
weighted avg       0.88      0.73      0.77      1115



In [9]:

import joblib

joblib.dump(gnb_model, 'gnb_model.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')
joblib.dump(word2vec_model, 'word2vec_model.pkl')

['word2vec_model.pkl']

In [20]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    return tokens

def get_word2vec_embeddings(tokens, model):
    embeddings = [model.wv[token] for token in tokens if token in model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

def predict_spam_or_ham(text):
    tokens = preprocess_text(text)
    embeddings = get_word2vec_embeddings(tokens, word2vec_model)
    embeddings = embeddings.reshape(1, -1)
    prediction = gnb_model.predict(embeddings)
    label = label_encoder.inverse_transform(prediction)
    return label[0]

In [21]:
input_text = "Free entry in 2 a weekly competition to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate). T&C's apply 08452810075over18's"
print("Prediction:", predict_spam_or_ham(input_text))

Prediction: spam


In [22]:
input_text = "I'll call you later."
print("Prediction:", predict_spam_or_ham(input_text))

Prediction: ham
