In [1]:
#Training dataset: https://www.kaggle.com/datasets/abdallahwagih/spam-emails

import numpy as np
import pandas as pd
import re
import nltk
import google.colab.drive as drive
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [2]:
# Mount Google Drive at /content/drive; a later cell loads the word2vec model
# from a path under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install gensim



In [4]:
from gensim.models import KeyedVectors

# Location of the saved word-vector model on the mounted Drive
# (presumably the Google News word2vec vectors, judging by the file name).
WORD2VEC_PATH = '/content/drive/MyDrive/word2vec-google-news.model'
word2vec = KeyedVectors.load(WORD2VEC_PATH)

In [5]:
# Download NLTK's stop-word corpus, then keep the English list as a set so
# `_vectorizer` can do fast membership checks.
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = {word for word in stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
# Load the labelled messages (spam.csv is read from the current working
# directory) and preview the first five rows.
data = pd.read_csv('spam.csv')
data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
def _vectorizer(sentence):
  """Embed a sentence as the mean word2vec vector of its usable tokens.

  The sentence is stripped of every character that is not an ASCII letter,
  lowercased and split on whitespace. Tokens that are missing from the
  `word2vec` vocabulary or that appear in `stop_words` are discarded.

  Args:
    sentence: Text to embed (must be a `str`; it is passed to `re.sub`).

  Returns:
    A 1-D array of length `word2vec.vector_size`: the element-wise mean of
    the remaining token vectors, or all zeros when no token survives.
  """
  # Non-letters become spaces so punctuation and digits act as separators.
  tokens = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
  embeddings = [
      word2vec[tok]
      for tok in tokens
      if tok in word2vec and tok not in stop_words
  ]
  if not embeddings:
    # Nothing usable in the sentence: fall back to a zero vector.
    return np.zeros(word2vec.vector_size)
  return np.mean(embeddings, axis=0)

In [8]:
# Inputs are the raw message strings; targets are the category labels
# encoded by `le` (kept around so predictions can be mapped back to labels).
X = data['Message']
le = LabelEncoder().fit(data['Category'])
y = le.transform(data['Category'])

# One embedding row per message.
X_vec = np.array(list(map(_vectorizer, X)))

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_vec,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Candidate classifiers, all trained on the same averaged word2vec features.
models = {
    'naive_bayes': GaussianNB(),
    # Seeded so the forest, and therefore the comparison below, gives the same
    # result on every Restart & Run All (same seed as the train/test split).
    'RandomForest': RandomForestClassifier(n_estimators=500, random_state=42),
    'SVC': SVC(kernel='rbf'),
    'xgb': XGBClassifier()
}

# Best-so-far trackers; they stay None unless some model scores above 0.
max_acc = 0
sel_model = None
sel_pred = None

# Fit every candidate and keep the one with the highest held-out accuracy.
# On a tie the earlier entry wins because the comparison is strict.
# NOTE(review): the winner is chosen on the same test split that is reported,
# so `max_acc` is an optimistic estimate; a separate validation split or
# cross-validation would give an unbiased number.
for name, model in models.items():
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  ac = accuracy_score(y_test, y_pred)
  print(f'model: {name} acc: {ac}\n')
  if ac > max_acc:
    max_acc = ac
    sel_model = model
    sel_pred = y_pred

# After the loop `model` is merely the last candidate iterated; use
# `sel_model` for anything that should rely on the best classifier.
print(f'selected model: {sel_model} with acc: {max_acc}\n')

model: naive_bayes acc: 0.8717488789237668

model: RandomForest acc: 0.9713004484304932

model: SVC acc: 0.9883408071748879

model: xgb acc: 0.9775784753363229

selected model: SVC() with acc: 0.9883408071748879



In [11]:
# Hand-written sample e-mail used as a smoke test of the trained classifier.
text = 'Subject: Confirm Your Email Now — Dear User, we\'ve noticed unusual activity on your account; to continue uninterrupted access, please verify your identity by clicking the link below within 24 hours: https://secure.company.com/verify — Thank you, IT Support, support@company.com'

# Embed with the same vectorizer used for training; the outer list makes it a
# single-row 2-D array, as `predict` expects.
X_in_vec = np.array([_vectorizer(text)])
# Predict with the selected best model. The bare name `model` is only the last
# loop variable of the comparison cell (the XGBoost classifier), not the winner.
y_p = sel_model.predict(X_in_vec)
# Map the encoded prediction back to its original category label.
print(le.inverse_transform(y_p))

['ham']
