In [1]:
from src import util as util
import numpy as np
import matplotlib.pyplot as plt
import collections
import csv

In [2]:
def load_spam_dataset_csv(csv_path):

    messages = []
    labels = []

    with open(csv_path, 'r', newline='', encoding='utf8') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')

        for message, label in reader:
            messages.append(message)
            labels.append(1 if label == '1' else 0)

    return messages, np.array(labels)

In [3]:
train_messages, train_labels = util.load_spam_dataset('data/train.tsv')
test_messages, test_labels = util.load_spam_dataset('data/test.tsv')
train2_messages, train2_labels = load_spam_dataset_csv('data/emails_new_train.csv')
val_messages, val_labels = util.load_spam_dataset('data/val.tsv')

In [4]:
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# More robust tokenization with error handling
def safe_tokenize(text):
    try:
        return word_tokenize(str(text).lower())
    except:
        return str(text).lower().split()  # Fallback to simple whitespace splitting

tokenized_messages = [safe_tokenize(msg) for msg in train_messages]

In [6]:
# Train Word2Vec model with optimized parameters
print("Training Word2Vec model...")
w2v_model = Word2Vec(
    sentences=tokenized_messages,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    epochs=20,
    sg=1,  # Use skip-gram (better for small datasets)
    hs=0,  # Use negative sampling
    negative=5  # Number of negative samples
)

Training Word2Vec model...


In [7]:
def message_to_vector(message, model, vector_size=100):
    words = safe_tokenize(message)
    word_vectors = []
    for word in words:
        try:
            if word in model.wv:
                word_vectors.append(model.wv[word])
        except AttributeError:
            # Handle case where model isn't properly trained
            pass
    
    if len(word_vectors) > 0:
        return np.mean(word_vectors, axis=0)
    return np.zeros(vector_size)

In [8]:
len(train_messages)

4457

In [9]:
message_to_vector("hello",w2v_model)

array([ 0.06405374,  0.10247273, -0.08413229, -0.01936729, -0.12552986,
       -0.36451343,  0.01740931,  0.7463869 , -0.23184416,  0.04113259,
       -0.06659532, -0.10470779, -0.09590095,  0.24004175,  0.12144645,
        0.04116317,  0.02882743, -0.10803799, -0.28054962, -0.527036  ,
       -0.12217733,  0.01034862, -0.2163644 , -0.1567854 , -0.05089254,
        0.12351648, -0.09881826,  0.01471897, -0.05641815,  0.16753095,
        0.16980436, -0.02056365,  0.14284757,  0.05108713, -0.0401189 ,
        0.16103244, -0.2935561 ,  0.04404325,  0.07926708, -0.02141366,
        0.20403479, -0.08230109, -0.14010862,  0.293341  ,  0.3314357 ,
       -0.21688081, -0.3150358 ,  0.19506751,  0.06765264,  0.05199387,
        0.4192219 ,  0.13766335,  0.09758272, -0.00262893, -0.11987112,
       -0.06391813,  0.31355217, -0.19861668, -0.13932532, -0.11765104,
        0.01603647, -0.04146134,  0.14646047, -0.2999676 ,  0.01217924,
        0.3836628 ,  0.29186645,  0.21757127, -0.6328982 ,  0.03

In [10]:
Xhh_train = np.array([message_to_vector(msg, w2v_model) for msg in train_messages])

already prtrained

In [11]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
from sklearn.decomposition import PCA
import gensim.downloader as api
from gensim.models import KeyedVectors

In [12]:
# Load pre-trained embeddings (choose one)
embeddings = api.load("glove-twitter-100")  # Best for SMS-style text
# embeddings = api.load("word2vec-google-news-300")  # Larger vocabulary

In [13]:
def message_to_vector(messages, embedding_model):
    vectors = []
    for msg in messages:
        words = msg.lower().split()  # Simple tokenizer
        word_vecs = [embedding_model[word] for word in words if word in embedding_model]
        vectors.append(np.mean(word_vecs, axis=0) if word_vecs else np.zeros(100))
    return np.array(vectors)

In [14]:
Train_matrix3 = message_to_vector(train_messages, embeddings)
Test_matrix3 = message_to_vector(test_messages, embeddings)

In [15]:
def getwords(message):
    return message.lower().split()

In [16]:
def h(theta,x):
    return 1 / (1 + np.exp(-np.dot(x, theta)))

In [17]:
def Create_dictionary(messages):
    words = [word for message in messages for word in getwords(message)]
    words_count = collections.Counter(words)
    #word_dic = {(word, count) for word,count in words_count.items() if count>=5}
    freq_word = [word for word,count in words_count.items() if count>=10 and word !='subject:']
    return {word: count for count, word in enumerate(freq_word)}

In [18]:
dicWords = Create_dictionary(train_messages)

In [19]:
def Transform_text(messages, word_dictionary):
    m = len(messages)
    n = len(word_dictionary)
    matrix = np.zeros((m,n), dtype=int)

    word_counts = [collections.Counter(getwords(message)) for message in messages]
    for i in range(m):
        for word, count in word_counts[i].items():
            if word in word_dictionary:
                matrix[i][word_dictionary[word]] += count

    return matrix


In [20]:
train_matrix = Transform_text(train_messages,dicWords)

In [21]:
train_matrix.shape

(4457, 877)

In [22]:
Train_matrix3.shape

(4457, 100)

In [23]:
X_with_bias = np.c_[np.ones(Train_matrix3.shape[0]),Train_matrix3]

In [24]:
class LogisticRegressionWithGD:
    def __init__(self,alpha=0.1,iteration=1000):
        self.alpha = alpha
        self.iteration=iteration
        self.theta=None
    def h(self,theta, x):
        return 1 / (1 + np.exp(-np.dot(x, theta)))
    def fit(self,x,y):
        lambda_ = 1
        n,m = x.shape
        X_with_bias = np.c_[np.ones(n),x]
        self.theta = np.zeros(m+1)
        #Gradient
        for i in range(self.iteration):
            #With regularization
            linearmodel = (np.dot(X_with_bias.T, (self.h(self.theta, X_with_bias)-y))/m)
            L1_reg = np.sign(self.theta)*lambda_

            L2_reg = lambda_*self.theta
            linearmodel += L2_reg
            self.theta = self.theta - self.alpha*linearmodel
        
        
    def predict(self,x):
        X_with_bias = np.c_[np.ones(x.shape[0]),x]
        y_predicted = 1/(1+np.exp(-X_with_bias.dot(self.theta)))
        y_result = [1 if i>0.5 else 0 for i in y_predicted]
        return y_result

In [25]:
LR = LogisticRegressionWithGD(alpha=0.01,iteration=1000)
LR.fit(Train_matrix3,train_labels)
LR_predictions = LR.predict(Test_matrix3)

In [26]:
LR_accuracy = np.mean(LR_predictions == test_labels)

In [27]:
print('SimpLe Logistic Regression had an accuracy of {} on the testing set'.format(LR_accuracy))

SimpLe Logistic Regression had an accuracy of 0.9086021505376344 on the testing set


In [44]:
simpletesting=[
'buy on site this product now',
    'Get new phone free Click this URL',
"Don't forget to Subscribe our page in www.hh.com",
"Urgent your Account has been compromised. Verify your details now",
    "Making $5000 a week working from home. No Skills required",
"Your Package delivery failed. Click here to reschedule",
    "Act now this offer expires in 24 hours",
    "Get rich quick with this one simple trick",
    "Warning: your computer is as risk. Download this antivirus now "
]

In [20]:
testing1_matrix = Transform_text(simpletesting,dicWords)

In [21]:
testing1_matrix.shape

(9, 877)

use Large dataset

In [22]:
vocabulary = Create_dictionary(train2_messages)

In [23]:
train2_labels

array([0, 1, 1, ..., 0, 0, 1])

In [24]:
test2_matrix = Transform_text(test_messages,vocabulary)

In [25]:
train2_matrix = Transform_text(train2_messages,vocabulary)

In [28]:
Train_matrix32 = message_to_vector(train2_messages, embeddings)

In [29]:
LR2 = LogisticRegressionWithGD(alpha=0.01,iteration=1000)
LR2.fit(Train_matrix32,train2_labels)

In [32]:
LR_predictions2 = LR2.predict(Test_matrix3)
LR_accuracy2 = np.mean(LR_predictions2 == test_labels)

In [33]:
print('SimpLe Logistic Regression with large dataset had an accuracy of {} on the testing set'.format(LR_accuracy2))

SimpLe Logistic Regression with large dataset had an accuracy of 0.8727598566308243 on the testing set
