In [17]:
import pandas as pd
import numpy as np

In [18]:
# Load the first two CSV columns as `label` and `message`. With header=0 and
# explicit `names`, the file's own header row is replaced by these names.
csv_file_path = 'spam.csv'
sms_data = pd.read_csv(
    csv_file_path,
    encoding='ISO-8859-1',
    header=0,
    usecols=[0, 1],
    names=['label', 'message'],
)

In [20]:
import string
import re

In [21]:
def preprocess_text(text):
    """Normalize a message before vectorization.

    Lower-cases the text, then removes every ASCII punctuation character
    (``string.punctuation``) and every run of digits.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The cleaned string. Whitespace is not collapsed, so removing a token
        can leave consecutive or trailing spaces behind.
    """
    text = text.lower()
    # The punctuation set must be escaped before it goes into a character
    # class: interpolated raw, its `\]` pair is parsed as an escaped bracket
    # and backslashes are never removed.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    # Raw string so `\d` reaches the regex engine instead of being treated as
    # an (invalid) string escape sequence.
    text = re.sub(r'\d+', '', text)
    return text

In [22]:
# Clean every message element-wise with preprocess_text, overwriting the column.
sms_data['message'] = sms_data['message'].map(preprocess_text)

In [23]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
# Any value outside this mapping becomes NaN, so this cell is not safe to re-run
# once the column has already been encoded.
sms_data['label'] = sms_data['label'].map(dict(ham=0, spam=1))

In [24]:
# Sanity check: show the first rows after cleaning and label encoding.
print(sms_data.head())

   label                                            message
0      0  go until jurong point crazy available only in ...
1      0                            ok lar joking wif u oni
2      1  free entry in  a wkly comp to win fa cup final...
3      0        u dun say so early hor u c already then say
4      0  nah i dont think he goes to usf he lives aroun...


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [27]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    sms_data['message'],
    sms_data['label'],
    test_size=0.2,
    random_state=42,
)

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
# TF-IDF features with English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english')
# Fit the vectorizer on the training messages only, then transform them.
X_train_v = vectorizer.fit_transform(X_train)
# The test messages are only transformed with the already-fitted vectorizer.
X_test_v = vectorizer.transform(X_test)

In [30]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for the logistic regression search: five values of C,
# both l1 and l2 penalties, and the liblinear solver for every combination.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

In [31]:
# Base estimator for the grid search; its hyper-parameters come from param_grid.
lr = LogisticRegression()

In [32]:
# 5-fold cross-validated search over param_grid, selecting on F1 score.
grid_search = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
)
grid_search.fit(X_train_v, y_train)



In [40]:
# grid_search was created with the default refit=True, so it has already
# refitted the winning configuration on the full training set. Reuse that
# estimator instead of training a second model with the same parameters.
best_params = grid_search.best_params_
model = grid_search.best_estimator_
model

In [41]:
# Predict labels for the held-out test features.
y_pred = model.predict(X_test_v)

In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [43]:
# Score the test-set predictions with each metric, in the order
# accuracy, precision, recall, F1.
acc, p, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [44]:
# Report the logistic-regression test metrics. The accuracy and precision
# values are stored in `acc` and `p`.
print(f'Accuracy: {acc}, Precision: {p}, Recall: {recall}, F1 Score: {f1}')

Accuracy: 0.9748878923766816, Precision: 0.9552238805970149, Recall: 0.8533333333333334, F1 Score: 0.9014084507042254


In [45]:

def predict_message(text):
    """Classify a single raw message as ``'spam'`` or ``'ham'``.

    The text is cleaned with ``preprocess_text``, vectorized with the
    notebook-level ``vectorizer`` and scored by the notebook-level ``model``;
    both globals are looked up when the function is called.

    Args:
        text: The raw message string.

    Returns:
        ``'spam'`` when the predicted label is 1, otherwise ``'ham'``.
    """
    cleaned = preprocess_text(text)
    features = vectorizer.transform([cleaned])
    # predict() yields one label per input row; take the single label
    # explicitly instead of relying on the truth value of an array comparison.
    label = model.predict(features)[0]
    return 'spam' if label == 1 else 'ham'

# Smoke test: classify one hand-written example message and print the label.
mess = "Congratulations! You've won a free ticket to Bahamas. Text WIN to 12345."
print(predict_message(mess))

spam


In [46]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import Precision, Recall




In [47]:
# Clean the messages and integer-encode the labels for the neural network.
# NOTE(review): earlier cells already apply preprocess_text to `message` and map
# `label` to 0/1, so on a top-to-bottom run this looks like a no-op — confirm
# whether it is still needed.
sms_data['message'] = sms_data['message'].map(preprocess_text)
label_encoder = LabelEncoder()
sms_data['label'] = label_encoder.fit_transform(sms_data['label'])

In [48]:
# Same 80/20 split and seed as the logistic-regression section; this rebinds
# X_train, X_test, y_train and y_test.
X_train, X_test, y_train, y_test = train_test_split(
    sms_data['message'],
    sms_data['label'],
    test_size=0.2,
    random_state=42,
)

In [49]:
# Rebuild the TF-IDF vectorizer for the neural network. This rebinds the
# notebook-level `vectorizer` that predict_message() reads.
vectorizer = TfidfVectorizer(stop_words='english')
# .toarray() converts the vectorizer output to a dense array before it is fed to Keras.
X_train_v = vectorizer.fit_transform(X_train).toarray()
# The test messages are only transformed with the vectorizer fitted on the training set.
X_test_v = vectorizer.transform(X_test).toarray()

In [51]:
# Feed-forward binary classifier over the dense TF-IDF features:
# two ReLU hidden layers (128 and 64 units) and a single sigmoid output unit.
model2 = Sequential()
# The input width is the number of TF-IDF columns in the training matrix.
model2.add(Dense(128, input_dim=X_train_v.shape[1], activation='relu'))
model2.add(Dense(64, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))

In [52]:
# Train with Adam (learning rate 0.001) on binary cross-entropy, tracking
# precision and recall. evaluate() reports the metrics in this order after the loss.
model2.compile(optimizer=Adam(learning_rate=0.001), 
              loss=BinaryCrossentropy(), 
              metrics=[Precision(), Recall()])

In [54]:
# Train for 10 epochs in batches of 32, holding out 20% of the training data
# for validation. No random seed is set in this notebook, so results can vary between runs.
model2.fit(X_train_v, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x271aef37d90>

In [56]:
# evaluate() returns the loss followed by the compiled metrics, which were
# passed to compile() as Precision then Recall.
loss, precision, recall = model2.evaluate(X_test_v, y_test)
# F1 is the harmonic mean of precision and recall. Report 0.0 instead of
# raising ZeroDivisionError when both are zero.
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0



In [57]:
# Report the neural network's test loss, precision, recall and derived F1 score.
print(f'Loss: {loss}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}')

Loss: 0.11329773813486099, Precision: 0.9558823704719543, Recall: 0.8666666746139526, F1 Score: 0.9090909213913444
