## Question 3

### Sub-question 2

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import tensorflow as tf
import keras
import warnings
warnings.filterwarnings('ignore')
import torch
from torch import nn

In [2]:
data = pd.read_csv('C:/Users/subnarasimhan/Desktop/SPAM.csv')

In [3]:
#splitting the data into labels and messages
messages = data['Message'].values
labels = data['Category'].apply(lambda x: 1 if x == 'spam' else 0).values

In [4]:
#tokenizing the messages
tokenizer = Tokenizer()
tokenizer.fit_on_texts(messages)
sequences = tokenizer.texts_to_sequences(messages)

In [5]:
#padding the sequences to match the length
max_length = max(len(sequences) for sequence in sequences)
sequences = pad_sequences(sequences, maxlen=max_length)

In [6]:
#splitting the data into train and test
messages_train, messages_test, labels_train, labels_test = train_test_split(sequences,labels,test_size=0.2, random_state=42)

### Sub-question 3

In [7]:
#building the model
model = Sequential([Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim = 32, input_length=max_length), LSTM(32), Dense(1,activation='sigmoid')])

In [8]:
#compiling the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

### Sub-question 4

In [9]:
#training the model
model.fit(messages_train, labels_train, epochs=1, validation_data=(messages_test, labels_test))

[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m281s[0m 2s/step - accuracy: 0.8657 - loss: 0.4031 - val_accuracy: 0.9749 - val_loss: 0.1030


<keras.src.callbacks.history.History at 0x25a9f2b9090>

In [10]:
#evaluation of the model
loss,accuracy = model.evaluate(messages_test,labels_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 291ms/step - accuracy: 0.9757 - loss: 0.0923
Test Accuracy: 97.49%


### Sub-question 5

In [17]:
#trying out for different learning rates
learning_rates = [0.001, 0.01, 0.1]
for lr in learning_rates:
    model = Sequential([Embedding(input_dim = len(tokenizer.word_index) + 1, output_dim=32, input_length=max_length), LSTM(32), Dense(1, activation='sigmoid')])
    
    #compiling the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    #training the model
    model.fit(messages_train, labels_train, epochs=1, validation_data=(messages_test, labels_test))
    
    #evaluation of the model
    loss,accuracy = model.evaluate(messages_test,labels_test)
    print(f'Learning rate: {lr}, Test Accuracy: {accuracy*100:.2f}%')

[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 2s/step - accuracy: 0.8553 - loss: 0.3701 - val_accuracy: 0.9776 - val_loss: 0.0836
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 294ms/step - accuracy: 0.9795 - loss: 0.0750
Learning rate: 0.001, Test Accuracy: 97.76%
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m272s[0m 2s/step - accuracy: 0.8715 - loss: 0.3854 - val_accuracy: 0.9812 - val_loss: 0.0813
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 294ms/step - accuracy: 0.9839 - loss: 0.0734
Learning rate: 0.01, Test Accuracy: 98.12%
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m282s[0m 2s/step - accuracy: 0.8819 - loss: 0.3735 - val_accuracy: 0.9776 - val_loss: 0.0885
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 290ms/step - accuracy: 0.9766 - loss: 0.0791
Learning rate: 0.1, Test Accuracy: 97.76%
