In [4]:
# for DL modeling
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# for number-crunching
import numpy as np
import scipy.stats as stats

# for dataset management
import os
import pandas as pd

# for timing computations
import time

# for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from IPython import display
display.set_matplotlib_formats('svg')

# for DL processing
import torch.optim as optim
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# for printing out status reports
import sys

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Work from the assignment folder so the CSV can be opened by its bare file name.
# The path uses forward slashes throughout: the previous literal mixed in backslashes
# (...Term4_DL\Assignment\Part 2), and "\A" / "\P" are invalid escape sequences in a
# normal string literal, which newer Python versions warn about. Windows accepts "/"
# as a path separator, so the directory reached is the same.
# NOTE(review): this is a machine-specific absolute path; the notebook will not run on
# another machine without editing it.
os.chdir('C:/Users/athorat/OneDrive - Nice Systems Ltd/00_Amit Thorat Data/ISB - AMPBA/01_Project/Term4_DL/Assignment/Part 2')
spamData = pd.read_csv('SPAM text message 20170820 - Data.csv')
spamData.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Count the whitespace-separated words in every message, then report the largest count
words_per_message = spamData['Message'].str.split().map(len)
max_words = words_per_message.max()
print("Maximum number of words in 'Message' column:", max_words)

Maximum number of words in 'Message' column: 171


In [5]:
# Measure each message in characters, then report the longest one
chars_per_message = spamData['Message'].str.len()
max_length = chars_per_message.max()
print("Maximum length of a sentence in 'Message' column:", max_length)

Maximum length of a sentence in 'Message' column: 910


In [6]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X, y = spamData['Message'], spamData['Category']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Build the tokenizer from the training messages only.
# max_words is handed to the tokenizer as num_words; it is also reused later in the
# notebook as the embedding's vocabulary size.
max_words = 300
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts=X_train)

NameError: name 'Tokenizer' is not defined

In [None]:
# Encode the train and test messages as lists of integer word indices
# using the tokenizer fitted above (train first, then test).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(messages) for messages in (X_train, X_test)
)

In [None]:
# Pad every sequence to one fixed length so the batches can be stacked into tensors.
#
# Two deliberate choices:
# * The length is taken from the longest tokenized training message instead of a fixed
#   1000. The longest message in this dataset has far fewer words than that, so a
#   length of 1000 makes the RNN step through hundreds of padding tokens per sample.
#   Test messages longer than this are truncated by pad_sequences.
# * Zeros are added at the FRONT ('pre'). The model in this notebook classifies from the
#   RNN output at the last time step (out[:, -1, :]); with 'post' padding that step
#   comes after a long run of padding rather than right after the last real word.
max_length = max((len(seq) for seq in X_train_seq), default=1) or 1  # never 0
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='pre')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='pre')

In [None]:
# Encode the target labels as integers: 'spam' -> 1, anything else -> 0.
# isin(['spam', 1]) keeps this cell idempotent: y_train / y_test are overwritten in
# place, so a second run sees the already-encoded 0/1 values. Comparing only against
# 'spam' would then turn every label into 0.
y_train = y_train.isin(['spam', 1]).astype('int64')
y_test = y_test.isin(['spam', 1]).astype('int64')

# Wrap the padded index matrices and the labels as (features, label) tensor pairs
train_data = TensorDataset(torch.from_numpy(X_train_padded).long(), torch.from_numpy(y_train.values).long())
test_data = TensorDataset(torch.from_numpy(X_test_padded).long(), torch.from_numpy(y_test.values).long())


# Create the PyTorch DataLoaders
batch_size = 32
# Training batches are reshuffled every epoch; the final incomplete batch is dropped.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=True)
# The whole test set is served as a single batch. Shuffling is unnecessary for
# evaluation, so it is turned off to keep the prediction order stable.
test_loader = DataLoader(test_data, batch_size=test_data.tensors[0].shape[0], shuffle=False)

In [None]:
# Pick the compute device: CUDA when PyTorch reports it as available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
device

In [None]:
# Define the RNN model
class RNNModel(nn.Module):
    """Embedding -> RNN -> linear classifier that emits log-probabilities.

    Args:
        input_size: Number of rows in the embedding table (one per token index).
        hidden_size: Used both as the embedding dimension and as the RNN hidden size.
        output_size: Number of output classes.
        num_layers: Number of stacked RNN layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()

        # Sub-modules are created in this order: embedding, rnn, fc, softmax.
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.rnn = nn.RNN(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices, batch dimension first.

        Returns:
            Tensor with one row per sample and one column per class.
        """
        embedded = self.embedding(x)
        step_outputs, _ = self.rnn(embedded)
        # Only the RNN output at the final time step is fed to the classifier.
        final_step = step_outputs[:, -1]
        logits = self.fc(final_step)
        return self.softmax(logits)

In [None]:
# Hyperparameters of the baseline model. They are defined in this cell because it is
# the first place they are used; without them a fresh "Restart & Run All" stops here
# with a NameError.
input_size = max_words  # embedding rows = tokenizer vocabulary size (num_words)
hidden_size = 128       # embedding dimension and RNN hidden size
output_size = 2         # two classes: spam or ham
num_layers = 1          # single RNN layer

# Build the model and move it to the GPU if available
model = RNNModel(input_size, hidden_size, output_size, num_layers).to(device)

In [None]:
# Loss: negative log-likelihood, applied to the log-probabilities the model outputs
criterion = nn.NLLLoss()
# Optimizer: Adam over all model parameters with a 1e-3 learning rate
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Training loop
# Note: the batch size is whatever train_loader was built with. No batch_size is
# assigned here, because reassigning it after the loader exists has no effect on the
# batches and only misleads the reader.
num_epochs = 10
num_batches = len(train_loader)
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for inputs, labels in train_loader:
        # Batches come from the loader on the CPU; move them to the model's device
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Average over the number of batches reported by the loader. max(..., 1) avoids a
    # division by zero (and the loop-variable NameError) when the loader is empty.
    avg_loss = total_loss / max(num_batches, 1)
    print(f"Epoch [{epoch + 1}/{num_epochs}] - Loss: {avg_loss:.4f}")

In [None]:
# Evaluation on the testing set
model.eval()  # Set the model to evaluation mode
predictions = []
true_labels = []

with torch.no_grad():  # no gradients are needed for inference
    for inputs, labels in test_loader:
        # The model lives on `device`; feeding it CPU tensors while it sits on the GPU
        # raises a device-mismatch error, so move the inputs first.
        inputs = inputs.to(device)
        outputs = model(inputs)
        # Predicted class = index of the largest log-probability in each row
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy on the testing set: {accuracy * 100:.2f}%")

In [None]:
# Search space for the experiments: every combination of these values is tried
param_grid = dict(
    hidden_size=[64, 128, 256],
    num_layers=[1, 2, 3],
    learning_rate=[0.001, 0.01, 0.1],
    num_epochs=[5, 10, 15],
)

In [None]:
# Model dimensions shared by every candidate:
# vocabulary size for the embedding, and two output classes (spam or ham)
input_size, output_size = max_words, 2

# Rebuild the training DataLoader with a batch size of 64
batch_size = 64
train_features = torch.from_numpy(X_train_padded).long()
train_targets = torch.from_numpy(y_train.values).long()
train_data = TensorDataset(train_features, train_targets)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

In [None]:
def train_and_evaluate_model(model, train_loader, test_loader, num_epochs, learning_rate):
    """Train `model` on `train_loader` and return its accuracy on `test_loader`.

    Mirrors the notebook's stand-alone training and evaluation cells: NLL loss on the
    model's log-probability outputs, Adam optimizer, arg-max prediction.

    Args:
        model: Network to train; batches are moved to the device its parameters are on.
        train_loader: Iterable of (inputs, labels) training batches.
        test_loader: Iterable of (inputs, labels) evaluation batches.
        num_epochs: Number of full passes over `train_loader`.
        learning_rate: Learning rate passed to Adam.

    Returns:
        Fraction of correctly classified evaluation samples as a float in [0, 1];
        0.0 when `test_loader` yields no samples.
    """
    # Use the device the model is already on, so the helper works on CPU and GPU alike
    run_device = next(model.parameters()).device
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    model.train()
    for _ in range(num_epochs):
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(run_device), labels.to(run_device)
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(run_device), labels.to(run_device)
            _, predicted = torch.max(model(inputs), 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    return correct / total if total else 0.0


# Perform grid search for hyperparameters and architecture
# NOTE(review): candidates are compared on test_loader, i.e. the test set is used for
# model selection, so the reported best accuracy is optimistic. A separate validation
# split would be needed for an unbiased estimate.
best_accuracy = 0.0
best_params = {}

for hidden_size in param_grid['hidden_size']:
    for num_layers in param_grid['num_layers']:
        for learning_rate in param_grid['learning_rate']:
            for num_epochs in param_grid['num_epochs']:
                print(f"Testing hyperparameters: hidden_size={hidden_size}, num_layers={num_layers}, "
                      f"learning_rate={learning_rate}, num_epochs={num_epochs}")

                # Create a fresh model on the selected device and train it with the
                # current hyperparameters
                model = RNNModel(input_size, hidden_size, output_size, num_layers).to(device)
                current_accuracy = train_and_evaluate_model(model, train_loader, test_loader, num_epochs, learning_rate)

                # Keep the configuration if it beats the best accuracy seen so far
                if current_accuracy > best_accuracy:
                    best_accuracy = current_accuracy
                    best_params = {
                        'hidden_size': hidden_size,
                        'num_layers': num_layers,
                        'learning_rate': learning_rate,
                        'num_epochs': num_epochs
                    }

print(f"Best accuracy: {best_accuracy * 100:.2f}%")
print("Best hyperparameters:", best_params)