In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_tree = os.walk('/kaggle/input')
for dirname, _, filenames in input_tree:
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
!pip install torch



In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import CountVectorizer
import csv
import time
import re, sys
import pandas as pd

class SpamClassifier(nn.Module):
    """Fully connected network: input_size -> 16 -> 1, with a sigmoid after each layer.

    Args:
        input_size: Number of features in each input vector.
    """

    def __init__(self, input_size):
        super().__init__()
        self.linear1 = nn.Linear(input_size, 16)
        self.linear2 = nn.Linear(16, 1)
        self.activation = nn.Sigmoid()

    def forward(self, x):
        """Returns the sigmoid output of the second layer for input `x`."""
        hidden = self.activation(self.linear1(x))
        return self.activation(self.linear2(hidden))

class SpamDataset(Dataset):
    """Dataset of (count vector, label) pairs built from a two-column CSV file.

    Only rows with exactly two fields, read as ``label, message``, are kept.
    A label equal to the string ``'spam'`` is encoded as 1, anything else as 0.
    Messages are vectorized by a ``CountVectorizer`` fitted on this file; it is
    kept as ``self.vectorizer`` so new messages can be transformed with the
    same vocabulary.

    Args:
        csv_file: Path of the CSV file to read.

    Raises:
        ValueError: If the file contains no row with exactly two fields.
    """

    def __init__(self, csv_file):
        self.data = []
        self.vectorizer = CountVectorizer()
        messages = []
        labels = []
        # newline="" leaves line-ending handling to the csv module, as the csv
        # documentation requires for quoted fields that contain line breaks.
        with open(csv_file, "r", newline="") as f:
            for row in csv.reader(f):
                if len(row) != 2:
                    continue  # not a (label, message) pair
                label, message = row
                messages.append(message)
                labels.append(int(label == 'spam'))  # Convert label to integer
        # NOTE(review): a two-field header row would be ingested as a non-spam
        # message -- confirm the input files have no header.
        if not messages:
            raise ValueError(f"No two-column rows found in {csv_file!r}")
        # Convert messages to vectors. toarray() materialises a dense matrix
        # with one full-vocabulary row per message.
        message_vectors = self.vectorizer.fit_transform(messages).toarray()
        self.data.extend(zip(message_vectors, labels))

    def __len__(self):
        """Returns the number of (vector, label) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Returns the vector and label at `idx` as two float32 tensors."""
        message_vector, label = self.data[idx]
        return torch.tensor(message_vector, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)

class Reprinter:
    """Rewrites a single status line in place on stdout.

    Each call blanks the text printed by the previous call and prints the new
    text on the same line, using carriage returns only.
    """

    def __init__(self):
        self.text = ''  # text printed by the last call; sets how much to blank

    def clear_line(self):
        """Blanks the previously printed text and returns to the line start."""
        # The text is printed without a trailing newline, so the cursor is still
        # on the status line: a cursor-up escape would clobber the line above.
        # expandtabs() counts a tab as the columns it occupies, not as one char.
        width = len(self.text.expandtabs())
        # Final '\r' puts the cursor back at column 0; otherwise the new text
        # would be printed after the padding.
        sys.stdout.write('\r' + ' ' * width + '\r')

    def __call__(self, text):
        """Replaces the current status line with `text`."""
        self.clear_line()
        print(text, end='', flush=True)
        self.text = text
def train(model, train_data, train_loader, optimizer, loss_fn, epochs):
    """Trains `model` for `epochs` passes over `train_loader`, printing progress.

    Args:
        model: Module to train; each batch's output is reshaped to the target's shape.
        train_data: Dataset behind `train_loader`; only its length is used, in
            the progress line.
        train_loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer stepping `model`'s parameters.
        loss_fn: Callable ``loss_fn(output, target)`` returning a scalar loss tensor.
        epochs: Number of passes over `train_loader`.
    """
    # Use the device the model already lives on rather than a module-level global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    reprint = Reprinter()
    start_time = time.time()  # Record the start time of training
    for epoch in range(epochs):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data = data.to(run_device)
            target = target.to(run_device)
            optimizer.zero_grad()
            # Match the target's shape instead of calling squeeze(): squeeze()
            # also drops the batch dimension when a batch holds one sample,
            # and the loss function then rejects the mismatched shapes.
            output = model(data).reshape(target.shape)
            loss = loss_fn(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % 10 == 0:  # refresh the progress line every 10 batches
                reprint("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                    epoch, batch_idx * len(data), len(train_data), 100.0 * batch_idx / len(train_loader), loss.item()
                ))
    end_time = time.time()  # Record the end time of training
    elapsed_time = end_time - start_time  # Calculate the elapsed time
    print(f"\nTraining took approximately {elapsed_time:.2f} seconds")
# Define the predict function
def predict(model, message, vectorizer=None):
    """Returns the model's output for `message`, scaled to a percentage.

    The model is moved to the module-level `device` and switched to eval mode.

    Args:
        model: Module producing a single value for one vectorized message.
        message: Raw text to score.
        vectorizer: Fitted vectorizer whose ``transform`` produces the model's
            input features. Defaults to the module-level
            ``train_dataset.vectorizer``.

    Returns:
        float: The model output multiplied by 100.
    """
    if vectorizer is None:
        vectorizer = train_dataset.vectorizer
    # Convert the input message to a tensor
    features = vectorizer.transform([message]).toarray()
    message_vector = torch.tensor(features, dtype=torch.float32).to(device)
    # Move the model to the appropriate device
    model = model.to(device)
    # Inference only: eval mode, and no autograd graph is recorded.
    model.eval()
    with torch.no_grad():
        output = model(message_vector)
    return output.item() * 100.0
# Use CUDA when PyTorch reports it as available, otherwise the CPU;
# deviceName is the matching label used in log output.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
deviceName = "GPU" if device.type == "cuda" else "CPU"

# @title Training Settings
choice = "Small (469 KB)" # @param ["Small (469 KB)", "Medium (10 MB)"]
model_name = "spam_classifier_small" # @param {type:"string"}
train_now = True # @param {type:"boolean"}

# CSV file behind each selectable dataset size.
DATASET_PATHS = {
    "Small (469 KB)": '/kaggle/input/spam-dataset/spam.csv',
    "Medium (10 MB)": '/kaggle/input/spam-dataset/spam_20.csv',
    # "Large (36 MB)": "spam_large.csv",  # disabled
}

if train_now:
    print("Using", deviceName)
    print("____________________________________________________________")
# Raise the csv module's per-field size limit to sys.maxsize.
csv.field_size_limit(sys.maxsize)
if choice not in DATASET_PATHS:
    # Fail here with a clear message instead of a NameError on train_dataset below.
    raise ValueError(f"Unknown dataset choice {choice!r}; expected one of {list(DATASET_PATHS)}")
train_dataset = SpamDataset(DATASET_PATHS[choice])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# The model's input width is the size of the vocabulary fitted on the dataset.
input_size = len(train_dataset.vectorizer.get_feature_names_out())
print(f"INPUT_SIZE FOR {model_name}: {input_size}")
model = SpamClassifier(input_size)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.BCELoss()
if train_now:
    train(model, train_dataset, train_loader, optimizer, loss_fn, epochs=20)
    # Save the model
    torch.save(model.state_dict(), f"{model_name}.pt")

# @title Inference { form-width: "50%" }
text_input = "SEND ME MONEY" # @param {type:"string"}
accuracy = 3 # @param {type:"slider", min:1, max:13, step:1}
model_select = "spam_classifier_small" # @param ["spam_classifier_medium", "spam_classifier_small"]
# NOTE(review): input_size and the vectorizer used by predict() both come from
# the training cell, so model_select has to name a checkpoint trained on that
# same dataset. The disabled lines below would rebuild the dataset per model.
# Load the model
#if model_select == "spam_classifier_small":
#  train_dataset = SpamDataset("spam.csv")
#elif model_select == "spam_classifier_medium":
#  train_dataset = SpamDataset("spam_20.csv")
model = SpamClassifier(input_size)
# map_location lets a checkpoint saved from a GPU load in a CPU-only session.
model.load_state_dict(torch.load(f"{model_select}.pt", map_location=device))
# Make a prediction
message = text_input
confidence = predict(model, message)
# `accuracy` is the number of decimal places kept in the printed percentage.
confidence = round(confidence, accuracy)
print(f"Confidence rate: {confidence}%")

Using CPU
____________________________________________________________
INPUT_SIZE FOR spam_classifier_small: 8709
Training took approximately 10.08 seconds
Confidence rate: 1.176%
