# Creating a Spam and Not Spam Classifier with PyTorch

Used Kaggle Dataset : https://www.kaggle.com/ozlerhakan/spam-or-not-spam-dataset

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk import PorterStemmer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

## Reading Data

In [2]:
data = pd.read_csv('spam_or_not_spam.csv')
data.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


## Preprocessing Data

#### Changing lables for ease of understanding

In [3]:
data.dropna(inplace=True)
change_labels = lambda x: 1 if x==0 else 0
data['label'] = data['label'].apply(change_labels)
data.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,1
1,martin a posted tassos papadopoulos the greek ...,1
2,man threatens explosion in moscow thursday aug...,1
3,klez the virus that won t die already the most...,1
4,in adding cream to spaghetti carbonara which ...,1


#### Let's Preprocess text data
* We will remove non words, lower it, then Tokenize, Lemmatize and Vectorize and Remove Stopwords from the data

In [4]:
remove_non_alphabets =lambda x: re.sub(r'[^a-zA-Z]',' ',x)

In [5]:
tokenize = lambda x: word_tokenize(x)

In [6]:
ps = PorterStemmer()
stem = lambda w: [ ps.stem(x) for x in w ]

In [7]:
lemmatizer = WordNetLemmatizer()
leammtizer = lambda x: [ lemmatizer.lemmatize(word) for word in x ]

In [8]:
print('Processing : [=', end='')
data['email'] = data['email'].apply(remove_non_alphabets)
print('=', end='')
data['email'] = data['email'].apply(tokenize) # [ word_tokenize(row) for row in data['email']]
print('=', end='')
data['email'] = data['email'].apply(stem)
print('=', end='')
data['email'] = data['email'].apply(leammtizer)
print('=', end='')
data['email'] = data['email'].apply(lambda x: ' '.join(x))
print('] : Completed', end='')
data.head()

Processing : [=====] : Completed

Unnamed: 0,email,label
0,date wed number aug number number number numbe...,1
1,martin a post tasso papadopoulo the greek scul...,1
2,man threaten explos in moscow thursday august ...,1
3,klez the viru that won t die alreadi the most ...,1
4,in ad cream to spaghetti carbonara which ha th...,1


In [10]:
max_words = 10000
cv = CountVectorizer(max_features=max_words, stop_words='english')
sparse_matrix = cv.fit_transform(data['email']).toarray()

In [11]:
sparse_matrix.shape

(2999, 10000)

In [12]:
x_train, x_test, y_train, y_test = train_test_split(sparse_matrix, np.array(data['label']))

In [13]:
class LogisticRegression(nn.Module):
    def __init__(self):
        super(LogisticRegression, self).__init__()
        self.linear1 = nn.Linear(10000, 100)
        self.linear2 = nn.Linear(100, 10)
        self.linear3 = nn.Linear(10, 2)
        
    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)
        return x

In [14]:
model = LogisticRegression()

In [15]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters() , lr=0.01)

In [16]:
x_train = Variable(torch.from_numpy(x_train)).float()
y_train = Variable(torch.from_numpy(y_train)).long()

In [17]:
epochs = 20
model.train()
print('Epoch : {}'.format(1), end='')
for epoch in range(epochs):
    optimizer.zero_grad()
    y_pred = model(x_train)
    loss = criterion(y_pred, y_train)
    pred = torch.max(y_pred, 1)[1].eq(y_train).sum()
    acc = pred * 100.0 / len(x_train)
    print('Epoch: {}, Loss: {}, Accuracy: {}'.format(epoch+2, loss.item(), acc.numpy()))
    loss.backward()
    optimizer.step()


 Epoch: 21, Loss: 0.004796540830284357, Accuracy: 99

# Testing

In [18]:
x_test = Variable(torch.from_numpy(x_test)).float()
y_test = Variable(torch.from_numpy(y_test)).long()

In [19]:
model.eval()
with torch.no_grad():
    y_pred = model(x_test)
    loss = criterion(y_pred, y_test)
    pred = torch.max(y_pred, 1)[1].eq(y_test).sum()
    print ("Accuracy : {}%".format(100*pred/len(x_test)))


Accuracy : 99%
