In [1]:
import numpy as np
import pandas as pd

# NLP
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Modeling
from sklearn.svm import SVC
from sklearn.metrics import f1_score

In [2]:
data = pd.read_csv('spam.csv', encoding='latin-1')

In [3]:
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [4]:
data.drop([data.columns[col] for col in [2, 3, 4]], axis=1, inplace=True)

In [5]:
data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
encoder = LabelEncoder()

data['v1'] = encoder.fit_transform(data['v1'])
class_mappings = {index: label for index, label in enumerate(encoder.classes_)}

In [7]:
data

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [8]:
class_mappings

{0: 'ham', 1: 'spam'}

In [9]:
# Take an email string and convert it to a list of stemmed words
def processEmail(contents):
    ps = PorterStemmer()
    
    contents = contents.lower()
    contents = re.sub(r'<[^<>]+>', ' ', contents)  # if it finds any html <> tag then it'll replace it by spcae ''
    contents = re.sub(r'[0-9]+', 'number', contents) # if it finds any 0-9 tag then it'll replace it by 'number'
    contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', contents)
    contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', contents)
    # if it finds any email @ then it'll replace it by ''spcae 'emailaddr'
    contents = re.sub(r'[$]+', 'dollar', contents)
    # if it finds any $ then it'll replace it by 'dollar'
    
    words = word_tokenize(contents)
    
    for i in range(len(words)):
        words[i] = re.sub(r'[^a-zA-Z0-9]', '', words[i])
        words[i] = ps.stem(words[i])
        
    words = [word for word in words if len(word) >= 1]
    
    return words

In [10]:
# Take a list of emails and get a dictionary of the most common words
def getVocabulary(emails, vocab_length):
    vocabulary = dict()
    
    for i in range(len(emails)):
        emails[i] = processEmail(emails[i])
        for word in emails[i]:
            if word in vocabulary.keys():
                vocabulary[word] += 1
            else:
                vocabulary[word] = 1
                
    vocabulary = sorted(vocabulary.items(), key=lambda x: x[1], reverse=True)
    vocabulary = list(map(lambda x: x[0], vocabulary[0:vocab_length]))
    vocabulary = {index: word for index, word in enumerate(vocabulary)}
    
    return vocabulary

In [11]:
# Get a dictionary key given a value
def getKey(dictionary, val):
    for key, value in dictionary.items():
        if value == val:
            return key


In [12]:
# Get the indices of vocab words used in a given email
def getIndices(email, vocabulary):
    word_indices = set()
    
    for word in email:
        if word in vocabulary.values():
            word_indices.add(getKey(vocabulary, word))
    
    return word_indices

In [13]:
def getFeatureVector(word_indices, vocab_length):
    feature_vec = np.zeros(vocab_length)
    
    for i in word_indices:
        feature_vec[i] = 1
        
    return feature_vec

In [14]:
vocab_length = 2000

In [15]:
vocabulary = getVocabulary(data['v2'].to_list(), vocab_length)

emails = data['v2'].to_list()
emails = list(map(lambda x: processEmail(x), emails))

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\Rony/nltk_data'
    - 'E:\\Users\\Rony\\anaconda3\\nltk_data'
    - 'E:\\Users\\Rony\\anaconda3\\share\\nltk_data'
    - 'E:\\Users\\Rony\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Rony\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [None]:
X = list(map(lambda x: getFeatureVector(getIndices(x, vocabulary), vocab_length), emails))
X = pd.DataFrame(np.array(X).astype(np.int16))

In [None]:
y = data['v1']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [None]:
model = SVC()

model.fit(X_train, y_train)

In [None]:
model.score(X_test, y_test)

In [None]:
y_pred = model.predict(X_test)

In [None]:
f1_score(y_test, y_pred)