In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#NLP
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

#Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split as tts

#Model
from sklearn.svm import SVC
from sklearn.metrics import f1_score

In [None]:
# Load the SMS spam collection CSV into a DataFrame.
# NOTE(review): the encoding is forced to latin-1; presumably the file is not valid UTF-8 — confirm.
df = pd.read_csv('../input/sms-spam-collection-dataset/spam.csv', encoding='latin-1')

In [None]:
# Preview the first 10 rows to see which columns were loaded.
df.head(10)

# Data Cleaning

In [None]:
# Drop the columns at positions 2, 3 and 4; the rest of the notebook only reads
# `v1` and `v2`.
# The positions are taken as a slice of `df.columns` rather than indexed one by
# one: once the columns are gone the slice is simply empty, so re-running this
# cell is a no-op instead of raising IndexError.
df = df.drop(columns=df.columns[2:5])

# Preprocessing

In [None]:
# Replace the labels in `v1` with integer codes, and keep an
# {integer code: original label} lookup so the codes can be read back later.
encoder = LabelEncoder()
df['v1'] = encoder.fit_transform(df['v1'])
class_mappings = dict(enumerate(encoder.classes_))

In [None]:
# Show the integer code -> original label lookup built above.
class_mappings

### Stemming words

In [None]:
def processEmail(contents):
    """Normalise a raw message and return its list of stemmed tokens.

    The text is lower-cased and then rewritten, in this order:
    HTML-like tags become a space, digit runs become ``number``, http/https
    URLs become ``httpaddr``, e-mail addresses become ``emailaddr`` and runs
    of ``$`` become ``dollar``. The result is tokenised with ``word_tokenize``;
    each token has every non-alphanumeric character removed and is then
    Porter-stemmed. Tokens that end up empty are discarded.

    Parameters
    ----------
    contents : str
        Raw message text.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    stemmer = PorterStemmer()

    # Order matters: each pattern runs on the output of the previous one.
    substitutions = (
        (r'<[^<>]+>', ' '),
        (r'[0-9]+', 'number'),
        (r'(http|https)://[^\s]*', 'httpaddr'),
        (r'[^\s]+@[^\s]+', 'emailaddr'),
        (r'[$]+', 'dollar'),
    )

    text = contents.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    stems = []
    for token in word_tokenize(text):
        stem = stemmer.stem(re.sub(r'[^a-zA-Z0-9]', '', token))
        # Pure-punctuation tokens are empty after cleaning; skip them.
        if len(stem) >= 1:
            stems.append(stem)

    return stems

In [None]:
def getVocabulary(emails, vocab_length):
    """Build an index -> stem vocabulary of the most frequent stems in a corpus.

    Every message is run through ``processEmail`` and each resulting stem is
    counted across the whole corpus (a stem repeated inside one message is
    counted every time it occurs).

    Parameters
    ----------
    emails : iterable of str
        Raw message texts. The iterable is only read, never modified.
    vocab_length : int
        Maximum number of stems to keep.

    Returns
    -------
    dict
        Maps rank (0 = most frequent) to stem. Equally frequent stems keep
        the order in which they were first seen. The dict has fewer than
        ``vocab_length`` entries when the corpus has fewer distinct stems.
    """
    counts = {}
    for email in emails:
        # Tokenise into a local value; the caller's sequence is left untouched.
        for word in processEmail(email):
            counts[word] = counts.get(word, 0) + 1

    # sorted() is stable, so ties stay in first-seen (dict insertion) order.
    ranked = sorted(counts, key=counts.get, reverse=True)

    return dict(enumerate(ranked[:vocab_length]))


In [None]:
# Build and display the vocabulary (up to 2500 stems) of the message column.
# NOTE(review): the result is only displayed, not stored; the identical call is
# repeated further down where `vocabulary` is assigned.
getVocabulary(df['v2'].to_list(),2500)

In [None]:
def getKey(dictionary, val):
    """Return the first key in ``dictionary`` whose value equals ``val``.

    Keys are examined in the dictionary's iteration order. Returns ``None``
    when no value matches.
    """
    matching_keys = (key for key, value in dictionary.items() if value == val)
    return next(matching_keys, None)

In [None]:
def getIndices(email, vocabulary):
    """Return the set of vocabulary indices of the words that occur in ``email``.

    Parameters
    ----------
    email : iterable of str
        Tokens of one message.
    vocabulary : dict
        Maps index -> word.

    Returns
    -------
    set
        Indices of every vocabulary word found in ``email``. Words that are
        not in the vocabulary are ignored; repeated words contribute once.
    """
    # Invert the vocabulary once so each token costs a single dict lookup
    # instead of a scan over all values plus a second scan to find the key.
    # setdefault keeps the first index should a word appear under several keys.
    index_of = {}
    for index, word in vocabulary.items():
        index_of.setdefault(word, index)

    return {index_of[word] for word in email if word in index_of}

In [None]:
def getFeatureVector(word_indices,vocab_length):
    """Turn a collection of vocabulary indices into a binary indicator vector.

    Returns a float NumPy array of length ``vocab_length`` holding 1 at every
    position listed in ``word_indices`` and 0 everywhere else.
    """
    vector = np.zeros(vocab_length)
    # Set all requested positions in one indexed assignment.
    positions = np.fromiter(word_indices, dtype=np.intp)
    vector[positions] = 1
    return vector

In [None]:
# Vocabulary of the (up to) 2500 most frequent stems across all messages.
vocabulary = getVocabulary(df['v2'].to_list(), 2500)

# Stemmed token list for every message, in row order.
emails = [processEmail(message) for message in df['v2'].to_list()]


In [None]:
# One binary vector of length 2500 per message, marking which vocabulary words it contains.
X = [getFeatureVector(getIndices(tokens, vocabulary), 2500) for tokens in emails]

In [None]:
# Stack the per-message vectors into one 2-D array, cast it to int16 and wrap it in a DataFrame.
X = pd.DataFrame(data=np.asarray(X).astype(np.int16))

In [None]:
# Display the feature matrix: one row per message, one column per vocabulary index.
X

In [None]:
# Target vector: the `v1` label column, integer-encoded earlier by the LabelEncoder.
Y = df['v1']

# Training

In [None]:
# Use 80% of the rows for training and hold out the rest for evaluation;
# random_state=0 fixes the shuffle so the split is reproducible.
x_train,x_test,y_train,y_test = tts(X,Y,train_size=0.8,random_state=0)

In [None]:
# Fit a support vector classifier, with scikit-learn's default hyperparameters, on the training split.
model = SVC()
model.fit(x_train,y_train)

# Performance

In [None]:
# Mean accuracy of the fitted classifier on the held-out split.
model.score(x_test,y_test)

In [None]:
# Predicted labels for the held-out messages, reused for the F1 score below.
y_pred = model.predict(x_test)

In [None]:
# F1 score on the held-out split, using f1_score's defaults (binary averaging,
# positive label 1); `class_mappings` shows which original label code 1 stands for.
f1_score(y_test,y_pred)