In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip "/content/drive/MyDrive/PROJECT_DATA/SMS Spam Collection/SMS SPAM.zip"

Archive:  /content/drive/MyDrive/PROJECT_DATA/SMS Spam Collection/SMS SPAM.zip
  inflating: spam.csv                


In [None]:
import pandas as pd
import numpy as np

In [None]:
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score

In [None]:
data = pd.read_csv("/content/spam.csv",encoding='latin')

In [None]:
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [None]:
sample = data['v2'][151]

In [None]:
data.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [None]:
encoder = LabelEncoder()

data['v1'] = encoder.fit_transform(data['v1'])
# enumerate() yields (position, class name) pairs, so this dict is keyed by the
# position of each class in encoder.classes_ and maps it to the class name.
v1_mapping = dict(enumerate(encoder.classes_))

In [None]:
# Takes the SMS and converts it to a list of stemmed words
def processSMS(contents):
    """Normalise one SMS message and return its stemmed word tokens.

    The text is lower-cased, then HTML-like tags are removed and URLs,
    digit runs, e-mail addresses and dollar signs are replaced by the
    placeholder words 'httpaddr', 'number', 'emailaddr' and 'dollar'.
    The result is tokenised with ``word_tokenize``; every token is stripped
    of non-alphanumeric characters and stemmed with ``PorterStemmer``.

    Parameters
    ----------
    contents : str
        Raw text of a single message.

    Returns
    -------
    list of str
        Non-empty stemmed tokens, in their original order.
    """
    ps = PorterStemmer()

    contents = contents.lower()
    contents = re.sub(r'<[^<>=]+>','',contents)  # replacing HTML text with a blank
    contents = re.sub(r'(http|https)://[^\s]*','httpaddr',contents) # replacing URLs with 'httpaddr'
    # '+' collapses a whole run of digits into ONE 'number' token; matching a
    # single digit at a time would turn "0800" into "numbernumbernumbernumber".
    contents = re.sub(r'[0-9]+','number',contents) # replace any number with 'number'
    contents = re.sub(r'[^\s]+@[^\s]+','emailaddr',contents) # replace email addresses with 'emailaddr'
    contents = re.sub(r'[$]+','dollar',contents) # replacing $ symbol with 'dollar'

    words = word_tokenize(contents)

    # Removing non-alphanumeric characters; tokens that were pure punctuation
    # become empty strings and are dropped before stemming.
    cleaned = [re.sub(r'[^a-zA-Z0-9]','', word) for word in words]
    stemmed = [ps.stem(word) for word in cleaned if word]

    # adding non empty words to a list
    return [word for word in stemmed if word]

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
processSMS(sample)

['yup',
 'i',
 'thk',
 'cine',
 'is',
 'better',
 'co',
 'no',
 'need',
 'number',
 'go',
 'down',
 'number',
 'plaza',
 'mah']

In [None]:
def getVocabulary(messages,vocab_length):
    """Build an index -> word vocabulary of the most frequent processed words.

    Parameters
    ----------
    messages : iterable of str, or str
        Raw SMS messages. A single string is treated as one message.
    vocab_length : int
        Maximum number of words to keep.

    Returns
    -------
    dict
        Maps 0, 1, 2, ... to words in descending order of frequency. Words
        with equal frequency keep the order in which they were first seen.
    """
    # A bare string is one message, not a collection of messages: iterating it
    # directly would feed single characters to processSMS.
    if isinstance(messages, str):
        messages = [messages]

    # Counting how often each processed word occurs across all messages
    frequencies = dict()
    for message in messages:
        for word in processSMS(message):
            frequencies[word] = frequencies.get(word, 0) + 1

    # Ranking (word, frequency) pairs by frequency, highest first; sorted() is
    # stable, so ties stay in first-seen order.
    ranked = sorted(frequencies.items(),
                    key = lambda pair : pair[1],
                    reverse=True)

    # Keeping only the words (not their counts) of the top vocab_length entries
    top_words = [word for word, _count in ranked[0:vocab_length]]

    # Adding indices and turning it back into a dictionary for easy access later
    return dict(enumerate(top_words))

In [None]:
getVocabulary(sample,10)


{0: 'e',
 1: 'n',
 2: 'o',
 3: 'i',
 4: 't',
 5: 'a',
 6: 'p',
 7: 'h',
 8: 'c',
 9: 's'}

In [None]:
def getKey(dictionary,value):
    """Return the first key of `dictionary` whose value equals `value`.

    Entries are examined in the dictionary's iteration order. Returns None
    when no value matches.
    """
    matching_keys = (key for key, val in dictionary.items() if val == value)
    return next(matching_keys, None)


In [None]:
# Get indices of a given SMS message
def getIndices(message,vocabulary):
    """Return the set of vocabulary indices of the words present in a message.

    Parameters
    ----------
    message : iterable of str
        Words of one message.
    vocabulary : dict
        Maps index -> word. Words must be hashable.

    Returns
    -------
    set
        Indices of the vocabulary words that occur in `message`. Each index
        appears once, however often the word occurs.
    """
    # Invert the vocabulary once so each word is a single dict lookup instead
    # of two linear scans. setdefault keeps the FIRST index for a word that is
    # listed more than once, matching a front-to-back search.
    index_of = {}
    for index, vocab_word in vocabulary.items():
        index_of.setdefault(vocab_word, index)

    # We are storing the unique indices only
    return {index_of[word] for word in message if word in index_of}

In [None]:
def getFeatureVector(word_indices,vocab_length):
    """Return a length-`vocab_length` array with 1 at each given index.

    The array is created with np.zeros, so every position that is not listed
    in `word_indices` stays 0.
    """
    feature_vec = np.zeros(vocab_length)

    positions = list(word_indices)
    if positions:
        # One indexed assignment sets all listed positions at once
        feature_vec[positions] = 1

    return feature_vec

In [None]:
X = data.copy() # X currently has both v1 and v2 as columns
y = data['v1']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8,shuffle=True,random_state=42)


In [None]:
vocab_length = 2000


In [None]:
vocabulary = getVocabulary(X_train['v2'].to_list(),vocab_length)


In [None]:
def preprocess_data(df,vocab,vocab_len):
    """Turn the 'v2' text column of `df` into a table of word-presence features.

    Each message in df['v2'] is processed with processSMS, mapped to vocabulary
    indices with getIndices and expanded with getFeatureVector. The rows are
    returned as a new int16 DataFrame built from a plain array, so it carries
    default row and column labels; `df` itself is not modified.
    """
    # processing every SMS in the 'v2' column into its list of words
    token_lists = [processSMS(text) for text in df['v2'].to_list()]

    # one feature vector per message: which vocabulary words occur in it
    feature_rows = [
        getFeatureVector(getIndices(tokens, vocab), vocab_len)
        for tokens in token_lists
    ]

    # converting the floats to int type
    feature_matrix = np.array(feature_rows).astype(np.int16)

    return pd.DataFrame(feature_matrix)

In [None]:
X_train = preprocess_data(X_train,vocabulary,vocab_length)


In [None]:
X_test = preprocess_data(X_test,vocabulary,vocab_length)


In [None]:
model = SVC(random_state=42)

In [None]:
model.fit(X_train,y_train)

In [None]:
model.score(X_test,y_test)


0.9838565022421525

In [None]:
y_pred = model.predict(X_test)

f1_score(y_test, y_pred)

0.9361702127659575