# Spam classifier using classic ML

In [144]:
import pandas as pd

url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'
data = pd.read_csv(url, sep='\t', header=None, names=["label", "message"])

In [146]:
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [148]:
import unicodedata
import sys
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

punctuation = dict.fromkeys(i for i in range(sys.maxunicode)
                            if unicodedata.category(chr(i)).startswith('P'))

def preprocess_text(text):
    if isinstance(text, str): 
        text = text.strip()  
        text = text.lower()
        text = text.translate(punctuation)  
        words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
        return ' '.join(words)  
    return ''  

data['cleaned_message'] = data['message'].apply(preprocess_text)



data.head()

Unnamed: 0,label,message,cleaned_message
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though


In [150]:
data.describe()

Unnamed: 0,label,message,cleaned_message
count,5572,5572,5572
unique,2,5169,5117
top,ham,"Sorry, I'll call later",sorry ill call later
freq,4825,30,30


In [152]:
data_cleaned = data.drop_duplicates(subset=['cleaned_message'])
data_cleaned.head()

Unnamed: 0,label,message,cleaned_message
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though


In [158]:
data_dcleanned = data_cleaned.drop('message', axis=1)
data_dcleanned.head()

Unnamed: 0,label,cleaned_message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah dont think go usf life around though


In [160]:
data_cleaned.describe()

Unnamed: 0,label,message,cleaned_message
count,5117,5117,5117
unique,2,5117,5117
top,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
freq,4485,1,1


In [162]:
data['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64