# Example: SMS: SPAM or HAM (Kaggle)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("data/kg_train.csv",encoding='latin-1')

# Reduce the training set to speead up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


In [3]:
# Let's divide the training and test set into two partitions
from sklearn.model_selection import train_test_split

data_train, data_val, label_train, label_val = \
    train_test_split(data, data["label"], test_size=0.3, random_state=5)

In [4]:
data_train.head()

Unnamed: 0,text,label
904,I will prepare for you to sign next week. Enjo...,0
132,fvi,0
942,Got it.On Mon May 4 2009 at 4:08 PM H <HDR22@c...,0
784,NAME=3ADR=2E OLUSEGUN SMITHdr=2Eolusegun=2Esmi...,1
681,Fyi,0


## Data Preprocessing

In [5]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each']


In [6]:
data_train.head()

Unnamed: 0,text,label
904,I will prepare for you to sign next week. Enjo...,0
132,fvi,0
942,Got it.On Mon May 4 2009 at 4:08 PM H <HDR22@c...,0
784,NAME=3ADR=2E OLUSEGUN SMITHdr=2Eolusegun=2Esmi...,1
681,Fyi,0


In [7]:
import re
def clean_text(text):
    
    # Remove all the special characters
    processed_feature = re.sub(r'\W', ' ', str(text))

    # remove all single characters
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 

    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)

    # Converting to Lowercase
    processed_feature = processed_feature.lower()

    
    return processed_feature



data_train.loc[:,'preprocessed_text'] = data_train['text'].apply(clean_text)
data_val.loc[:,'preprocessed_text'] = data_val['text'].apply(clean_text)

data_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,text,label,preprocessed_text
904,I will prepare for you to sign next week. Enjo...,0,i will prepare for you to sign next week enjoy...
132,fvi,0,fvi
942,Got it.On Mon May 4 2009 at 4:08 PM H <HDR22@c...,0,got it on mon may 4 2009 at 4 08 pm hdr22 clin...
784,NAME=3ADR=2E OLUSEGUN SMITHdr=2Eolusegun=2Esmi...,1,name 3adr 2e olusegun smithdr 2eolusegun 2esmi...
681,Fyi,0,fyi


## Bag Of Words
Let's get the 10 top words in ham and spam messages

In [8]:
from collections import Counter

data_ham  = data_train[data_train['label'] == 0].copy()
data_spam = data_train[data_train['label'] == 1].copy()

words_data_ham  = data_ham['preprocessed_text']
words_data_spam = data_spam['preprocessed_text']


list_ham_words = []
for sublist in words_data_ham:
    for item in sublist.split():
        list_ham_words.append(item)

list_spam_words = []
for sublist in words_data_spam:
    for item in sublist.split():
        list_spam_words.append(item)
        
c_ham  = Counter(list_ham_words)
c_spam = Counter(list_spam_words)
df_hamwords_top10  = pd.DataFrame(c_ham.most_common(10),  columns=['word', 'count'])
df_spamwords_top10 = pd.DataFrame(c_spam.most_common(10), columns=['word', 'count'])

df_spamwords_top10

Unnamed: 0,word,count
0,the,5026
1,to,3966
2,of,3531
3,and,2881
4,in,2322
5,you,2270
6,this,1890
7,my,1507
8,2e,1491
9,your,1489


## Bag of Words with Count Vectorizer

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
bow_transformer = CountVectorizer().fit(data_train['preprocessed_text'])

In [10]:
print(len(bow_transformer.vocabulary_))
sample_spam = data_train['preprocessed_text'][0]
bow_sample_spam = bow_transformer.transform([sample_spam])

# Lets look at some vectorization example for a spam email
print(sample_spam)
print(bow_sample_spam)

17120
dear sir strictly private business proposal am mike chukwu the manager bills and exchange at the foreign remittance department of the zenith international bank plc am writing this letter to ask for your support and cooperation to carry out this business opportunity in my department we discovered an abandoned sum of 15 000 000 00 fifteen million united states dollars only in an account that belongs to one of our foreign customers who died along with his entire family of wife and two children in november 1997 in plane crash since we heard of his death we have been expecting his next of kin to come over and put claims for his money as the heir because we cannot release the fund from his account unless someone applies for claim as the next of kin to the deceased as indicated in our banking guidelines unfortunately neither their family member nor distant relative has ever appeared to claim the said fund upon this discovery and other officials in my department have agreed to make busin

In [11]:
## let's vectorize all dataset
X_train = bow_transformer.transform(data_train['preprocessed_text'])
X_val  = bow_transformer.transform(data_val['preprocessed_text'])

#Let's print the shape of the vetorized dataset
print(X_train.shape)
print(X_val.shape)


(700, 17120)
(300, 17120)


## Train a Classifier

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


#Learn Classifier
clf = MultinomialNB().fit(X_train, label_train)
#Predict Val data
pred_val = clf.predict(X_val)

accuracy = accuracy_score(label_val, pred_val)
print(accuracy)
confusion_matrix(label_val, pred_val)

0.91


array([[152,  25],
       [  2, 121]])

### TASK - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/3b0207700e7b44f4a96d50b8188c16b4

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to find the **best feature representation**.

You can work with teams of two persons.

**Deadline**: 30/11/2020


In [13]:
data_test = pd.read_csv("data/kg_test.csv",encoding='latin-1')
X_test = bow_transformer.transform(data_test['text'].apply(clean_text))
pred_text = clf.predict(X_test)

submission_file = pd.DataFrame({'Id': data_test.index,'Category':pred_text})
submission_file.to_csv('data/to_submit.csv',index=False)