In [7]:
import re
import pandas as pd
import numpy as np
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix


### Spam Classification
Deciding whether an email is spam or not.

## Step 1: loading the dataset

In [14]:


# Read the CSV and discard the columns named 'Unnamed: 2' .. 'Unnamed: 4'
# in a single chained expression.
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
)

# Column v1 holds the class label (ham/spam); column v2 holds the message text.
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


## Step 2: removing stopwords and stemming

In [15]:
# Stop words are the most common words of a language; they are filtered out
# before any further processing of the text.
stop = set(stopwords.words('english'))

# A stemming algorithm reduces words like fishing, fished, and fisher to the
# stem fish. The stem need not be a real word: argue, argued, argues, arguing,
# and argus could all be reduced to the stem argu.
stemmer = SnowballStemmer('english')

# Replace every non-letter character with a space, then split each message
# into its whitespace-separated tokens.
df['v2'] = [re.sub('[^a-zA-Z]', ' ', text) for text in df['v2']]
word_list = list(map(str.split, df['v2']))
def normalize(words):
    """Drop stop words from ``words`` and return the stemmed, lower-cased rest.

    A token is discarded when its lower-cased form is in the notebook-level
    ``stop`` set. Every surviving token is passed through the notebook-level
    ``stemmer`` and then lower-cased. Returns a new list, in input order.
    """
    return [
        stemmer.stem(token).lower()
        for token in words
        if token.lower() not in stop
    ]
# Normalise every tokenised message, then glue the surviving tokens back
# into one space-separated string per row.
word_list = list(map(normalize, word_list))
df['v2'] = list(map(" ".join, word_list))


In [16]:
# df['v1'] is the class label; df['v2'] now holds the processed message text
# (non-letters removed, stop words dropped, remaining words stemmed and lower-cased).
df

Unnamed: 0,v1,v2
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri wkli comp win fa cup final tkts st ...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah think goe usf live around though
5,spam,freemsg hey darl week word back like fun still...
6,ham,even brother like speak treat like aid patent
7,ham,per request mell mell oru minnaminungint nurun...
8,spam,winner valu network custom select receivea pri...
9,spam,mobil month u r entitl updat latest colour mob...


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df['v2'],
    df['v1'],
    test_size=0.2,
    random_state=7,
)



## Step 3: transforming emails into numerical vectors (word counts)

In [17]:
# Word-count encoder: the vocabulary is learned from the training emails only.
cv = CountVectorizer()

# x_train_df is an (emails x words) matrix; entry (i, j) is the number of
# times word j appears in the i-th email.
x_train_df = cv.fit_transform(x_train)
# The test emails are encoded with the vocabulary fitted above.
x_test_df = cv.transform(x_test)

print("number of emails=", x_train_df.shape[0])
print("number of words=", x_train_df.shape[1])

# This is a sparse matrix (only the non-zero elements are stored).
x_train_df

number of emails= 4457
number of words= 5595


<4457x5595 sparse matrix of type '<class 'numpy.int64'>'
	with 35900 stored elements in Compressed Sparse Row format>

In [33]:
row_index = 0  # select one email

# Keep the row in sparse form for scikit-learn, and build a plain 2-D ndarray
# for display. `.todense()` returns an np.matrix, which recent scikit-learn
# input validation rejects, so it is not passed to `inverse_transform`.
row_sparse = x_train_df[row_index, :]
row_dense = row_sparse.toarray()  # shape (1, number of words)

print(row_dense.shape)
print("this is the non-sparse matrix=", row_dense)

# Column indices of the words that occur at least once in this email.
# row_dense is 2-D, so np.where returns (row_idx, col_idx); [1] keeps the columns.
ind = np.where(row_dense > 0)[1]
print()
# original words in the email
print(x_train.values[row_index])
print()
# decoded numerical input: the vocabulary terms with a non-zero count
print(cv.inverse_transform(row_sparse))
print()
# index of those words in the dense row
print(ind)
print()
# number of times those words appear in the email
print(x_train_df[row_index, ind].toarray())

(1, 5595)
this is the non-sparse matrix= [[0 0 0 ... 0 0 0]]

wish great day moji told offer alway speechless offer easili go great length behalf stun exam next friday keep touch sorri

[array(['alway', 'behalf', 'day', 'easili', 'exam', 'friday', 'go',
       'great', 'keep', 'length', 'moji', 'next', 'offer', 'sorri',
       'speechless', 'stun', 'told', 'touch', 'wish'], dtype='<U34')]

[ 162  458 1140 1398 1552 1790 1921 1991 2541 2676 3047 3214 3328 4456
 4483 4620 4944 4984 5405]

[[1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1]]


## Step 4: training the classifier and making predictions for the test set

In [43]:

# Multinomial naive Bayes on the training word counts. `fit` returns the
# estimator itself, so construction and fitting are chained.
clf = MultinomialNB().fit(x_train_df, y_train)

# Predicted labels for the held-out emails and for the emails used in fitting.
prediction_test = clf.predict(x_test_df)
prediction_train = clf.predict(x_train_df)

## Step 5: computing accuracy and confusion matrix

In [44]:
# Accuracy on the training set, i.e. on the data the classifier was fitted on.
print("Accuracy:", accuracy_score(y_train, prediction_train), sep="")
print()

Accuracy:0.9923715503702042



We care about the generalisation error, that is the performance on unseen data.

In [90]:

# Accuracy on the held-out test set.
print("Accuracy:", accuracy_score(y_test, prediction_test), sep="")
print()

# Confusion matrix of true test labels (first argument) against predictions.
conf_mat = confusion_matrix(y_test, prediction_test)
print("Confusion Matrix", conf_mat, sep="\n")


Accuracy:0.8275

Confusion Matrix
[[169  29]
 [ 40 162]]


#### Where can we find sparse matrices?
You can manipulate them using `scipy.sparse`.

In [6]:
import scipy.sparse as sc #this is the library

# x_train_df is a scipy sparse matrix, which avoids storing the zeros.
# To access the non-zero elements of one row:
i = 0  # email index
# sc.find returns the row indices, column indices and values of the non-zero
# entries; [1] keeps the column indices.
ind = sc.find(x_train_df[i, :] > 0)[1]
print("indexes of non-zeroes elements=", ind)
# Read the counts from the same row `i` that `ind` was computed for
# (the row was previously hard-coded to 0).
x_train_df[i, ind].todense()


indexes of non-zeroes elements= [ 162  458 1140 1398 1552 1790 1921 1991 2541 2676 3047 3214 3328 4456
 4483 4620 4944 4984 5405]


matrix([[1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1]],
       dtype=int64)

In [32]:
# Same inspection on the test set; `i` is the email index set in the previous cell.
ind = sc.find(x_test_df[i, :] > 0)[1]
print("indexes of non-zeroes elements=", ind)
# Read the counts from row `i` as well, so they match the indices printed above
# (the row was previously hard-coded to 0).
x_test_df[i, ind].todense()


indexes of non-zeroes elements= [2870 3588]


matrix([[1, 1]], dtype=int64)

## Question

We consider the Movie Reviews Corpus, a dataset of movie reviews that are categorized as positive or negative.

In [94]:
import random
import nltk

# Make sure the corpus is available locally before importing its reader.
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews

# Collect one record per review (v1 = category label, v2 = list of tokens) and
# build the DataFrame in a single call. Growing the frame row by row with
# DataFrame.append is quadratic, and that method no longer exists in pandas >= 2.0.
records = [
    {'v1': category, 'v2': list(movie_reviews.words(fileid))}
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
# Passing `columns` keeps the v1/v2 layout even if no records were collected.
df = pd.DataFrame(records, columns=['v1', 'v2'])

# One token list per review, in row order.
word_list = df['v2'].tolist()
def normalize(words):
    """Return the stemmed, lower-cased alphabetic non-stop-word tokens of ``words``.

    A token is dropped when its lower-cased form is in the notebook-level
    ``stop`` set, or when it is not purely alphabetic (which removes
    punctuation). Each remaining token is stemmed with the notebook-level
    ``stemmer`` and then lower-cased. Returns a new list, in input order.

    Note: this redefines the ``normalize`` used in the spam example and
    differs from it only in the extra alphabetic filter.
    """
    return [
        stemmer.stem(token).lower()
        for token in words
        if token.lower() not in stop and token.isalpha()
    ]
# Normalise every tokenised review, then join the surviving tokens back into
# one space-separated string per row.
word_list = list(map(normalize, word_list))
df['v2'] = list(map(" ".join, word_list))


[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/benavoli/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [95]:
# Preview the preprocessed review text (one space-joined string of stems per review).
df['v2']

0       plot two teen coupl go church parti drink driv...
1       happi bastard quick movi review damn bug got h...
2       movi like make jade movi viewer thank invent t...
3       quest camelot warner bros first featur length ...
4       synopsi mental unstabl man undergo psychothera...
5       capsul planet mar polic take custodi accus mur...
6       ask eight millimet realli wholesom surveil man...
7       exact long movi felt even nine laugh nine mont...
8       call road trip walk wound stellan skarsg rd pl...
9       plot young french boy see parent kill eye tim ...
10      best rememb underst perform dr hannib lecter m...
11      janean garofalo romant comedi good idea coupl ...
12      high fli hong kong style filmmak made way clas...
13      movi like mortal kombat annihil work must revi...
14      femm la femm nikita baldwin backdraft sliver f...
15      john carpent make b movi alway halloween escap...
16      realli start wonder alicia silverston sure one...
17      get mi

Using the same steps as in the spam filter example, apply MultinomialNB to this dataset.