In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import warnings
warnings.filterwarnings('ignore')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import defaultdict
from wordcloud import WordCloud
from sklearn.metrics import confusion_matrix,f1_score


In [2]:
#data = pd.read_csv(r'C:\Users\methu\OneDrive\Desktop\smp.csv',encoding='ISO 8859-1',header=None)
data = pd.read_csv(r'C:\Users\methu\OneDrive\Desktop\smp.csv', encoding = 'latin1',header=None)
data.rename(columns={1:'text_message', 0:'v1'}, inplace=True)

In [3]:
data.head()

Unnamed: 0,v1,text_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data1 = data[['v1','text_message']]
data1.dropna(inplace=True)
data1['v1'].value_counts()

ham     4827
spam     747
Name: v1, dtype: int64

In [5]:
# remove whitespaces
data1['text_message']=data1['text_message'].str.strip()


In [6]:
# lowercase the text
data1['text_message'] = data1['text_message'].str.lower()

In [7]:
#remove punctuation
punc = string.punctuation
table = str.maketrans('','',punc)
data1['text_message']=data1['text_message'].apply(lambda x: x.translate(table))


In [8]:
# tokenizing each message
data1['word_tokens']=data1.apply(lambda x: x['text_message'].split(' '),axis=1)


In [9]:
# removing stopwords
data1['cleaned_text'] = data1.apply(lambda x: [word for word in x['word_tokens'] if word not in stopwords.words('english')]
                                    ,axis=1)

In [13]:
# stemming
ps = PorterStemmer()
data1['stemmed']= data1.apply(lambda x: [ps.stem(word) for word in x['cleaned_text']],axis=1)


In [14]:
# remove single letter words
data1['final_text'] = data1.apply(lambda x: ' '.join([word for word in x['stemmed'] if len(word)>1]),axis=1)


In [15]:
# label encoding ham=0 and spam=1
data1.loc[data1['v1']=='ham','v1']=0
data1.loc[data1['v1']=='spam','v1']=1

In [16]:
# divide the set in training and test
from sklearn.model_selection import train_test_split
X,X_test,y,y_test = train_test_split(data1.loc[:,'text_message':],data1['v1'],test_size=0.2)

In [17]:
# Now we'll create a vocabulary for the training set with word count
vocab=defaultdict(int) 
for text in X['final_text'].values:
    for elem in text.split(' '):
        vocab[elem]+=1


In [18]:
# Now we look at the types of words in ham and spam.
ham_text=' '.join(X.loc[y==0,'final_text'].values)
ham_wordcloud = WordCloud(background_color='white',max_words=2000).generate(ham_text)
spam_text=' '.join(X.loc[y==1,'final_text'].values)
spam_wordcloud = WordCloud(background_color='white',max_words=2000).generate(spam_text)

In [19]:
# tokenize the text for further calculations
X['tokenized_final_text']=X['final_text'].str.split(' ')
X_test['tokenized_final_text']=X_test['final_text'].str.split(' ')

In [20]:
# document frequency(number of docs containing word w) and Inverse document frequency(measures rarity of each word)
df={}
for k in vocab.keys():
    df[k]=np.sum(X['tokenized_final_text'].apply(lambda x: 1 if k in x else 0))
    
# Now we'll calculate the idf score of each word
idf = {k:1+np.log((1+X.shape[0]/(1+v))) for k,v in df.items()}

In [21]:
# tf * idf
for elem in vocab.keys():
    X[elem]= X['tokenized_final_text'].apply(lambda x: x.count(elem)*idf[elem] if elem in x else 0)
for elem in vocab.keys():
    X_test[elem]= X_test['tokenized_final_text'].apply(lambda x: x.count(elem)*idf[elem] if elem in x else 0)

In [22]:
y=y.astype('int')
y_test = y_test.astype('int')


In [23]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X.iloc[:,6:],y)
confusion_matrix(y,lr.predict(X.iloc[:,6:]))

array([[3851,    0],
       [   0,  608]], dtype=int64)

In [24]:
confusion_matrix(y_test,lr.predict(X_test.iloc[:,6:]))

array([[976,   0],
       [ 14, 125]], dtype=int64)

In [33]:
print('******logistic regression*************')
print('f1 score ----------',f1_score(y_test,lr.predict(X_test.iloc[:,6:])))
print('accuracy score--------',lr.score(X_test.iloc[:,6:],y_test))



******logistic regression*************
f1 score ---------- 0.9469696969696969
accuracy score-------- 0.9874439461883409


In [34]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=70,random_state=3)
rf.fit(X.iloc[:,6:],y)
confusion_matrix(y,rf.predict(X.iloc[:,6:]))

array([[3851,    0],
       [  45,  563]], dtype=int64)

In [35]:
confusion_matrix(y_test,rf.predict(X_test.iloc[:,6:]))

array([[975,   1],
       [ 27, 112]], dtype=int64)

In [36]:
print('************random forest***********')
print('f1 score-----------',f1_score(y_test,rf.predict(X_test.iloc[:,6:])))
print('accuracy score--------',rf.score(X_test.iloc[:,6:],y_test))

************random forest***********
f1 score----------- 0.8888888888888888
accuracy score-------- 0.9748878923766816
