In [136]:
import pandas as pd
from sklearn.utils import resample
import string
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score
from sklearn import svm

In [137]:
df=pd.read_csv('sms.tsv',sep='\t',names=['Type','Message'])

In [138]:
df.head()

Unnamed: 0,Type,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [139]:
df.isnull().sum()

Type       0
Message    0
dtype: int64

In [140]:
df['Type'].value_counts()

ham     4825
spam     747
Name: Type, dtype: int64

In [141]:
le=LabelEncoder()

In [142]:
df['Type']=le.fit_transform(df['Type'])

In [143]:
df.head()

Unnamed: 0,Type,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [144]:
feature=df['Type']
data=df['Message']

In [145]:
df_majority = df[df.Type==0]
df_minority = df[df.Type==1]


In [146]:
df_majority.shape

(4825, 2)

In [147]:
df_minority.shape

(747, 2)

In [148]:
#Increasing samples of spam class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=4825,    # to match majority class
                                 random_state=123)

In [149]:
df_minority_upsampled.shape

(4825, 2)

In [150]:
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [151]:
df_upsampled.shape

(9650, 2)

In [152]:
df_upsampled

Unnamed: 0,Type,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
6,0,Even my brother is not like to speak with me. ...
...,...,...
357,1,Congratulations ur awarded 500 of CD vouchers ...
2954,1,"URGENT! Your mobile was awarded a £1,500 Bonus..."
1129,1,Ur HMV Quiz cash-balance is currently £500 - t...
4359,1,Our dating service has been asked 2 contact U ...


In [153]:
feature=df_upsampled['Message']

In [154]:
feature.shape

(9650,)

In [155]:
data=df_upsampled['Type']

In [156]:
data.shape

(9650,)

In [157]:
def pre_process(text):
    
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = [word for word in text.split() if word.lower() not in stopwords.words('english')]
    words = ""
    for i in text:
            stemmer = SnowballStemmer("english")
            words += (stemmer.stem(i))+" "
    return words

In [158]:
textFeatures = df_upsampled['Message'].copy()
textFeatures = textFeatures.apply(pre_process)


In [159]:
textFeatures

0       go jurong point crazi avail bugi n great world...
1                                  ok lar joke wif u oni 
3                    u dun say earli hor u c alreadi say 
4              nah dont think goe usf live around though 
6          even brother like speak treat like aid patent 
                              ...                        
357     congratul ur award 500 cd voucher 125gift guar...
2954    urgent mobil award £1500 bonus caller prize 27...
1129    ur hmv quiz cashbal current £500 maxim ur cash...
4359    date servic ask 2 contact u someon shi call 09...
2663    hello darl today would love chat dont tell loo...
Name: Message, Length: 9650, dtype: object

In [160]:
vectorizer = TfidfVectorizer(stop_words='english',ngram_range=(1,2),min_df=2)
feature = vectorizer.fit_transform(textFeatures)
#data = vectorizer.transform(data)

In [161]:
feature.shape

(9650, 13129)

In [162]:
xtrain,xtest,ytrain,ytest=train_test_split(feature,data,test_size=0.2,random_state=42)

In [163]:
le.inverse_transform([0,1])

array(['ham', 'spam'], dtype=object)

VARIOUS MODELS ARE IMPLEMENTED BELOW

In [182]:
clf=RandomForestClassifier(random_state=42)
clf.fit(xtrain,ytrain)
pred=clf.predict(xtest)
acc=accuracy_score(ytest,pred)
print(acc)
confusion_matrix(ytest,pred)



0.9989637305699481


array([[984,   1],
       [  1, 944]])

In [None]:
#Some genral cases to check classifier performance

In [183]:
new=['you won Rs.10000 call to get reward']#classifying Wrong
new=vectorizer.transform(new)
p=clf.predict(new.todense())
print(p)

[0]


In [166]:
clf1 = DecisionTreeClassifier(random_state=42)
clf1.fit(xtrain,ytrain)
pred=clf1.predict(xtest)
acc=accuracy_score(ytest,pred)
print(acc)
confusion_matrix(ytest,pred)

0.9860103626943005


array([[959,  26],
       [  1, 944]])

In [167]:
new=['how are you']
new=vectorizer.transform(new)
p=clf1.predict(new.todense())
print(p)

[0]


In [168]:
new=['you won Rs.10000 call to get reward']#classifying Wrong
new=vectorizer.transform(new)
p=clf1.predict(new.todense())
print(p)

[0]


In [184]:
log=LogisticRegression(class_weight={1:6})#best model
log.fit(xtrain,ytrain)
pred=log.predict(xtest)
print(accuracy_score(ytest,pred))
confusion_matrix(ytest,pred)

0.9803108808290155




array([[947,  38],
       [  0, 945]])

In [185]:
new=['free tickets to FA']
new=vectorizer.transform(new)
p=log.predict(new.todense())
print(p)

[1]


In [186]:
new=['Hello how are you ']
new=vectorizer.transform(new)
p=log.predict(new.todense())
print(p)

[0]


In [187]:
new=['you won Rs.10000 call to get reward']
new=vectorizer.transform(new)
p=log.predict(new.todense())
print(p)

[1]


In [188]:
clf = svm.SVC(C=1,kernel='linear')
clf.fit(xtrain,ytrain)
pred=clf.predict(xtest)
acc=accuracy_score(ytest,pred)
confusion_matrix(ytest,pred)

array([[982,   3],
       [  0, 945]])

In [189]:
new=['how are you']
new=vectorizer.transform(new)
p=clf.predict(new.todense())
print(p)

[0]


In [190]:
new=['you won Rs.10000 call to get reward']
new=vectorizer.transform(new)
p=clf.predict(new.todense())
print(p)

[0]


In [94]:
ga=GaussianNB()
ga.fit(xtrain.todense(),ytrain)
pred=ga.predict(xtest.todense())
confusion_matrix(ytest,pred)

array([[978,   7],
       [  0, 945]])

In [191]:
new=['you won Rs.10000 call to get reward']
new=vectorizer.transform(new)
p=clf.predict(new.todense())
print(p)

[0]
