In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the tab-separated SMS spam collection into a DataFrame.
df = pd.read_csv(
    "smsspamcollection.tsv",
    sep="\t",
)

In [3]:
# Preview the first five rows of the dataset.
df.head(5)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [4]:
#we do not yet know how to extract features from text, so for now we work only with the numerical features
#later we will convert the text into numerical information with text feature extraction

In [5]:
# ML models require complete data, so check whether any values are missing.
# isnull() marks each missing entry as True (counted as 1) and each present
# entry as False (counted as 0); summing gives the number of missing values
# per column, so a total of 0 means the column is complete.
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [6]:
# Number of rows (messages) in the dataset.
df.shape[0]

5572

In [7]:
# Display the frame; the rendered output is abbreviated to the first and last rows.
df

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,160,8
5568,ham,Will ü b going to esplanade fr home?,36,1
5569,ham,"Pity, * was in mood for that. So...any other s...",57,7
5570,ham,The guy did some bitching but I acted like i'd...,125,1


In [8]:
# Distinct values found in the target column.
df['label'].unique()

array(['ham', 'spam'], dtype=object)

In [9]:
# Count how many messages carry each label.
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [10]:
#we will build a machine learning model that
#will predict whether a text is spam or ham from two
#numerical features, length and punct

In [11]:
from sklearn.model_selection import train_test_split

In [45]:
# Feature matrix: indexing with a list of column names returns a DataFrame.
X = df[['length','punct']]

# Target vector: indexing with a single column name returns a Series,
# which the type() call below confirms.
y = df['label']
type(y)

pandas.core.series.Series

In [57]:
# X is the feature data (a list of column names selects a DataFrame).
X = df[['length','punct']]
# y is the label for each row (a Series).
y = df['label']

# Split into X train set, X test set, y label train set, y label test set.
# test_size=0.3 holds out 30% of the rows for testing; the rows are selected
# randomly, and the fixed random_state makes that selection repeatable.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [58]:
# (rows, columns) of the training feature set.
X_train.shape

(3900, 2)

In [59]:
# Show the held-out feature rows. X_test keeps the original index values,
# which lets us match each row with its label in y_test.
X_test

Unnamed: 0,length,punct
3245,147,14
944,116,1
1044,102,3
2484,45,0
812,112,4
...,...,...
2505,160,10
2525,99,5
4975,62,5
650,97,11


In [60]:
# Number of labels held out for testing.
y_test.shape

(1672,)

In [61]:
# Number of labels available for training.
y_train.shape

(3900,)

In [62]:
from sklearn.linear_model import LogisticRegression

In [63]:
# Create the logistic-regression classifier; the solver is set explicitly to 'lbfgs'.
lr_model = LogisticRegression(solver='lbfgs')

In [64]:
# Train the classifier on the training features and their labels.
lr_model.fit(X_train,y_train)

LogisticRegression()

In [65]:
#test the accuracy of the model from the test data set
from sklearn import metrics

In [66]:
# Take the features from the test set and predict a label for each row.
predictions = lr_model.predict(X_test)
# These predictions are compared with the true labels in y_test below.

In [67]:
# Predicted labels, one per row of X_test.
predictions


array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [69]:
# The labels the model should have predicted for the test rows.
y_test #true values

3245     ham
944      ham
1044     ham
2484     ham
812      ham
        ... 
2505     ham
2525    spam
4975     ham
650     spam
4463     ham
Name: label, Length: 1672, dtype: object

In [70]:
#now we need to compare the predictions with y_test
#one way is to build a confusion matrix

In [72]:
# Rows are the true labels and columns are the predicted labels,
# both in sorted label order (ham, spam).
print(metrics.confusion_matrix(y_test,predictions))

[[1404   44]
 [ 219    5]]


In [73]:
# Wrap the confusion matrix in a labelled DataFrame:
# rows are the true classes, columns are the predicted classes.
class_names = ['ham','spam']
pd.DataFrame(
    metrics.confusion_matrix(y_test,predictions),
    index=class_names,
    columns=class_names,
)

Unnamed: 0,ham,spam
ham,1404,44
spam,219,5


In [75]:
# Per-class precision, recall and f1-score. The report is a multi-line string,
# so it is printed rather than displayed as a raw repr.
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

         ham       0.87      0.97      0.91      1448
        spam       0.10      0.02      0.04       224

    accuracy                           0.84      1672
   macro avg       0.48      0.50      0.48      1672
weighted avg       0.76      0.84      0.80      1672



In [76]:
#from this report we can say that our model is very good at predicting ham messages (recall 0.97)
#but very poor at detecting spam messages (recall 0.02)

In [77]:
# Overall fraction of correct predictions. Read it with care: 1448 of the 1672
# test messages are ham, so always answering 'ham' would already score about 0.87.
metrics.accuracy_score(y_test,predictions)

0.8427033492822966

In [82]:
# General recipe for using any scikit-learn model

# 1. import the model class
from sklearn.naive_bayes import MultinomialNB

# 2. create an instance and fit it on the training split
#    (fit returns the fitted estimator, so the two steps can be chained)
nb_model = MultinomialNB().fit(X_train,y_train)

# 3. predict labels for the held-out features
predictions = nb_model.predict(X_test)

# 4. compare the predictions with the true labels using the confusion matrix
print(metrics.confusion_matrix(y_test,predictions))

[[1438   10]
 [ 224    0]]


In [83]:
# This model does not predict well either:
# it fails to identify any of the spam messages,
# which means 0 precision and 0 recall for the spam class.

# Print the report so the table shown below is regenerated on every run.
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

         ham       0.87      0.99      0.92      1448
        spam       0.00      0.00      0.00       224

    accuracy                           0.86      1672
   macro avg       0.43      0.50      0.46      1672
weighted avg       0.75      0.86      0.80      1672



In [85]:
# Third model: a support vector classifier

from sklearn.svm import SVC

# create the classifier and fit it on the training split in one step
svc_model = SVC(gamma='auto').fit(X_train,y_train)

# predict labels for the held-out features
predictions = svc_model.predict(X_test)

# evaluate the predictions against the true labels
print(metrics.confusion_matrix(y_test,predictions))



[[1373   75]
 [ 121  103]]
