In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer #------ For Featuring
from sklearn.metrics import accuracy_score

# Data Preprocessing

In [2]:
# Loading the dataset
raw_mail_data=pd.read_csv('spam.csv')

In [3]:
raw_mail_data

Unnamed: 0,Category,Message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will ?_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


**Replacing Pandas or Numpy Nan with  " " i.e null string**


In [4]:
# Replacing the null values with null string
mail_data=raw_mail_data.fillna('')
mail_data

Unnamed: 0,Category,Message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will ?_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [5]:
mail_data.head()

Unnamed: 0,Category,Message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
mail_data['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

# Labelling
* Label spam mail as '0' and Non-spam(ham) mail as '1'
* “replace values with loc pandas” (For Specific column)

In [8]:
# mail_data.loc[mail_data['Category']=='spam',"Category"]=0
# mail_data.loc[mail_data['Category']=='ham',"Category"]=1

mail_data.Category.replace(('spam', 'ham'), (1, 0), inplace=True)

In [9]:
mail_data

Unnamed: 0,Category,Message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,0,"Go until jurong point, crazy.. Available only ...",,,
1,0,Ok lar... Joking wif u oni...,,,
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,0,U dun say so early hor... U c already then say...,,,
4,0,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,,,
5568,0,Will ?_ b going to esplanade fr home?,,,
5569,0,"Pity, * was in mood for that. So...any other s...",,,
5570,0,The guy did some bitching but I acted like i'd...,,,


# Separate Features and Labels


In [10]:
X=mail_data["Message"] #----- Feature
Y=mail_data["Category"] #---- label

# Train Test Split

In [11]:
from sklearn.model_selection import train_test_split

In [31]:
                # Test size = 20% and Train size = 80%
X_train,X_test,Y_train,Y_test=train_test_split(X,Y, test_size=0.4,random_state=1)
print("X:",X.shape,"\tX Train:",X_train.shape,"\tX Test: ",X_test.shape)


X: (5572,) 	X Train: (3343,) 	X Test:  (2229,)


# Feature Extraction
* Transform the text data to feature vectors that can be used as input to SVM model using TfidVectorize
* Basicaly Coverts the text to some numeric form which is essential for understanding our model for prediction.

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
X_train

5213                               3 pa but not selected.
2017    Better than bb. If he wont use it, his wife wi...
251     Wen ur lovable bcums angry wid u, dnt take it ...
4522    Actually I decided I was too hungry so I haven...
1090    WIN URGENT! Your mobile number has been awarde...
                              ...                        
905     Hey what's up charles sorry about the late reply.
5192    Oh oh... Den muz change plan liao... Go back h...
3980    Huh i cant thk of more oredi how many pages do...
235     I have printed it oh. So  &lt;#&gt;  come upst...
5157                              K k:) sms chat with me.
Name: Message, Length: 3343, dtype: object

In [33]:
feature_extract=TfidfVectorizer(min_df=1,lowercase=True,stop_words="english")
X_train_feature=feature_extract.fit_transform(X_train)
X_train_feature

<3343x6288 sparse matrix of type '<class 'numpy.float64'>'
	with 25893 stored elements in Compressed Sparse Row format>

In [34]:
X_train_feature=feature_extract.fit_transform(X_train)
X_test_feature=feature_extract.transform(X_test)
Y_train=Y_train.astype('int')
Y_test=Y_test.astype('int')
print(X_train_feature.shape)
print(X_test_feature.shape)

(3343, 6288)
(2229, 6288)


# Training the Model using SVM

In [35]:
from sklearn.svm import LinearSVC
model=LinearSVC()
model.fit(X_train_feature,Y_train)
# print(X_train_feature)

LinearSVC()

In [36]:
# Prediction on Training Data
predict_training_model=model.predict(X_train_feature)
accuracy=accuracy_score(Y_train,predict_training_model)
print("Accuracy Score on Training Data : ",accuracy)

Accuracy Score on Training Data :  1.0


In [37]:
# Prediction on Testing Data
predict_testing_model=model.predict(X_test_feature)
accuracy=accuracy_score(Y_test,predict_testing_model)
print("Accuracy Score on Testing Data : ",accuracy)

Accuracy Score on Testing Data :  0.9820547330641544


# Prediction on New Mail

In [18]:
Mail=["Your free ringtone is waiting to be collected. Simply text the password \MIX\" to 85069 to verify. Get Usher and Britney. FML, PO Box 5249, MK17 92H. 450Ppw 16"]

In [19]:
# Converting Text into Vectors
Mail_feature=feature_extract.transform(Mail)
print(Mail_feature.shape)

(1, 7469)


In [20]:
prediction=model.predict(Mail_feature)
print(prediction)

[0]


In [21]:
if prediction==1: 
    print("You Have Received a Ham Mail....\nOpen It ASAP!!!!")
else:
    print("You have Received a Spam Mail \nxxxxx...Please Ignore...xxxxx ")

You have Received a Spam Mail 
xxxxx...Please Ignore...xxxxx 
