PROJECT: PREDICTION OF SPAM MAIL

In [4]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

DATA PREPROCESSING

In [8]:
# Load the dataset into a pandas DataFrame
raw_mail_data = pd.read_csv('spamham.csv')

# Replace missing values with an empty string so that every cell handed to
# the text vectoriser later on is a str rather than NaN.
# raw_mail_data itself is left untouched so this cell can be re-run safely.
mail_data = raw_mail_data.fillna('')

# Only the last expression of a cell is rendered, so the shape has to be
# printed explicitly to be visible.
print(mail_data.shape)

mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [40]:
# Encode the Category column numerically: spam -> 0, ham -> 1.
# The assignment is done in place on mail_data, one class at a time.
mail_data.loc[mail_data['Category'].eq('spam'), 'Category'] = 0
mail_data.loc[mail_data['Category'].eq('ham'), 'Category'] = 1

# A holds the message text (model input), B the encoded category (target)
A, B = mail_data['Message'], mail_data['Category']

print(A)
print(B)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object
0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [42]:
# Train/test split
# Keep 80% of the messages for training and hold out 20% for testing.
# random_state pins the shuffle so the same split is produced on every run.
A_train, A_test, B_train, B_test = train_test_split(
    A,
    B,
    train_size=0.8,
    test_size=0.2,
    random_state=3,
)



In [49]:
# Feature extraction
# Transform the message text into TF-IDF feature vectors that are used as
# input to the SVM model.
#
# min_df=1 keeps every term, English stop words are removed, and the text is
# converted to lower case before tokenising.
# lowercase must be the boolean True: the string 'True' only worked because a
# non-empty string is truthy, and newer scikit-learn releases validate
# parameter types and reject it.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Learn the vocabulary and IDF weights from the training split only, then
# apply that same fitted transformation to the test split.
A_train_features = feature_extraction.fit_transform(A_train)
A_test_features = feature_extraction.transform(A_test)

# Convert the B_train and B_test label values to integers
B_train = B_train.astype('int')
B_test = B_test.astype('int')

SUPPORT VECTOR MACHINE MODEL

In [48]:
# Fit a support vector classifier (default hyper-parameters) on the TF-IDF
# features of the training split.
model = SVC()
model.fit(X=A_train_features, y=B_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [50]:
# Model evaluation
# Accuracy on the training split, i.e. on the same data the model was fitted on
prediction_on_training_data = model.predict(A_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=B_train,
    y_pred=prediction_on_training_data,
)
print('Accuracy on training data:', accuracy_on_training_data)



Accuracy on training data: 0.99798070450976


In [52]:
# Accuracy on the held-out test split, which the model did not see during fitting
prediction_on_test_data = model.predict(A_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=B_test,
    y_pred=prediction_on_test_data,
)
print('Accuracy on test data:', accuracy_on_test_data)

Accuracy on test data: 0.979372197309417


In [55]:
# Prediction on a new mail
input_mail = [
    "Congrats! 1 year special cinema pass for 2 is yours. call 09061209465 now! C Suprman V, Matrix3, StarWars3, etc all 4 FREE! bx420-ip4-5we. 150pm. Dont miss out!"
]

# Convert the text to a feature vector using the already-fitted vectoriser
# (transform only, so the vocabulary learned from the training data is reused)
input_mail_feature = feature_extraction.transform(input_mail)

# Predict the label and show the raw prediction array
prediction = model.predict(input_mail_feature)
print(prediction)

# A label of 1 is reported as ham; any other value is reported as spam
print('HAM MAIL' if prediction[0] == 1 else 'SPAM MAIL')

[0]
SPAM MAIL
