## **Mount Google Drive**

In [242]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)



Mounted at /content/drive


## **Importing packages**

In [243]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score,precision_score


# **Read Dataset**

In [244]:
df=pd.read_csv('/content/drive/MyDrive/Email_checking_project/mail_data.csv')


In [245]:
df.head(10)

Unnamed: 0,Category,Message
0,non-spam,"Go until jurong point, crazy.. Available only ..."
1,non-spam,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,non-spam,U dun say so early hor... U c already then say...
4,non-spam,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,non-spam,Even my brother is not like to speak with me. ...
7,non-spam,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [246]:
df.shape

(5572, 2)

## **Dataset Preprocessing**

In [247]:
#Check the total missing values
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [248]:
#Check Duplicates values
df.duplicated().sum()

415

In [249]:
#remove duplicates
df.drop_duplicates(inplace=True)
df.shape

(5157, 2)

In [250]:
df.shape


(5157, 2)

In [251]:
# label spam mail as 0 and non-spam mail as 1

df.loc[df['Category'] == 'spam', 'Category',] = 0
df.loc[df['Category'] == 'non-spam', 'Category',] = 1

## **Feature Selection**

In [252]:
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=300)

In [253]:
X= df['Message'].values
Y= df['Category'].values

In [254]:
print(X)

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
 ... 'Pity, * was in mood for that. So...any other suggestions?'
 "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free"
 'Rofl. Its true to its name']


In [255]:
print(Y)

[1 1 0 ... 1 1 1]


## **Splitting the dataset into training and testing variables**

In [256]:
# 70% data is used for training and 30% for testing
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=2)

In [257]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5157,)
(3609,)
(1548,)


## **Apply Spam Filtering Algorithm**


*   Multinomial NB
*   Decision Tree Classifier



### **Multinomial NB**

In [258]:
mnb= MultinomialNB()

feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english')

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [259]:
mnb.fit (X_train_features,Y_train)


In [260]:
#Fitting the data to the classifier
mnb.fit(X_train_features,Y_train)
#Predict on test data
Y_pred1 = mnb.predict(X_test_features)
print("Accuracy score:",accuracy_score(Y_test,Y_pred1)*100)
print("Precision score:",precision_score(Y_test,Y_pred1)*100)
print("Confusion Matrix:")
print(confusion_matrix(Y_test,Y_pred1))

Accuracy score: 96.31782945736434
Precision score: 95.95170454545455
Confusion Matrix:
[[ 140   57]
 [   0 1351]]


In [261]:
#convert text to feature vectors
input_mail =["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]
input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = mnb.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Non Spam mail')

else:
  print('Spam mail')

[0]
Spam mail


### **Decision Tree Classifier**

In [262]:
dtc = DecisionTreeClassifier(max_depth=5)

#Fitting the data to the classifier
dtc.fit(X_train_features,Y_train)
#Predict on test data
Y_pred2 = dtc.predict(X_test_features)
print("Accuracy score:",accuracy_score(Y_test,Y_pred2)*100)
print("Precision score:",precision_score(Y_test,Y_pred2)*100)
print("Confusion Matrix:")
print(confusion_matrix(Y_test,Y_pred2))

Accuracy score: 92.82945736434108
Precision score: 92.69972451790633
Confusion Matrix:
[[  91  106]
 [   5 1346]]


In [263]:
#convert text to feature vectors
input_mail =["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]
input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = dtc.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Non Spam mail')

else:
  print('Spam mail')

[0]
Spam mail
