In [15]:
import numpy as np
import pandas as pd                                            #used to create dataframes which helps us to structure eour data more
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer   # to convert text data into more meaningful data for machine or in more numerical way 
# so that our machine learning model can understand it using feature vectors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier 
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [16]:
raw_mails = pd.read_csv("C:\\Users\\HP\\Desktop\\nirma sem 5\\machine learning\\ML special\\spam_ham_dataset.csv\\spam_ham_dataset.csv")

In [17]:
# we have to replace null values or missing values with null strings
mail_data = raw_mails.where((pd.notnull(raw_mails)),'')

# getting the first 5 rows of our dataset
mail_data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [18]:
# checking number of rows and columns 

mail_data.shape

(5171, 4)

In [19]:
# label encoding that is we can term ham as 1 and spam as 0 just like true and false
mail_data.loc[mail_data['label'] == 'spam' , 'label',] = 0
mail_data.loc[mail_data['label'] == 'ham' , 'label',] = 1

# not spam = 1
# spam = 0

In [20]:
# separating the data as text and labels
# that is separating mails and the spam or ham column 
# 1 will be on each axis that is 0 or 1 will be on y axis and mails will be on x axis 

X = mail_data['text']

Y = mail_data['label']

In [21]:
print(X)


0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object


In [22]:
print(Y)

0       1
1       1
2       1
3       0
4       1
       ..
5166    1
5167    1
5168    1
5169    1
5170    0
Name: label, Length: 5171, dtype: object


In [23]:
# splitting our data into training and testing data 
# first we will train our logistic regression model
# then we will test our data 

X_train , X_test , Y_train , Y_test = train_test_split(X,Y,test_size = 0.2,random_state = 3)

#test_size = 0.2 means that we are taking 20% data for testing 
# so every time we run the code our data will get splitted same as previous one but if we set random_state = 2 then everytime our data will be splitted differently 


In [24]:
print(X_train.shape)
print(X.shape)
print(X_test.shape)
print(Y_train)

(4136,)
(5171,)
(1035,)
2209    1
2000    1
5030    1
1376    1
1564    0
       ..
789     0
968     0
1667    1
3321    1
1688    0
Name: label, Length: 4136, dtype: object


In [34]:
# feature extraction 
# transform the text data into feature vectors that can be used as input to our logistic regression model
# so this vectorizer try to go through every word in your text and if suppose a word is repeated 1000 times then it will get a score or a value .
# similarly it will scores or value and some will get higher score and some will get lower score and so on

feature_extraction = TfidfVectorizer(min_df = 1,stop_words = 'english' , lowercase = 'true')

# min_df = 1 means if the score of word is less than one than we have to ignore it 
# stop words are those words which will be repeated many no. of times but do not have any meaning such as (and ,or, the etc)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)    # here we will not fit the test data because we dont want our model to look at this data

# and for Y_test , Y_train we will convert all the values ie 0 1 to integer datatype and so that they will not be considered as string

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')


In [35]:
print(X_train)
print(X_train_features)

2209    Subject: hplc to wellhead\r\ndaren here is the...
2000    Subject: mobil chemical - hpl meter # 1256 - e...
5030    Subject: revised nom 5 / 5 - eastrans ; revise...
1376    Subject: re : exxon company , usa global # 960...
1564    Subject: your pharmacy nx\r\nwant a cheap pain...
                              ...                        
789     Subject: incr ' ease yo ' ur man ' hood by 4 -...
968     Subject: subscribers receive first notice on r...
1667    Subject: neon for march 28\r\nhere is the neon...
3321    Subject: re : first delivery - pure resources ...
1688    Subject: enhance your chest size\r\nemail is l...
Name: text, Length: 4136, dtype: object
  (0, 3871)	0.13387711316973605
  (0, 531)	0.14556222812251965
  (0, 30451)	0.08468916670398006
  (0, 43273)	0.14556222812251965
  (0, 3890)	0.14556222812251965
  (0, 548)	0.14556222812251965
  (0, 37262)	0.11275796314501375
  (0, 2908)	0.11535664415295803
  (0, 456)	0.14556222812251965
  (0, 26297)	0.09506000151609588
 

In [36]:
model = RandomForestClassifier(random_state = 2529)

In [37]:
model.fit(X_train_features , Y_train)

RandomForestClassifier(random_state=2529)

In [38]:
# evaluation
# prediction on training data

prediction_on_training_data = model.predict(X_train_features)

accuracy_on_training_data = accuracy_score(Y_train ,prediction_on_training_data)

In [39]:
print('accuracy on training data = ',accuracy_on_training_data)

accuracy on training data =  1.0


In [41]:
# prediction on test data
#so we are checking the accuracy on training and testing data to avoid overfitting of our model
X_test_count = feature_extraction.transform(X_test)
model.score(X_test_count,Y_test)
#prediction_on_test_data = model.predict(X_test_features)

#accuracy_on_test_data = accuracy_score(Y_test ,prediction_on_test_data)

0.9758454106280193

In [46]:
# building a predictive system

input_mail = ["i am not good","SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info"]

# convert or transforming text to feature vector 

input_data_feature = feature_extraction.transform(input_mail)

#making predictions

prediction = model.predict(input_data_feature)
print(prediction)
print('\n')
for i in range(0,len(input_mail)):
    if prediction[i] == 1:
        print('not spam mail')
    else:
        print('spam mail')

[0 0]


spam mail
spam mail
