# Spam Detection with Logistic Regression

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

## Data Collection & Pre-Processing

#### Loading the data from a CSV file into a pandas DataFrame

In [6]:
# Read the e-mail corpus from disk; the path is relative to the notebook's
# working directory.
raw_mail_data = pd.read_csv(filepath_or_buffer='spam_ham_dataset_.csv')

#### Replace the null values with an empty string

In [8]:

# Work on a copy in which every missing cell holds '' instead of NaN; the raw
# frame is left untouched.
mail_data = raw_mail_data.fillna(value='')


#### Printing the first 10 rows of the DataFrame

In [9]:

# Plain-text preview of the first ten rows.
print(mail_data.head(n=10))


   Unnamed: 0 Category                                            Message  \
0         605      ham  Subject: enron methanol ; meter # : 988291\r\n...   
1        2349      ham  Subject: hpl nom for january 9 , 2001\r\n( see...   
2        3624      ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3        4685     spam  Subject: photoshop , windows , office . cheap ...   
4        2030      ham  Subject: re : indian springs\r\nthis deal is t...   
5        2949      ham  Subject: ehronline web address change\r\nthis ...   
6        2793      ham  Subject: spring savings certificate - take 30 ...   
7        4185     spam  Subject: looking for medication ? we ` re the ...   
8        2641      ham  Subject: noms / actual flow for 2 / 26\r\nwe a...   
9        1870      ham  Subject: nominations for oct . 21 - 23 , 2000\...   

   label_num  
0          0  
1          0  
2          0  
3          1  
4          0  
5          0  
6          0  
7          1  
8          0  
9 

#### Checking the number of rows and columns in the DataFrame

In [10]:

# Report the frame's dimensions as a (rows, columns) tuple.
row_count, column_count = mail_data.shape
print((row_count, column_count))


(5171, 4)


## Label Encoding

#### Label spam mail as 0, ham mail as 1

In [11]:

# Encode the text labels as integers: spam -> 0, ham -> 1.
# The mapping reads from `raw_mail_data` instead of from `mail_data` itself so
# that this cell can be re-run safely: mapping an already-encoded column would
# find neither 'spam' nor 'ham' and turn every label into NaN.
LABEL_ENCODING = {'spam': 0, 'ham': 1}
mail_data['Category'] = raw_mail_data['Category'].map(LABEL_ENCODING)


#### Separating the data as texts and labels

In [12]:

# X holds the message text (model input); Y holds the encoded label (target).
X, Y = mail_data['Message'], mail_data['Category']

# Show the message column as a quick sanity check.
print(X)

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: Message, Length: 5171, dtype: object


In [35]:
# Show the encoded label column (0 = spam, 1 = ham) as a quick sanity check.
print(Y)

0       1
1       1
2       1
3       0
4       1
       ..
5166    1
5167    1
5168    1
5169    1
5170    0
Name: Category, Length: 5171, dtype: int64


# Splitting the data into training data & test data

In [13]:

# Hold out 20% of the messages for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

# Print the shape of the full data and of each split: inputs first, then labels.
# The captions are kept exactly as written so the recorded output still matches.
shape_report = (
    ("Shape of Independet Data set", X),
    ("\nInput data Training Set Shape...:", X_train),
    ("\nInput data Testing Set Shape...:", X_test),
    ("\nShape of Dependet Data set", Y),
    ("\nOutput data Training Set Shape...:", Y_train),
    ("\nOutput data Testing Set Shape...:", Y_test),
)
for caption, part in shape_report:
    print(caption, part.shape)

Shape of Independet Data set (5171,)

Input data Training Set Shape...: (4136,)

Input data Testing Set Shape...: (1035,)

Shape of Dependet Data set (5171,)

Output data Training Set Shape...: (4136,)

Output data Testing Set Shape...: (1035,)


## Feature Extraction

#### Transform the text data to feature vectors that can be used as input to Logistic regression

In [54]:


# TF-IDF turns each message into a sparse vector of term weights.
feature_extraction = TfidfVectorizer(
    lowercase=True,        # fold case before tokenizing
    stop_words='english',  # drop scikit-learn's built-in English stop words
    min_df=1,              # keep every term that occurs in at least one message
)

# The vocabulary and IDF weights are learned from the training messages only;
# the test messages are merely projected onto that vocabulary so that no
# information from the test split leaks into the features.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)



#### Convert Y_train and Y_test to integers

In [None]:


# Cast both label splits to integers.
Y_train, Y_test = (labels.astype('int') for labels in (Y_train, Y_test))

# Show the raw training messages (the text, not the TF-IDF features).
print(X_train)


# Training the Model: Logistic Regression

In [59]:

# Create an unfitted logistic-regression classifier with scikit-learn's default
# settings; training happens in a later cell.
model = LogisticRegression()
print("\nModel is Developed.....!")



Model is Developed.....!


## Training the Logistic Regression model with the training data

In [61]:

# Fit the classifier on the TF-IDF vectors of the training messages and their
# integer labels.
model.fit(X=X_train_features, y=Y_train)

print("\nModel is trained Successfully...!")


Model is trained Successfully...!


# Evaluating the trained model

### Prediction on training data

In [16]:

# Score the model on the same messages it was fitted on. This measures how well
# the training data is fit, not how well the model generalizes.
prediction_on_training_data = model.predict(X=X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

print('\nAccuracy on training data:', accuracy_on_training_data)



Accuracy on training data: 0.9968568665377177


### Prediction on test data

In [17]:

# Score the model on the held-out messages, which were not used for fitting.
prediction_on_test_data = model.predict(X=X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

print('Accuracy on test data:', accuracy_on_test_data)



Accuracy on test data: 0.9806763285024155


## Testing the Model with Input Data

In [69]:

# Single e-mail to classify. It is wrapped in a list because the vectorizer
# takes a collection of documents, not a bare string. Replace the text to try
# other messages.
input_mail = ["Subject: password reset"]  # Row Num 104

# Convert the text to a feature vector using the vocabulary already learned
# from the training data (transform only, no re-fitting).
input_data_features = feature_extraction.transform(input_mail)


### Making Prediction

In [67]:

# Classify the vectorized e-mail; the result is an array with one label per
# input message.
prediction = model.predict(X=input_data_features)
print(prediction)

# Echo the text that was classified.
print("Input Mail :- ", input_mail)


[1]
Input Mail :-  ['Subject: password reset']


In [76]:
# Translate the numeric label into a readable verdict: 1 means ham, anything
# else is reported as spam.
print("\nResult : ", end='')
verdict = 'Your Mail is Ham mail' if prediction[0] == 1 else 'Your Mail is Spam mail'
print(verdict)


Result : Your Mail is Ham mail
