### Introduction
##### Problem: Classifying emails as spam or ham (non-spam).
##### Objective: Develop a machine learning model to accurately classify emails.
##### Importance: Effective email filtering is crucial for maintaining inbox cleanliness and security.


### Importing Dependencies

In [32]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

### Data Collection and Preprocessing

In [33]:
# Load the labelled mail dataset (CSV) into a pandas DataFrame.
# NOTE(review): this is a machine-specific absolute path; the notebook only
# runs where the file exists at exactly this location.
csv_path = r"C:\Users\PHOENIX\OneDrive\Desktop\ML projects\mail_data.csv"
data = pd.read_csv(csv_path)

In [34]:
# Preview the first five rows to check the columns and a few sample values.
data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [35]:
# Count the missing values (NaNs) in each column of the DataFrame.
data.isna().sum()

Category    0
Message     0
dtype: int64

### Label Encoding

In [36]:
# Create the encoder that turns the string categories into integer labels.
label = LabelEncoder()

In [37]:
# Fit the encoder on the "Category" column and convert that column to integer
# labels in a single step.
category_column = data["Category"]
label_encoding = label.fit_transform(category_column)

In [61]:
# Encoded labels, one per message:
#   0 -> ham
#   1 -> spam
label_encoding

array([0, 0, 1, ..., 0, 0, 0])

In [39]:
# Separate the inputs (message text) from the targets (encoded labels).
x, y = data["Message"], label_encoding

### Splitting data into training data and testing data

In [40]:
# Hold out 20% of the messages for testing. `stratify=y` keeps the ham/spam
# ratio the same in both subsets, and the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
    stratify=y,
)

In [41]:
# Show the sizes of the full, training, and test sets (in that order).
print(*(subset.shape for subset in (x, x_train, x_test)))

(5572,) (4457,) (1115,)


### Feature extraction

In [44]:
# Convert the raw message text into TF-IDF feature vectors that can be fed to
# the logistic regression model. English stop words are dropped; `min_df=1`
# and `lowercase=True` are the library defaults and are spelled out only for
# clarity. The vectorizer is created once here (it was previously created
# twice in a row with equivalent settings).
feature_extraction = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)

In [46]:
# Learn the vocabulary and IDF weights from the training messages only, then
# apply that fitted vectorizer to the test messages so that nothing from the
# test set influences the features.
feature_xtrain = feature_extraction.fit_transform(x_train)
feature_xtest = feature_extraction.transform(x_test)

In [47]:
# Display the raw training messages (pandas prints a truncated view of the Series).
print(x_train)

2020               From tomorrow onwards eve 6 to 3 work.
3453                             Ugh just got outta class
3364                                   Can... I'm free...
2597    No i'm not gonna be able to. || too late notic...
5491    U studying in sch or going home? Anyway i'll b...
                              ...                        
3775                                Ok... But bag again..
5519    Can you pls send me that company name. In saib...
2832                           Thanx 4 sending me home...
2724    Tunde, how are you doing. This is just wishing...
4714    Big brother‘s really scraped the barrel with t...
Name: Message, Length: 4457, dtype: object


In [48]:
# Display the TF-IDF training matrix; it is sparse, so it prints as
# (row, column) index pairs followed by the stored weight.
print(feature_xtrain)

  (0, 7329)	0.39151450331197035
  (0, 2596)	0.5157331716075019
  (0, 4795)	0.6459507464707183
  (0, 6736)	0.40433070936297943
  (1, 1793)	0.43486660333673016
  (1, 4861)	0.596185515774092
  (1, 3112)	0.31103507183699425
  (1, 3758)	0.2826422333927384
  (1, 6887)	0.528038275197618
  (2, 2903)	1.0
  (3, 5081)	0.4169087023760639
  (3, 7198)	0.3971508483254661
  (3, 3373)	0.26859638268284747
  (3, 4040)	0.24099748417300504
  (3, 4692)	0.43001182720880177
  (3, 3909)	0.3260348921371232
  (3, 758)	0.37620667903348365
  (3, 3092)	0.32479862316475455
  (4, 3911)	0.2511783165875194
  (4, 3082)	0.4766800108257892
  (4, 5766)	0.6833422922401592
  (4, 6339)	0.37251069778964124
  (4, 3373)	0.23999265394731062
  (4, 4040)	0.21533285461106833
  (5, 4413)	0.4460096390714086
  :	:
  (4452, 1180)	0.8777703340143531
  (4452, 4770)	0.4790816639408472
  (4453, 1853)	0.5659242420057378
  (4453, 5704)	0.5659242420057378
  (4453, 1891)	0.4268643677817285
  (4453, 5098)	0.31370317391845537
  (4453, 5831)	0.280

### Training the model
#### Logistic regression

In [49]:
# Logistic regression is a linear classifier suited to binary problems such as
# ham vs. spam. The estimator is created with its default hyperparameters.
# `LogisticRegression` comes from `sklearn.linear_model` (imported in the
# imports cell at the top of the notebook).
model = LogisticRegression()

<IPython.core.display.Javascript object>

In [50]:
# Fit the classifier on the TF-IDF training features and their encoded labels.
model.fit(X=feature_xtrain, y=y_train)

### Evaluating the trained model

In [57]:
# Predict labels for the same messages the model was trained on.
xtrain_predict = model.predict(X=feature_xtrain)

In [58]:
# Accuracy on the training set. The predictions are stored in `xtrain_predict`;
# the previously used name `train_predict` is never defined in this notebook
# and raises NameError on a fresh kernel.
accuracy_score(y_train, xtrain_predict)

0.9681400044873233

In [59]:
# Predict labels for the held-out test messages.
xtest_predict = model.predict(X=feature_xtest)

In [60]:
# Accuracy on the held-out test set. The predictions are stored in
# `xtest_predict`; the previously used name `test_predict` is never defined in
# this notebook and raises NameError on a fresh kernel.
accuracy_score(y_test, xtest_predict)

0.9641255605381166

### Building a predictive system

In [56]:
# A single example message to classify.
input_data = [" Nah I don't think he goes to usf, he lives around here though"]

# Vectorise the raw text with the already-fitted TF-IDF vectorizer.
feature = feature_extraction.transform(input_data)

# Predict the encoded class and show the raw prediction array.
prediction = model.predict(feature)
print(prediction)

# Label 0 is reported as ham; any other label is reported as spam.
verdict = "Ham mail" if prediction[0] == 0 else "Spam mail"
print(verdict)

[0]
Ham mail
