# Email Phishing data Exploration

## Importing the libraries

In [7]:
import pandas as pd
import sklearn as skl

## Loading the dataset

**Description**:
1. *num_words* - Total number of words in the email body
2. *num_unique_words* - Count of unique words used
3. *num_stopwords* - Count of common stopwords (e.g., "the", "and", "in")
4. *num_links* - Number of hyperlinks detected
5. *num_unique_domains* - Number of unique domains in links (e.g., "paypal.com")
6. *num_email_addresses* - Count of email addresses found in the text
7. *num_spelling_errors* - Count of misspelled words
8. *num_urgent_keywords* - Number of urgent words (e.g., "urgent", "verify", "update")
9. *label* - <u>Target variable</u>: 0 = Safe Email, 1 = Phishing Email

In [2]:
data=pd.read_csv('email_phishing_data.csv')
data

Unnamed: 0,num_words,num_unique_words,num_stopwords,num_links,num_unique_domains,num_email_addresses,num_spelling_errors,num_urgent_keywords,label
0,140,94,52,0,0,0,0,0,0
1,5,5,1,0,0,0,0,0,0
2,34,32,15,0,0,0,0,0,0
3,6,6,2,0,0,0,0,0,0
4,9,9,2,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
524841,782,327,301,2,2,2,52,1,0
524842,36,30,11,0,0,0,4,0,1
524843,61,46,11,0,0,0,3,0,0
524844,213,136,89,0,0,0,18,0,0


## Data Cleaning

### Removing the duplicates

In [5]:
data.drop_duplicates(keep='first',inplace=True,ignore_index=True)
data

Unnamed: 0,num_words,num_unique_words,num_stopwords,num_links,num_unique_domains,num_email_addresses,num_spelling_errors,num_urgent_keywords,label
0,140,94,52,0,0,0,0,0,0
1,5,5,1,0,0,0,0,0,0
2,34,32,15,0,0,0,0,0,0
3,6,6,2,0,0,0,0,0,0
4,9,9,2,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
205047,98,71,39,0,0,0,3,0,0
205048,65,52,22,2,2,0,6,1,0
205049,782,327,301,2,2,2,52,1,0
205050,36,30,11,0,0,0,4,0,1


### Checking for missing values

In [6]:
data.isnull().sum()

num_words              0
num_unique_words       0
num_stopwords          0
num_links              0
num_unique_domains     0
num_email_addresses    0
num_spelling_errors    0
num_urgent_keywords    0
label                  0
dtype: int64

## Model Building

**Description**:\
Predicting whether a given email is *Safe(0)* or *Phishing(1)*

In [28]:
X = data[[i for i in data.columns if i!='label']]
Y = data['label']
X_train,X_test,Y_train,Y_test= skl.model_selection.train_test_split(X,Y,test_size=0.01,random_state=0)

In [29]:
model=skl.linear_model.LogisticRegression(max_iter=50000)
model

In [30]:
model.fit(X_train,Y_train)

In [31]:
Y_train_pred = model.predict(X_train)

In [32]:
Y_test_pred = model.predict(X_test)

In [33]:
print(skl.metrics.classification_report(Y_train,Y_train_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98    196989
           1       0.33      0.00      0.00      6012

    accuracy                           0.97    203001
   macro avg       0.65      0.50      0.49    203001
weighted avg       0.95      0.97      0.96    203001



In [34]:
print(skl.metrics.classification_report(Y_test,Y_test_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1982
           1       0.00      0.00      0.00        69

    accuracy                           0.97      2051
   macro avg       0.48      0.50      0.49      2051
weighted avg       0.93      0.97      0.95      2051



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
