# Known Libraries imported

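- **Pandas** : For data processing and CSV file I/O (e.g. `pd.read_csv`)
- **Numpy** : For linear algebra
- **Matplotlib** : For data visualization
- **Seaborn** : For statistical plots (e.g. the correlation heat map)
- **sklearn.model_selection** : For splitting data into train & test sets
- **sklearn.linear_model.LogisticRegression** : For Logistic Regression
- **sklearn.ensemble.RandomForestClassifier** : For the Random Forest Classifier
- **sklearn.svm.SVC** : For the Support Vector Machine
- **sklearn.metrics** : Evaluation metrics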

In [None]:
import pandas as pd
import numpy as np
import copy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression  # For Logistic Regression
from sklearn.ensemble import RandomForestClassifier # For RFC
from sklearn.svm import SVC                               #For SVM
from sklearn.metrics import matthews_corrcoef    
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score,roc_curve,auc
# Configure seaborn's global plot styling (the "ticks" style) for every plot drawn below.
sns.set(style="ticks", color_codes=True)

## Loading the complete data into a pandas DataFrame

In [None]:
# Read the combined dataset CSV into a DataFrame.
DATA_PATH = "../input/phishing-data/combined_dataset.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

### Correlation matrix

In [None]:
# Pairwise correlations over the numeric/bool columns only. Non-numeric
# columns (presumably `domain`, which is excluded from the features later)
# make DataFrame.corr() raise on pandas >= 2.0, so they are filtered out
# explicitly; on older pandas they were dropped silently, so results match.
corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()
# Correlation of every column with the target, sorted ascending. As the last
# expression of the cell, this is what the notebook displays.
corr_matrix['label'].sort_values()

### Heat Map

In [None]:
# Annotated heat map of the correlation matrix.
# The figure size must be numeric (the earlier attempt passed strings, which
# matplotlib rejects); a larger canvas keeps the annotations legible.
plt.figure(figsize=(8, 8))
# Restrict to numeric/bool columns: DataFrame.corr() raises on non-numeric
# columns with pandas >= 2.0 (older versions dropped them silently).
sns.heatmap(df.select_dtypes(include=["number", "bool"]).corr(), annot=True)

## Preparation of Data

### Feature Selection 
- Taking all the features into account (every column except `label` and `domain`)

In [None]:
# Target vector: the `label` column.
Y = df['label']
# Feature matrix: every column except the target and the `domain` column.
X = df.drop(columns=['label', 'domain'])

- Split the data into training and testing sets - 60% train size, 40% test size

In [None]:
# Hold out 40% of the samples for testing. random_state pins the shuffle so
# the split, and every metric computed from it, is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.40, random_state=0
)

### Model Training

## 1: Logistic Regression

In [None]:
# Logistic regression baseline.
# `multi_class` is intentionally left at its default: the parameter is
# deprecated since scikit-learn 1.5 and scheduled for removal, and passing
# 'multinomial' emits a FutureWarning there.
# NOTE(review): on older scikit-learn this changes a two-class fit from the
# softmax formulation to the plain binary one; expect marginally different
# coefficients.
LogReg1 = LogisticRegression(random_state=0, solver='newton-cg')
# Train the model using the training data
LogReg1.fit(x_train, y_train)

# Predict hard class labels for the held-out test data
y_pred_log = LogReg1.predict(x_test)

print("The accuracy Logistic Regression on testing data is: ", 100.0 * accuracy_score(y_test, y_pred_log))

In [None]:
# ROC analysis for Logistic Regression.
# The curve is built from the predicted probability of the positive class
# (second column of predict_proba) instead of hard predictions: thresholded
# labels give a single operating point, not a curve.
y_score_log = LogReg1.predict_proba(x_test)[:, 1]
fpr, tpr, thresh = roc_curve(y_test, y_score_log)
# Area under the ROC curve (previously this name held the accuracy score).
roc_auc = auc(fpr, tpr)

# Plot ROC curve for Logistic Regression
plt.plot(fpr, tpr, 'orange', label='Logistic Regression (AUC = {:.3f})'.format(roc_auc))
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
# A single legend call: labels come from the `label=` of the plotted line.
plt.legend(loc='lower right')

## 2: Random Forest Classifier

In [None]:
# Create the random forest classifier. random_state fixes the bootstrap
# sampling and feature selection so repeated runs give the same model,
# matching the seeded logistic regression above.
RFClass = RandomForestClassifier(random_state=0)
# Train the model using the training data
RFClass.fit(x_train, y_train)

# Predict hard class labels for the held-out test data
y_pred_rfc = RFClass.predict(x_test)

print("The accuracy Random forest classifier on testing data is: ", 100.0 * accuracy_score(y_test, y_pred_rfc))

In [None]:
# ROC analysis for the Random Forest.
# Use the predicted probability of the positive class (second column of
# predict_proba) as the score; hard predictions would collapse the curve
# to a single operating point.
y_score_rfc = RFClass.predict_proba(x_test)[:, 1]
fpr, tpr, thresh = roc_curve(y_test, y_score_rfc)
# Area under the ROC curve (previously this name held the accuracy score).
roc_auc = auc(fpr, tpr)

# Plot ROC curve for Random Forest
plt.plot(fpr, tpr, 'orange', label='Random Forest Classification (AUC = {:.3f})'.format(roc_auc))
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
# A single legend call: labels come from the `label=` of the plotted line
# (the earlier extra call passed a copy-pasted "Logistic Regression" string).
plt.legend(loc='lower right')

## 3: SVM

In [None]:
# Build a support vector classifier with default settings and train it on the
# training data (fit returns the fitted estimator itself).
svc = SVC().fit(x_train, y_train)

# Predict labels for the test data and report the accuracy as a percentage.
y_pred_svc = svc.predict(x_test)
svm_accuracy = 100.0 * accuracy_score(y_test, y_pred_svc)
print("The accuracy SVM on testing data is: ", svm_accuracy)

In [None]:
# ROC analysis for the SVM.
# SVC exposes no probabilities by default, so the signed distance to the
# separating hyperplane (decision_function) is used as the ranking score;
# hard predictions would collapse the curve to a single operating point.
y_score_svc = svc.decision_function(x_test)
fpr, tpr, thresh = roc_curve(y_test, y_score_svc)
# Area under the ROC curve (previously this name held the accuracy score).
roc_auc = auc(fpr, tpr)

# Plot ROC curve for SVC
plt.plot(fpr, tpr, 'orange', label='Support Vector Machine (AUC = {:.3f})'.format(roc_auc))
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
# A single legend call: labels come from the `label=` of the plotted line
# (the earlier extra call passed a copy-pasted "Logistic Regression" string).
plt.legend(loc='lower right')

In [None]:
# Side-by-side accuracy summary (in percent) of the three fitted models.
accuracy_report = (
    ("The accuracy Logistic Regression on testing data is: ", y_pred_log),
    ("The accuracy Random forest classifier on testing data is: ", y_pred_rfc),
    ("The accuracy SVM on testing data is: ", y_pred_svc),
)
for message, predictions in accuracy_report:
    print(message, 100.0 * accuracy_score(y_test, predictions))