# Logistic Regression applied to Phishing Website Detection
------------------------------------------------------------------------------------

## Imports and Setup
------------------------------------------

In [1]:
import numpy as np
import pandas as pd
import logisticregression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [234]:
# Read the phishing dataset from the CSV file into a DataFrame
phishing_df = pd.read_csv("phishdata.csv")

In [235]:
# Preview the first five rows to sanity-check the load
phishing_df.head(n=5)

Unnamed: 0,index,having_IPhaving_IP_Address,URLURL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,1,-1,1,1,1,-1,-1,-1,-1,-1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,2,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,3,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,4,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,5,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1


In [236]:
# List every column name; the feature list and the label column used below are taken from here
phishing_df.columns

Index(['index', 'having_IPhaving_IP_Address', 'URLURL_Length',
       'Shortining_Service', 'having_At_Symbol', 'double_slash_redirecting',
       'Prefix_Suffix', 'having_Sub_Domain', 'SSLfinal_State',
       'Domain_registeration_length', 'Favicon', 'port', 'HTTPS_token',
       'Request_URL', 'URL_of_Anchor', 'Links_in_tags', 'SFH',
       'Submitting_to_email', 'Abnormal_URL', 'Redirect', 'on_mouseover',
       'RightClick', 'popUpWidnow', 'Iframe', 'age_of_domain', 'DNSRecord',
       'web_traffic', 'Page_Rank', 'Google_Index', 'Links_pointing_to_page',
       'Statistical_report', 'Result'],
      dtype='object')

In [237]:
# Split into train (80%) and test (20%) sets with sklearn, stratified on the
# 'Result' label so both splits keep the same class balance.
# random_state is pinned so the split -- and therefore the fitted weights and
# reported metrics -- are reproducible on Restart & Run All.
train_df, test_df = train_test_split(
    phishing_df,
    test_size=0.2,
    stratify=phishing_df['Result'],
    random_state=42,
)

In [238]:
# Names of the predictor columns fed to the model
# (every dataset column except 'index' and the 'Result' label)
inputs = [
    'having_IPhaving_IP_Address',
    'URLURL_Length',
    'Shortining_Service',
    'having_At_Symbol',
    'double_slash_redirecting',
    'Prefix_Suffix',
    'having_Sub_Domain',
    'SSLfinal_State',
    'Domain_registeration_length',
    'Favicon',
    'port',
    'HTTPS_token',
    'Request_URL',
    'URL_of_Anchor',
    'Links_in_tags',
    'SFH',
    'Submitting_to_email',
    'Abnormal_URL',
    'Redirect',
    'on_mouseover',
    'RightClick',
    'popUpWidnow',
    'Iframe',
    'age_of_domain',
    'DNSRecord',
    'web_traffic',
    'Page_Rank',
    'Google_Index',
    'Links_pointing_to_page',
    'Statistical_report',
]

In [239]:
# Feature matrices: keep only the predictor columns of each split
X_train, X_test = train_df[inputs], test_df[inputs]

In [240]:
# Response vectors: the 'Result' column of each split
y_train, y_test = train_df['Result'], test_df['Result']

The labels in `Result` are encoded as -1/1, but the sigmoid output lies between 0 and 1, so the -1 labels are remapped to 0 before fitting.

In [241]:
# Remap the -1 label to 0 so the targets are 0/1.
# Rebind the returned Series instead of using inplace=True: the originals are
# derived from a slice of the source DataFrame, so in-place mutation can emit
# SettingWithCopyWarning and write through to the parent frame. Rebinding is
# side-effect free and safe to re-run.
y_train = y_train.replace(-1, 0)
y_test = y_test.replace(-1, 0)

## Testing
------------------------------------------------------------------

In [250]:
# Fit the model on the training split. The two returned values are printed
# in the next cell as the weights and the iteration count.
theta, num_steps = logisticregression.fit(
    X_train,
    y_train,
    learn_rate=1e-4,
    convergance=1e-7,  # keyword spelled as the logisticregression module expects it
)

In [251]:
# Report the fitted weights and how many iterations the fit took
print(
    'Weights:', theta,
    'Iterations:', num_steps,
)

Weights: [[ 2.41238418  0.6445657  -0.22894249 -0.64197768  0.35838666 -0.01157414
   2.71539529  0.60820024  1.61429545  0.02442444 -0.38627543  0.67480543
  -0.44494064  0.23531329  3.16302009  0.82044985  0.88840587 -0.43730118
  -0.13254462 -1.22496855  0.21212971  0.11858907 -0.0242293  -0.27867822
   0.08241097  0.47693078  0.73641629  0.20732886  0.67218015  0.80243055
   0.34607257]] Iterations: 3210


In [252]:
# Predict on the held-out test features using the fitted weights.
# NOTE(review): the output format is defined in the logisticregression module --
# presumably 0/1 labels, since it is compared element-wise with y_test below; confirm.
y_pred=logisticregression.predict(theta, X_test)

In [253]:
# Count the test predictions that match the true labels, then report the
# count and the fraction of the test set it represents
n_correct = np.sum(y_test == y_pred)
accuracy = n_correct / y_test.shape[0]
print('Correct #:', n_correct, 'Accuracy:', accuracy)

Correct #: 2064 Accuracy: 0.9335142469470827


In [259]:
# Per-class precision / recall / F1 on the test set
report = classification_report(y_true=y_test, y_pred=y_pred)
print('\n', report)


               precision    recall  f1-score   support

           0       0.93      0.92      0.92       980
           1       0.93      0.95      0.94      1231

    accuracy                           0.93      2211
   macro avg       0.93      0.93      0.93      2211
weighted avg       0.93      0.93      0.93      2211

