Link to the competition: [Techgig-HPE Competition](https://www.techgig.com/codegladiators/machine-learning)
### Task:

The candidates have to read the data and create a model based on the data analysis to identify if the website is legitimate or a phishing website. The Result will be determined by the two values [1, -1] where 1 represent the legitimate and -1 represents phishing.

### Data Description:

The data set consists of 30 features of a phishing website. The value of attributes can be [1, 0, -1] except the key value which is incremental.

1 represents the legitimate

0 represents suspicious

-1 represents phishing

In [1]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost.sklearn import XGBClassifier


from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score, balanced_accuracy_score, log_loss, confusion_matrix,classification_report

In [3]:
trainData = pd.read_csv('/content/drive/MyDrive/HPE Hackathon/Phising_Training_Dataset.csv')
print(trainData.head(3))
submitData = pd.read_csv('/content/drive/MyDrive/HPE Hackathon/Phising_Testing_Dataset.csv') 
print(submitData.head(3))

     key  having_IP  URL_Length  Shortining_Service  having_At_Symbol  \
0  12344         -1           1                   1                 1   
1  12345          1           1                   1                 1   
2  12346          1           0                   1                 1   

   double_slash_redirecting  Prefix_Suffix  having_Sub_Domain  SSLfinal_State  \
0                        -1             -1                 -1              -1   
1                         1             -1                  0               1   
2                         1             -1                 -1              -1   

   Domain_registeration_length  ...  popUpWidnow  Iframe  age_of_domain  \
0                           -1  ...            1       1             -1   
1                           -1  ...            1       1             -1   
2                           -1  ...            1       1              1   

   DNSRecord  web_traffic  Page_Rank  Google_Index  Links_pointing_to_page  \
0  

In [15]:
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8955 entries, 0 to 8954
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   key                          8955 non-null   int64
 1   having_IP                    8955 non-null   int64
 2   URL_Length                   8955 non-null   int64
 3   Shortining_Service           8955 non-null   int64
 4   having_At_Symbol             8955 non-null   int64
 5   double_slash_redirecting     8955 non-null   int64
 6   Prefix_Suffix                8955 non-null   int64
 7   having_Sub_Domain            8955 non-null   int64
 8   SSLfinal_State               8955 non-null   int64
 9   Domain_registeration_length  8955 non-null   int64
 10  Favicon                      8955 non-null   int64
 11  port                         8955 non-null   int64
 12  HTTPS_token                  8955 non-null   int64
 13  Request_URL                  8955 non-null   int

In [None]:
submitData.info()

In [None]:
print("Training Data:")
for col in trainData.columns[1:]:
    print(trainData[col].value_counts())

In [None]:
#trainData = trainData.sample(frac=1).reset_index(drop=True)

In [None]:
trainData.head(3)

In [4]:
y= trainData['Result']
X = trainData.drop(['Result','key'],axis=1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,test_size=0.20)

Multiple Models

In [6]:
values=[]
models = [RandomForestClassifier(random_state=42),LogisticRegression(),DecisionTreeClassifier(random_state=42),SVC(random_state=42),KNeighborsClassifier(),XGBClassifier()]
for m in models:
  m.fit(X_train,y_train)
  y_pred=m.predict(X_test)
  print(m)
  print(classification_report(y_test,y_pred)[1])
  print(confusion_matrix(y_test,y_pred))
  values.append([str(m)[:10],f1_score(y_test,y_pred), roc_auc_score(y_test,y_pred), recall_score(y_test,y_pred), precision_score(y_test,y_pred), 
      balanced_accuracy_score(y_test,y_pred), log_loss(y_test,y_pred)])
  print('==========================================================')

RandomForestClassifier(random_state=42)
 
[[757  27]
 [ 19 988]]
LogisticRegression()
 
[[716  68]
 [ 53 954]]
DecisionTreeClassifier(random_state=42)
 
[[753  31]
 [ 47 960]]
SVC(random_state=42)
 
[[726  58]
 [ 33 974]]
KNeighborsClassifier()
 
[[736  48]
 [ 43 964]]
XGBClassifier()
 
[[729  55]
 [ 37 970]]


In [None]:
values.insert(0,['Model','f1_score','roc_auc_score','recall_score','precision_score','balanced_accuracy_score','log_loss'])
results= pd.DataFrame(values[1:],columns=values[0])

In [None]:
results[['Model','f1_score','roc_auc_score','recall_score']]

Unnamed: 0,Model,f1_score,roc_auc_score,recall_score
0,RandomFore,0.969444,0.968384,0.976036
1,LogisticRe,0.917414,0.914291,0.928109
2,DecisionTr,0.958842,0.957447,0.965674
3,SVC(),0.942793,0.940418,0.955311
4,KNeighbors,0.944591,0.944773,0.927461
5,XGBClassif,0.947807,0.945761,0.958549


In [None]:
results[['precision_score','balanced_accuracy_score','log_loss']]

Unnamed: 0,precision_score,balanced_accuracy_score,log_loss
0,0.962939,0.968384,1.08614
1,0.906962,0.914291,2.949726
2,0.952107,0.957447,1.46343
3,0.930599,0.940418,2.046517
4,0.962366,0.944773,1.920741
5,0.937302,0.945761,1.863588


XGBoost Implementation 

In [None]:
xgb_cl = xgb.XGBClassifier()
xgb_cl.fit(X_train, y_train)
y_preds = xgb_cl.predict(X_test)
print("Initial Training for XGBoost:",accuracy_score(y_test, y_preds))
param_grid_xgb = {
    "max_depth": [7,9,10,12,14],
    "learning_rate": [0.1,0.09,0.08],
    "gamma": [0.1, 0.25, 5],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}

0.9455113890129522

In [None]:
grid_cv = GridSearchCV(xgb_cl, param_grid, n_jobs=-1, cv=3, scoring="roc_auc")
_ = grid_cv.fit(X_train, y_train)
print(grid_cv.best_params_)
print(grid_cv.best_score_)
xgCl = xgb.XGBClassifier(**grid_cv.best_params_,)
xgCl.fit(X_train, y_train)
y_preds = xgCl.predict(X_test)
confusion_matrix(y_test,y_preds)

Classification using SVM

In [None]:
param_grid_svc = {
    'C': [100,120,140,160,180,200],
    'gamma': ['scale', 'auto'],
    'kernel' : ['linear', 'poly', 'rbf', 'sigmoid']}

grid_cv = GridSearchCV(models[3], param_grid_svc,cv=5,verbose=3)
_ = grid_cv.fit(X_train, y_train)

In [8]:
print(grid_cv.best_params_)
print(grid_cv.best_score_)
svCl = SVC(**grid_cv.best_params_,random_state=42)
svCl.fit(X_train, y_train)
y_preds = svCl.predict(X_test)
confusion_matrix(y_test,y_preds)

{'C': 140, 'gamma': 'auto', 'kernel': 'rbf'}
0.957285766080458


array([[749,  35],
       [ 29, 978]])

Random Forests

In [13]:
param_grid_rf = {
    'n_estimators': [100,120,140,160,180,200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [25,30,35,40],
    'criterion' :['gini']}

grid_cv = GridSearchCV(models[0], param_grid_rf,cv=5)
_ = grid_cv.fit(X_train, y_train)

In [14]:
print(grid_cv.best_params_)
print(grid_cv.best_score_)
rfCl = RandomForestClassifier(**grid_cv.best_params_,random_state=42)
rfCl.fit(X_train, y_train)
y_preds = rfCl.predict(X_test)
confusion_matrix(y_test,y_preds)

{'criterion': 'gini', 'max_depth': 35, 'max_features': 'sqrt', 'n_estimators': 100}
0.9669183492068442


array([[742,  42],
       [ 12, 995]])

Decision trees GridSearch

In [11]:
param_grid_dtc = {
    'criterion' : ['gini', 'entropy'],
    'splitter' : ['best', 'random'],
    'max_depth' : [15,17,20,23,27,30]}

grid_cv = GridSearchCV(models[2], param_grid_dtc,cv=5)
_ = grid_cv.fit(X_train, y_train)

In [12]:
print(grid_cv.best_params_)
print(grid_cv.best_score_)
dtCl = DecisionTreeClassifier(**grid_cv.best_params_,random_state=42)
dtCl.fit(X_train, y_train)
y_preds = dtCl.predict(X_test)
confusion_matrix(y_test,y_preds)

{'criterion': 'entropy', 'max_depth': 27, 'splitter': 'random'}
0.9565876369845657


array([[738,  46],
       [ 23, 984]])

Writing results to csv file

In [None]:
finalSubmission = pd.DataFrame(columns=['key','Result'])
finalSubmission['key'] = submitData['key']
X_prediction = submitData.drop('key',axis=1)
y_prediction = model_DTC.predict(X_prediction)
finalSubmission['Result'] = y_prediction
finalSubmission.to_csv('/content/drive/MyDrive/HPE Hackathon/submissionFile.csv',index=False)