In [10]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE

In [4]:
train_df = pd.read_csv('../final_data/Phishing_train_dataset.csv')
test_df = pd.read_csv('../final_data/Phishing_test_dataset.csv')

# 1 - Phishing
# 0 - legitimate
# 2 - suspicious

In [5]:
X_train, y_train = train_df.drop('Result', axis=1), train_df.Result
X_test, y_test = test_df.drop('Result', axis=1), test_df.Result

### Scartch Model

In [6]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [7]:
lr.score(X_test, y_test)

0.7654723127035831

### With hyperparameter tuning

In [8]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    'max_iter': [100, 500, 1000]
}

lr = LogisticRegression()

cv = GridSearchCV(lr,
                  param_grid=param_grid,
                  cv=5,
                  scoring='accuracy')

cv.fit(X_train, y_train)

print(f"Best Score: {cv.best_score_}, Best Params: {cv.best_params_}")

best_model = cv.best_estimator_

best_model.fit(X_train, y_train)

225 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Acer\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Acer\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Acer\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

--------------------------

Best Score: 0.7618172180692222, Best Params: {'C': 1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}


In [9]:
best_model.score(X_test, y_test)

0.7654723127035831

### Recursive Feature Elimination

In [38]:
lr = LogisticRegression(**cv.best_params_)

rfe = RFE(lr, n_features_to_select=10)
rfe.fit(X_train, y_train)

X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

print(rfe.ranking_)
print(rfe.support_)

[1 1 1 1 1 1 1 1 1 1]
[ True  True  True  True  True  True  True  True  True  True]


In [39]:
lr.fit(X_train_rfe, y_train)

In [40]:
lr.score(X_test_rfe, y_test)

0.7654723127035831