# Week 3: Linear Regression 3
Covers linear regression with forward and backward feature selection, principal component regression (PCR), and partial least squares regression (PLSR).

In [1]:
# Standard Libraries
import os
import time
import math
import io
import zipfile
import requests
import kagglehub
from urllib.parse import urlparse
from itertools import chain, combinations
from collections import Counter
import re

# Data Science Libraries
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.ticker as mticker
import seaborn as sns

# Scikit-learn (Machine Learning)
from sklearn.model_selection import (
    train_test_split, 
    cross_val_score, 
    GridSearchCV, 
    RandomizedSearchCV, 
    RepeatedKFold,
    RepeatedStratifiedKFold
)
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn import decomposition
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

# Progress Tracking
from tqdm import tqdm

# Language Processing
from textblob import TextBlob
from datetime import datetime, timedelta
import string

# Global seed, presumably meant to keep splits and estimators reproducible.
# NOTE(review): none of the sequential_feature_selection calls visible in this
# notebook section pass it explicitly; they rely on that helper's own
# random_state default instead. Confirm whether the global should be passed.
random_state = 42

In [2]:
def sequential_feature_selection(X, y, direction='forward', test_size=0.2, random_state=42, cv=5, scoring='r2',
                                 n_features_to_select='auto', tol=None):
    """Run sequential feature selection with a linear model and score it on a hold-out split.

    The data is split into train and test partitions. A scikit-learn
    ``SequentialFeatureSelector`` wrapped around ``LinearRegression`` is fitted
    on the training partition only, a fresh ``LinearRegression`` is trained on
    the selected training columns, and its R2 is computed on the test partition.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix. Anything that is not a DataFrame is wrapped in one, in
        which case the returned feature names are integer column positions.
    y : array-like
        Target values aligned with the rows of ``X``.
    direction : {'forward', 'backward'}, default='forward'
        Passed to ``SequentialFeatureSelector``.
    test_size : float or int, default=0.2
        Passed to ``train_test_split``.
    random_state : int, default=42
        Seed for the train/test split.
    cv : int or CV splitter, default=5
        Cross-validation strategy used inside the selector.
    scoring : str or callable, default='r2'
        Scoring used inside the selector.
    n_features_to_select : 'auto', int or float, default='auto'
        Passed to ``SequentialFeatureSelector``. According to scikit-learn's
        documentation, ``'auto'`` combined with ``tol=None`` keeps half of the
        features; pass an int (or a fraction) to control the subset size.
    tol : float or None, default=None
        Passed to ``SequentialFeatureSelector``; only relevant together with
        ``n_features_to_select='auto'``.

    Returns
    -------
    selected_features : list
        Names of the selected columns, in the column order of ``X``. The order
        is NOT a ranking by importance.
    model : LinearRegression
        Model fitted on the selected columns of the training partition.
    r2_test : float
        R2 of ``model`` on the held-out test partition.
    """
    # Column labels are needed further down; previously a non-DataFrame X
    # failed with AttributeError at `.columns`.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    # Hold out a test partition that the selector never sees.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Selection is cross-validated on the training partition only.
    sfs = SFS(
        estimator=LinearRegression(),
        n_features_to_select=n_features_to_select,
        tol=tol,
        direction=direction,
        scoring=scoring,
        cv=cv,
    )
    sfs.fit(X_train, y_train)

    # Boolean-mask indexing keeps the original column order.
    selected_features = X_train.columns[sfs.get_support()]

    # Refit a plain linear model on the chosen columns and score it out of sample.
    model = LinearRegression().fit(X_train[selected_features], y_train)
    r2_test = r2_score(y_test, model.predict(X_test[selected_features]))

    return list(selected_features), model, r2_test


## Dataset 1: Web Page Phishing Detection

In [3]:
# Read the pre-cleaned web page phishing dataset from the working directory
cleaned_webpage = pd.read_csv(filepath_or_buffer='cleaned_webpage.csv')
# Preview the first five rows
cleaned_webpage.head(n=5)

Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,legitimate
0,37,19,0,3,0,0,0,0,0,0,...,0,1,0,45,0,0,1,1,4,1
1,77,23,1,1,0,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,0
2,126,50,1,4,1,0,1,2,0,3,...,1,0,0,14,4004,5828815,0,1,0,0
3,18,11,0,2,0,0,0,0,0,0,...,1,0,0,62,0,107721,0,0,3,1
4,55,15,0,2,2,0,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,1


In [4]:
# Target is the 'legitimate' column; every other column is a predictor
webpage_target = cleaned_webpage['legitimate']
webpage_features = cleaned_webpage.drop(['legitimate'], axis=1)

In [5]:
# Forward selection (A) on the web page dataset
features1a, model1a, r2_score1a = sequential_feature_selection(
    X=webpage_features,
    y=webpage_target,
    direction='forward',
)

In [6]:
# Backward selection (B) on the web page dataset
features1b, model1b, r2_score1b = sequential_feature_selection(
    X=webpage_features,
    y=webpage_target,
    direction='backward',
)

In [7]:
# Report the test-split R2 and the first five selected features per direction.
# NOTE: the feature lists follow the DataFrame's column order, so the [:5]
# slice is simply the first five selected columns, not a ranking by importance.
print(
    'Summary of Results',
    '-'*100,
    'Forward Selection:',
    f'Top 5 Features: {features1a[:5]}',
    f'R2 Score: {r2_score1a:.4f}',
    '\nBackward Selection:',
    f'Top 5 Features: {features1b[:5]}',
    f'R2 Score: {r2_score1b:.4f}',
    '-'*100,
    sep='\n',
)

Summary of Results
----------------------------------------------------------------------------------------------------
Forward Selection:
Top 5 Features: ['length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens']
R2 Score: 0.7443

Backward Selection:
Top 5 Features: ['length_url', 'length_hostname', 'ip', 'nb_hyphens', 'nb_qm']
R2 Score: 0.7450
----------------------------------------------------------------------------------------------------


## Dataset 2: Phishing Email Detection

In [8]:
# Read the pre-cleaned phishing email dataset from the working directory
cleaned_email = pd.read_csv(filepath_or_buffer='cleaned_phishing_email.csv')
# Preview the first five rows
cleaned_email.head(n=5)

Unnamed: 0,polarity,subjectivity,unique_words,char_count,word_count,uppercase_ratio,symbol_ratio,kw_urgent,kw_verify,kw_account,kw_click,kw_suspend,kw_update,kw_login,kw_confirm,currency_symbol_count,phone_number_count,Safe Email
0,0.201493,0.514213,110.0,1030.0,230.0,0.0,0.059223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.009375,0.084375,53.0,479.0,91.0,0.0,0.037578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.09154,0.515783,102.0,1245.0,305.0,0.0,0.077108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.260069,0.690278,73.0,688.0,96.0,0.056686,0.162791,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0
4,0.152579,0.505258,60.0,441.0,91.0,0.0,0.061224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [9]:
# Target is the 'Safe Email' column; every other column is a predictor
email_target = cleaned_email['Safe Email']
email_features = cleaned_email.drop(['Safe Email'], axis=1)

In [10]:
# Forward selection (A) on the phishing email dataset
features2a, model2a, r2_score2a = sequential_feature_selection(
    X=email_features,
    y=email_target,
    direction='forward',
)

In [11]:
# Backward selection (B) on the phishing email dataset
features2b, model2b, r2_score2b = sequential_feature_selection(
    X=email_features,
    y=email_target,
    direction='backward',
)

In [12]:
# Report the test-split R2 and the first five selected features per direction.
# NOTE: the feature lists follow the DataFrame's column order, so the [:5]
# slice is simply the first five selected columns, not a ranking by importance.
print(
    'Summary of Results',
    '-'*100,
    'Forward Selection:',
    f'Top 5 Features: {features2a[:5]}',
    f'R2 Score: {r2_score2a:.4f}',
    '\nBackward Selection:',
    f'Top 5 Features: {features2b[:5]}',
    f'R2 Score: {r2_score2b:.4f}',
    '-'*100,
    sep='\n',
)

Summary of Results
----------------------------------------------------------------------------------------------------
Forward Selection:
Top 5 Features: ['polarity', 'subjectivity', 'unique_words', 'symbol_ratio', 'kw_account']
R2 Score: -2.2059

Backward Selection:
Top 5 Features: ['polarity', 'subjectivity', 'unique_words', 'symbol_ratio', 'kw_urgent']
R2 Score: -2.2496
----------------------------------------------------------------------------------------------------


## Dataset 3: Credit Card Transactions Fraud Detection Dataset

In [13]:
# Read the pre-cleaned credit card transactions dataset from the working directory
cleaned_credit_card = pd.read_csv(filepath_or_buffer='cleaned_credit_card.csv')
# Preview the first five rows
cleaned_credit_card.head(n=5)

Unnamed: 0,category,amt,gender,city,state,lat,long,city_pop,job,unix_time,merch_lat,merch_long,age,max_repeats,is_fraud
0,0,2.86,0,0,0,33.9659,-80.9355,333497,0,1371816865,33.986391,-81.200714,57,3,0
1,0,29.84,1,1,1,40.3207,-110.436,302,1,1371816873,39.450498,-109.960431,35,4,0
2,1,41.28,1,2,2,40.6729,-73.5365,34496,2,1371816893,40.49581,-74.196111,54,4,0
3,2,60.05,0,3,3,28.5697,-80.8191,54767,3,1371816915,28.812398,-80.883061,38,4,0
4,3,3.19,0,4,4,44.2529,-85.017,1126,4,1371816917,44.959148,-85.884734,70,3,0


In [14]:
# Target is the 'is_fraud' column; every other column is a predictor
credit_target = cleaned_credit_card['is_fraud']
credit_features = cleaned_credit_card.drop(['is_fraud'], axis=1)

In [15]:
# Forward selection (A) on the credit card dataset
features3a, model3a, r2_score3a = sequential_feature_selection(
    X=credit_features,
    y=credit_target,
    direction='forward',
)

In [16]:
# Backward selection (B) on the credit card dataset
features3b, model3b, r2_score3b = sequential_feature_selection(
    X=credit_features,
    y=credit_target,
    direction='backward',
)

In [17]:
# Report the test-split R2 and the first five selected features per direction.
# NOTE: the feature lists follow the DataFrame's column order, so the [:5]
# slice is simply the first five selected columns, not a ranking by importance.
print(
    'Summary of Results',
    '-'*100,
    'Forward Selection:',
    f'Top 5 Features: {features3a[:5]}',
    f'R2 Score: {r2_score3a:.4f}',
    '\nBackward Selection:',
    f'Top 5 Features: {features3b[:5]}',
    f'R2 Score: {r2_score3b:.4f}',
    '-'*100,
    sep='\n',
)

Summary of Results
----------------------------------------------------------------------------------------------------
Forward Selection:
Top 5 Features: ['category', 'amt', 'gender', 'city', 'city_pop']
R2 Score: 0.0400

Backward Selection:
Top 5 Features: ['category', 'amt', 'gender', 'city', 'city_pop']
R2 Score: 0.0400
----------------------------------------------------------------------------------------------------
