# Stepwise Regression

## problem definition
- select the important features from set of features available in the data set

### import required packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### load the data

In [2]:
df = pd.read_csv('Advertising.csv')
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [19]:
df.isna().sum()

TV           0
radio        0
newspaper    0
sales        0
dtype: int64

### pre-processing

In [3]:
# split the data into x and y
x = df.drop('sales', axis=1)
y = df['sales']

### forward selection method

- in this method, we will start with empty model and keep adding features one by one 
- we will make sure if the feature is required using test criterion (mae)

In [33]:
def stepwise_regression(x, y):
    # get the LinearRegression class
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_absolute_error
    
    # get the columns from x
    columns = x.columns
    
    # create a list of selected columns
    selected_columns = []
    
    # create an empty data set
    new_x = pd.DataFrame()
    
    # remember the last mae
    last_mae = 0
    
    for column in columns:
        # create a temporary df for model building
        tmp_x = pd.DataFrame(x[column])
        for tmp_column in selected_columns:
            tmp_x[tmp_column] = x[tmp_column]
    
        # create a model using tmp_x and y
        tmp_model = LinearRegression()
        tmp_model.fit(tmp_x, y)
    
        # get the p-value (value of test criterion)
        y_pred = tmp_model.predict(tmp_x)
        y_true = y
        mae = mean_absolute_error(y_true, y_pred)
    
        print(f"last_mae = {last_mae}, mae = {mae}")
    
        # check if this is the first feature
        if last_mae == 0:
            selected_columns.append(column)
        else:
            if last_mae > mae:
                selected_columns.append(column)
                last_mae = mae
            else:
                print(f"since current mae is >= last_mase, {column} is not important")
        
    
    print(selected_columns)

In [26]:
stepwise_regression(x, y)

last_mae = 0, mae = 2.549806038927486
last_mae = 2.549806038927486, mae = 1.2537471644234006
last_mae = 1.2537471644234006, mae = 1.2520112296870682
['TV', 'radio', 'newspaper']


## Stepwise regression for fortune 1000 companies

### load the data

In [27]:
df = pd.read_csv('fortune1000.csv')
df.head()

Unnamed: 0,Rank,Company,Sector,Industry,Location,Revenue,Profits,Employees
0,1,Walmart,Retailing,General Merchandisers,"Bentonville, AR",482130,14694,2300000
1,2,Exxon Mobil,Energy,Petroleum Refining,"Irving, TX",246204,16150,75600
2,3,Apple,Technology,"Computers, Office Equipment","Cupertino, CA",233715,53394,110000
3,4,Berkshire Hathaway,Financials,Insurance: Property and Casualty (Stock),"Omaha, NE",210821,24083,331000
4,5,McKesson,Health Care,Wholesalers: Health Care,"San Francisco, CA",181241,1476,70400


### pre-processing 

In [28]:
# since the company and Rank columns are uniques values, drop them
df.drop(['Rank', 'Company'], axis=1, inplace=True)

In [29]:
# convert all qualitative columns to quantitative ones
from sklearn.preprocessing import LabelEncoder

# create encoders
df['Sector'] = LabelEncoder().fit_transform(df['Sector'])
df['Industry'] = LabelEncoder().fit_transform(df['Industry'])
df['Location'] = LabelEncoder().fit_transform(df['Location'])

In [30]:
df.head()

Unnamed: 0,Sector,Industry,Location,Revenue,Profits,Employees
0,16,26,23,482130,14694,2300000
1,4,50,174,246204,16150,75600
2,17,11,92,233715,53394,110000
3,6,39,279,210821,24083,331000
4,9,72,327,181241,1476,70400


In [31]:
# split the data into x and y
x = df.drop('Profits', axis=1)
y = df['Profits']

In [34]:
stepwise_regression(x, y)

last_mae = 0, mae = 1410.4435612245827
last_mae = 0, mae = 1443.4148214849354
last_mae = 0, mae = 1444.8216464327206
last_mae = 0, mae = 1039.4352051670244
last_mae = 0, mae = 1081.5705733681684
['Sector', 'Industry', 'Location', 'Revenue', 'Employees']
