In [None]:
# Colab-only setup: switch on Colab's data_table formatter for DataFrame output.
# NOTE(review): google.colab is only importable inside a Colab runtime — this cell fails elsewhere.
from google.colab import data_table
data_table.enable_dataframe_formatter()

In [None]:
cd '/content/drive/MyDrive/Data Analytics/30 EDA - Feature Selection'

/content/drive/MyDrive/Data Analytics/30 EDA - Feature Selection


# Missing Value Filter

Problem statement
You have been provided with a "startup" dataset. In this dataset, the information present is regarding the spending on various departments by newly founded startups and the profit they are generating.

You need to evaluate this dataset and find out which features should be removed after using the missing value ratio technique.

Note: The startup dataset is provided as a part of this question.

Expected Output

Print the list of features that should be removed.

In [None]:
import pandas as pd

df = pd.read_csv("startup.csv")

# Percentage of null entries in every column.
null_counts = df.isnull().sum()
null_pct = null_counts / len(df) * 100

# Boolean mask: True for the columns whose missing-value ratio exceeds 40%.
missing = null_pct > 40

# Keep only the labels flagged by the mask and print them as a plain list.
print(missing.index[missing].tolist())

['Advertisment']


# Variance Filter

Problem statement
A new feature called "Item_Importance" is proposed for the Big Mart dataset. It is added to find out whether there is a relationship between the weight and the visibility of an item. The description of this feature is as follows:

“Item_Importance”: This feature helps in finding the importance of an item by using a combination of “Item_Weight” and “Item_Visibility”. The formula used to create this feature is:

Item_Importance = (Item_Visibility*100)/Item_Weight
Your task is to first create this feature and then find out whether it should be selected, using the low variance filter technique.

Expected Output

Print the value of variance for Item_Importance rounded to 6 decimal places after performing standardization.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("bigmart.csv")

# Item_Importance = visibility (scaled by 100) per unit of weight.
df['Item_Importance'] = df['Item_Visibility'] * 100 / df['Item_Weight']

# Standardizing
# StandardScaler works on 2-D (rows x features) input, so the single column is
# turned into an (n, 1) column vector before it is scaled.
importance_column = np.asarray(df['Item_Importance']).reshape(-1, 1)
scaler = StandardScaler()
df['Item_Importance'] = scaler.fit_transform(importance_column)

# Variance of the standardized feature (pandas default).
item_importance_var = df['Item_Importance'].var()
print(round(item_importance_var, 6))

1.000142


In [None]:
# Labels of the columns pandas treats as numeric, taken from the index of the
# per-column variance result.
column_variances = df.var(numeric_only=True)
numeric = column_variances.index

# Scale on a copy so the frame loaded earlier is left untouched.
df_scaled = df.copy()
scaler = StandardScaler()
df_scaled[numeric] = scaler.fit_transform(df[numeric])

# Variance of every numeric column after standardization.
df_scaled.var(numeric_only=True)

Item_Weight                  1.000142
Item_Visibility              1.000117
Item_MRP                     1.000117
Outlet_Establishment_Year    1.000117
Item_Outlet_Sales            1.000117
Item_Importance              1.000142
dtype: float64

# Correlation Filter

Problem statement
In the startup dataset, you need to find the group of features which are highly correlated with each other. Features whose correlation is greater than 0.8 need to be selected.

Startup dataset is provided as a part of this question.

Expected Output

Print the pair of features whose correlation is greater than 0.8.
Output Format

Print a list of each pair of correlated features in the format [[Feature_1, Feature_2, correlation rounded to 3 decimal places], ...]
...
Note: Do not print the correlation of a feature with itself.

In [None]:
#duplicate pairs also have to be printed
#(both [A, B, r] and [B, A, r] appear in the result; only the diagonal is skipped)

import pandas as pd

df = pd.read_csv("startup.csv")

# Correlate the numeric columns only, and take the labels from the correlation
# matrix itself: df.columns and corr.columns differ as soon as the frame holds a
# non-numeric column, which would mislabel (or overrun) the positional lookups.
corr = df.corr(numeric_only=True)
features = corr.columns
n = len(features)

# Pairs whose absolute correlation exceeds this value count as highly correlated.
threshold = 0.8

# float(...) keeps the printed list in plain-number form regardless of how the
# installed NumPy version renders its scalar types.
corr_list = [
    [features[i], features[j], float(round(corr.iloc[i, j], 3))]
    for i in range(n)
    for j in range(n)
    if i != j and abs(corr.iloc[i, j]) > threshold
]

print(corr_list)

[['R&D Spend', 'Advertisment', 0.913], ['R&D Spend', 'Profit', 0.973], ['Advertisment', 'R&D Spend', 0.913], ['Advertisment', 'Marketing Spend', 1.0], ['Advertisment', 'Profit', 0.901], ['Marketing Spend', 'Advertisment', 1.0], ['Profit', 'R&D Spend', 0.973], ['Profit', 'Advertisment', 0.901]]


In [None]:
# Display the full pairwise correlation matrix of df for inspection.
df.corr()

Unnamed: 0,R&D Spend,Administration,Advertisment,Marketing Spend,Profit
R&D Spend,1.0,0.241955,0.913209,0.724248,0.9729
Administration,0.241955,1.0,0.070372,-0.032154,0.200717
Advertisment,0.913209,0.070372,1.0,1.0,0.901407
Marketing Spend,0.724248,-0.032154,1.0,1.0,0.747766
Profit,0.9729,0.200717,0.901407,0.747766,1.0


# Backward Selection

Problem statement
Your task is to check all the numerical features present in the “Startup” dataset and figure out which feature can be eliminated using the “Backward Feature Selection” technique.

Startup dataset is provided as a part of this question.

Expected Output

Print the support for each column in a list

In [None]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Rows with missing values are dropped while loading, so df is built in one step
# instead of being mutated in place afterwards.
df = pd.read_csv('startup.csv').dropna()

#extracting all numeric columns except the target column
# The target is excluded by name rather than by position, so the feature list
# stays correct even if 'Profit' is not the last numeric column of the file.
cols = df.select_dtypes(include = 'number').columns.drop('Profit').tolist()

# Recursive feature elimination: refit the linear model and discard one feature
# per step until 3 features remain.
lre = LinearRegression()
rfe = RFE(estimator = lre, n_features_to_select= 3, step = 1)
rfe.fit(df[cols], df['Profit'])

# One boolean per entry of cols, in the same order: True = kept, False = eliminated.
print(rfe.support_.tolist())

[True, True, True, False]


In [None]:
# Preview the first rows of df (rows with missing values were already dropped in the preceding cell).
df.head()

Unnamed: 0,R&D Spend,Administration,Advertisment,Marketing Spend,Profit
0,165349.2,136897.8,471784.1,471784.1,192261.83
1,162597.7,151377.59,443898.53,443898.53,191792.06
2,153441.51,101145.55,407934.54,407934.54,191050.39
3,144372.41,118671.85,383199.62,383199.62,182901.99
4,142107.34,91391.77,366168.42,366168.42,166187.94


# Forward Selection

Problem statement
Your task is to check all the numerical features present in the “Startup” dataset and figure out which feature can be eliminated using the “Forward Feature Selection” technique.

Startup dataset is provided as a part of this question.

Expected Output

Print the column names that need to be selected after Forward selection.

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import f_regression

# Load the data and drop rows with NaN in one step (no in-place mutation).
df = pd.read_csv("startup.csv").dropna()

# Numeric feature labels, computed once. The response variable is excluded by
# name rather than by position, so the result does not depend on 'Profit' being
# the last numeric column.
cols_series = df.select_dtypes(include = 'number').columns.drop('Profit')

# Plain-list form of the same labels, used to slice the frame.
cols = cols_series.tolist()

# Univariate F-test of every feature against Profit; fr is the pair
# (F-statistics, p-values), each aligned with cols.
fr = f_regression(df[cols], df.Profit)

# numpy array containing the p-values
p = np.array(fr[1])

# printing features with p-value less than the 5% significance level
print(cols_series[p < 0.05].tolist())

['R&D Spend', 'Advertisment', 'Marketing Spend']


# Create Feature

Problem statement
You have been asked to find out the cost of a product if GST is added to it. For a single product, the GST amount is 18% of the Item_MRP. You need to create a new feature called “Item_MRP_GST” and fill in the values for each row.

Expected Output

Print the mean of the new feature.

In [None]:
#mean to be printed should be rounded to 2 decimal places

import pandas as pd

df = pd.read_csv("bigmart.csv")

# Price including GST: the MRP plus 18% of the MRP.
df['Item_MRP_GST'] = df['Item_MRP'].mul(1.18)

# Average GST-inclusive price across all rows.
mean = df['Item_MRP_GST'].mean()

print(round(mean, 2))

166.37


# Feature Selection on Cars Dataset

## Feature Removal

Problem statement
In the cars dataset, you need to perform a low variance filter on the scaled dataset to select the most relevant features. After filtering, remove the features that do not pass the low variance threshold.

Expected Output

Print the features that need to be removed.

In [None]:
#output is 'No Output', so don't print anything

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

df = pd.read_csv("cars.csv")

# Every numeric column except the last one is treated as a feature.
numeric_labels = df.select_dtypes(include='number').columns
cols = numeric_labels[:-1]

# Standardize the feature columns in place ...
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[cols])
df[cols] = scaled_values

# ... and inspect the variance of each one afterwards.
df[cols].var()

rowno               1.000084
Year                1.000084
Engine_HP           1.000084
Engine_Cylinders    1.000084
Number_of_Doors     1.000084
highway_MPG         1.000084
city_mpg            1.000084
Popularity          1.000084
dtype: float64

In [None]:
#correct output is no output; therefore don't print anything

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

df = pd.read_csv("cars.csv")

# Feature columns addressed by name instead of by hard-coded position: a change
# in the CSV's column order now either has no effect or fails loudly with a
# KeyError, rather than silently scaling a different set of columns.
cols = pd.Index([
    'rowno', 'Year', 'Engine_HP', 'Engine_Cylinders',
    'Number_of_Doors', 'highway_MPG', 'city_mpg', 'Popularity',
])

# Standardize the selected columns in place.
df[cols] = StandardScaler().fit_transform(df[cols])

# Variance of each standardized feature.
df[cols].var()

rowno               1.000084
Year                1.000084
Engine_HP           1.000084
Engine_Cylinders    1.000084
Number_of_Doors     1.000084
highway_MPG         1.000084
city_mpg            1.000084
Popularity          1.000084
dtype: float64

In [None]:
# Preview the first rows of the cars data.
# NOTE(review): the saved output shows unscaled values (e.g. Year 2011) although the
# preceding cell standardizes df in place — the output appears to come from a
# different execution order; confirm with Restart & Run All.
df.head()

Unnamed: 0,rowno,Make,Model,Year,Engine_Fuel_Type,Engine_HP,Engine_Cylinders,Transmission_Type,Driven_Wheels,Number_of_Doors,Market_Category,Vehicle_Size,Vehicle_Style,highway_MPG,city_mpg,Popularity,MSRP
0,0,Infiniti,QX56,2011,premium unleaded (recommended),400.0,8.0,AUTOMATIC,four wheel drive,4,Luxury,Large,4dr SUV,20,14,190,61800
1,1,Chevrolet,Cavalier,2005,regular unleaded,140.0,4.0,MANUAL,front wheel drive,2,,Compact,Coupe,33,23,1385,17510
2,2,Toyota,Tundra,2016,regular unleaded,381.0,8.0,AUTOMATIC,rear wheel drive,4,,Large,Extended Cab Pickup,18,13,2031,38670
3,3,Volkswagen,Jetta,2015,regular unleaded,170.0,4.0,AUTOMATIC,front wheel drive,4,,Midsize,Sedan,37,25,873,23650
4,4,Infiniti,Q50,2017,premium unleaded (required),400.0,6.0,AUTOMATIC,all wheel drive,4,"Factory Tuner,Luxury,High-Performance",Midsize,Sedan,26,19,190,50700


In [None]:
df = pd.read_csv("cars.csv")

# Show which column labels sit at a hand-picked set of positions.
positions = [0, 3, 5, 6, 8, 9, 11, 13, 14]
df.columns[positions]

Index(['rowno', 'Year', 'Engine_HP', 'Engine_Cylinders', 'Driven_Wheels',
       'Number_of_Doors', 'Vehicle_Size', 'highway_MPG', 'city_mpg'],
      dtype='object')

## Correlation Filtering

Problem statement
After selecting the features using the low variance filtering technique, you need to perform high correlation filtering on the remaining features.

Expected Output

Print the features that need to be removed.

In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv("cars.csv")

# Labels of the numeric columns (an Index of names, not a DataFrame).
df_num = df.select_dtypes(include='number').columns

# Absolute correlation of every numeric column with MSRP.
msrp_corr = df.corr(numeric_only=True)['MSRP']
corr = np.abs(msrp_corr)

In [None]:
# Only the last expression of a cell is displayed, so the first two lines
# evaluate and are discarded; the cell's output is `corr` alone.
# NOTE(review): the saved output contains negative values, which the np.abs(...)
# used to build `corr` cannot produce — the output looks stale; re-run to confirm.
df.columns
df_num
corr

rowno              -0.001632
Year                0.227536
Engine_HP           0.659064
Engine_Cylinders    0.529165
Number_of_Doors    -0.126031
highway_MPG        -0.160063
city_mpg           -0.157708
Popularity         -0.048549
MSRP                1.000000
Name: MSRP, dtype: float64

## Bidirectional feature selection

Problem statement
After performing basic filtering techniques on the cars dataset, you need to verify if any other feature is there which is not relevant using the forward and backward selection techniques.

After performing a forward and backward selection of features, you need to remove the features which are above or below a significance level.

Note: When forward and backward feature selection is performed together, then it is called the Bidirectional feature selection.

Expected Output

Print the features that need to be removed.

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE, f_regression
from sklearn.linear_model import LinearRegression

df = pd.read_csv("cars.csv")

#selecting numeric column names, except response variable
#(positional: relies on the response being the last numeric column)
cols = df.select_dtypes(include = 'number').columns[:-1]

#backward selection: recursive feature elimination with a linear model,
#discarding one feature per step until 4 remain
lre = LinearRegression()
rfe = RFE(estimator = lre, n_features_to_select = 4, step = 1)
rfe.fit(df[cols], df['MSRP'])

#support_ is True for kept features, so its negation marks the eliminated ones;
#stored in a name because a bare mid-cell expression is never displayed
removed_backward = cols[~rfe.support_]

#forward selection: univariate F-test of each feature against MSRP;
#fr is the pair (F-statistics, p-values)
fr = f_regression(df[cols], df['MSRP'])
p = np.array(fr[1])

#the task asks for the features to REMOVE, i.e. those that are not significant
#at the 5% level; ~(p < 0.05) also catches undefined (NaN) p-values
removed_forward = cols[~(p < 0.05)]

#report the features flagged for removal by each technique
{
    'backward': removed_backward.tolist(),
    'forward': removed_forward.tolist(),
}

Index(['Year', 'Engine_HP', 'Engine_Cylinders', 'Number_of_Doors',
       'highway_MPG', 'city_mpg', 'Popularity'],
      dtype='object')

In [None]:
# Re-derive the numeric column labels, slicing off the last one.
# NOTE(review): an assignment displays nothing, so the Index shown in the saved output
# must come from a different version of this cell — confirm on a fresh run.
cols = df.select_dtypes(include = 'number').columns[:-1]

Index(['rowno', 'Year', 'Engine_HP', 'Engine_Cylinders', 'Number_of_Doors',
       'highway_MPG', 'city_mpg', 'Popularity'],
      dtype='object')