<a href="https://colab.research.google.com/github/shila121/projects/blob/main/a_dimensional_reduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# loading the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# render matplotlib figures inline in the notebook output
%matplotlib inline
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries used below
warnings.filterwarnings('ignore')

In [2]:
# Read the CSV used for the missing-value-ratio section, then preview the
# first rows followed by the (rows, columns) shape.
data = pd.read_csv('missing_value_ratio-200306-192816.csv')
print(data.head(), data.shape, sep='\n')

      ID  season  holiday  workingday  ...   atemp  humidity  windspeed  count
0  AB101     1.0      0.0         0.0  ...  14.395      81.0        NaN     16
1  AB102     1.0      NaN         0.0  ...  13.635      80.0        NaN     40
2  AB103     1.0      0.0         NaN  ...  13.635      80.0        NaN     32
3  AB104     NaN      0.0         NaN  ...  14.395      75.0        NaN     13
4  AB105     1.0      NaN         0.0  ...  14.395       NaN    16.9979      1

[5 rows x 10 columns]
(12980, 10)


# 1. Missing Value Ratio for Dimensionality Reduction

In [3]:
# Number of missing entries in each column
print(data.isna().sum())

ID               0
season           9
holiday       6295
workingday       9
weather          4
temp             0
atemp            0
humidity         5
windspeed     5324
count            0
dtype: int64


In [4]:
# Share of missing entries in each column, expressed in percent
print(100 * data.isna().mean())

ID             0.000000
season         0.069337
holiday       48.497689
workingday     0.069337
weather        0.030817
temp           0.000000
atemp          0.000000
humidity       0.038521
windspeed     41.016949
count          0.000000
dtype: float64


In [5]:
# Columns whose percentage of missing values is strictly above this threshold
# are dropped; everything else is kept.
MISSING_PCT_THRESHOLD = 40

# Compute the per-column missing percentage once instead of once per column
# inside the comprehension.
missing_pct = data.isnull().mean() * 100

var_to_be_dropped = [var for var in data.columns if missing_pct[var] > MISSING_PCT_THRESHOLD]
var_selected = [var for var in data.columns if var not in var_to_be_dropped]
print(var_to_be_dropped)
print(var_selected)

['holiday', 'windspeed']
['ID', 'season', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'count']


In [6]:
# Keeping only the selected columns reduces the number of columns;
# the number of rows is unchanged.
print("Original data shape:", data.shape)
print("New data shape:", data.loc[:, var_selected].shape)

Original data shape: (12980, 10)
New data shape: (12980, 8)


# 2. Variance calculation for Dimensionality Reduction

In [7]:
# Read the CSV used for the low-variance-filter section, then preview the
# first rows followed by the (rows, columns) shape.
data = pd.read_csv('low_variance_filter-200306-194411.csv')
print(data.head(), data.shape, sep='\n')

      ID  temp   atemp  humidity  windspeed  count
0  AB101  9.84  14.395        81        0.0     16
1  AB102  9.02  13.635        80        0.0     40
2  AB103  9.02  13.635        80        0.0     32
3  AB104  9.84  14.395        75        0.0     13
4  AB105  9.84  14.395        75        0.0      1
(12980, 6)


In [8]:
# Count missing entries per column (none are present in this file)
print(data.isna().sum())

ID           0
temp         0
atemp        0
humidity     0
windspeed    0
count        0
dtype: int64


In [9]:
# Remove the identifier column before computing variances.
# errors='ignore' keeps this cell re-runnable: `data` is reassigned here, so on a
# second run 'ID' is already gone and a plain drop would raise KeyError.
data = data.drop(columns='ID', errors='ignore')
print(data.shape)

(12980, 5)


In [10]:
# Before calculating the variance, normalize the data.
# The function is imported under an alias so that assigning the result to
# `normalize` no longer overwrites the only reference to the function itself.
from sklearn.preprocessing import normalize as sk_normalize

# NOTE(review): sklearn's normalize defaults to norm='l2', axis=1, i.e. it rescales
# each row (sample) to unit norm rather than each column - confirm this is the
# scaling intended for a per-feature variance comparison.
normalize = sk_normalize(data)  # name kept because the next cell reads `normalize`

In [11]:
# Wrap the normalized array in a DataFrame (columns get the default integer
# labels) and display the variance of each column.
data_scaled = pd.DataFrame(data=normalize)
data_scaled.var(axis=0)

0    0.005877
1    0.007977
2    0.093491
3    0.008756
4    0.111977
dtype: float64

In [12]:
# Low-variance filter: keep the columns whose variance, measured on the
# normalized data, is at least VARIANCE_THRESHOLD.
# The threshold is named once so the printed values and the selected column
# names cannot drift apart.
VARIANCE_THRESHOLD = 0.006

variance = data_scaled.var()
columns = data.columns

# Variances that pass the filter (values only, in column order)
variance_select = [var for var in variance if var >= VARIANCE_THRESHOLD]
print('selected variance:', variance_select)

# Names of the columns that pass the filter. Names and variances are paired by
# position, so this does not depend on `variance` carrying integer labels.
variable = [col for col, var in zip(columns, variance) if var >= VARIANCE_THRESHOLD]

selected variance: [0.007977044945059584, 0.09349125331649057, 0.008755751088686264, 0.11197722261752109]


In [13]:
# Keep only the columns that passed the variance filter and preview them
new_data = data.loc[:, variable]
print(new_data.head())

    atemp  humidity  windspeed  count
0  14.395        81        0.0     16
1  13.635        80        0.0     40
2  13.635        80        0.0     32
3  14.395        75        0.0     13
4  14.395        75        0.0      1


In [14]:
# The filtered frame has fewer columns than the original; the row count is unchanged.
for label, shape in (("Original data shape:", data.shape),
                     ("New data shape:", new_data.shape)):
    print(label, shape)

Original data shape: (12980, 5)
New data shape: (12980, 4)


In [15]:
# Variance of each retained column, computed on the original (un-normalized) values
new_data.var(axis=0)

atemp           73.137484
humidity       398.549141
windspeed       69.322053
count        25843.419864
dtype: float64

# 3. Backward Elimination for Dimensionality Reduction

In [16]:
# load the data

In [30]:
# Read the CSV used for the backward-elimination section, then preview the
# first rows followed by the (rows, columns) shape.
data = pd.read_csv('backward_feature_elimination-200308-140938.csv')
print(data.head(), data.shape, sep='\n')

      ID  season  holiday  workingday  ...  temp  humidity  windspeed  count
0  AB101       1        0           0  ...  9.84        81        0.0     16
1  AB102       1        0           0  ...  9.02        80        0.0     40
2  AB103       1        0           0  ...  9.02        80        0.0     32
3  AB104       1        0           0  ...  9.84        75        0.0     13
4  AB105       1        0           0  ...  9.84        75        0.0      1

[5 rows x 9 columns]
(12980, 9)


In [31]:
# Check whether any column contains missing values
# (the counts shown below are all zero).
data.isna().sum()

ID            0
season        0
holiday       0
workingday    0
weather       0
temp          0
humidity      0
windspeed     0
count         0
dtype: int64

In [32]:
# Build the training inputs: 'count' is the target, and every column other than
# the identifier and the target is a predictor.
y = data['count']
X = data.drop(columns=['ID', 'count'])

In [33]:
# Shapes of the predictor frame and the target series, on one line
print(*(part.shape for part in (X, y)))

(12980, 7) (12980,)


In [34]:
!pip install mlxtend




In [35]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.linear_model import LinearRegression

# Configure backward elimination (forward=False) around a linear regression:
# stop at 4 features and score candidate subsets by negative MSE.
lreg = LinearRegression()
sfs1 = sfs(
    lreg,
    k_features=4,
    forward=False,
    verbose=1,
    scoring='neg_mean_squared_error',
)

In [37]:
# NOTE(review): the fit call below is commented out, so sfs1 is only configured
# and no feature subset is actually selected in this section.
# sfs1 = sfs1.fit(X,y)

# 4. Forward Selection for Dimensionality Reduction

In [38]:
# Read the CSV used for the forward-selection section, then preview the
# first rows followed by the (rows, columns) shape.
data = pd.read_csv('forward_feature_selection-200306-234007.csv')
print(data.head(), data.shape, sep='\n')

      ID  season  holiday  workingday  ...  temp  humidity  windspeed  count
0  AB101       1        0           0  ...  9.84        81        0.0     16
1  AB102       1        0           0  ...  9.02        80        0.0     40
2  AB103       1        0           0  ...  9.02        80        0.0     32
3  AB104       1        0           0  ...  9.84        75        0.0     13
4  AB105       1        0           0  ...  9.84        75        0.0      1

[5 rows x 9 columns]
(12980, 9)


In [39]:
# Check whether any column contains missing values
# (the counts shown below are all zero).
data.isna().sum()

ID            0
season        0
holiday       0
workingday    0
weather       0
temp          0
humidity      0
windspeed     0
count         0
dtype: int64

In [40]:
# Build the training inputs: 'count' is the target, and every column other than
# the identifier and the target is a predictor.
y = data['count']
X = data.drop(columns=['ID', 'count'])

In [41]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.linear_model import LinearRegression

# Configure forward selection around a linear regression: start from no
# features and add one at a time until 4 are selected, scoring candidate
# subsets by negative MSE.
# forward must be True here; forward=False configures backward elimination,
# which is what the previous section sets up.
lreg = LinearRegression()
sfs1 = sfs(lreg, k_features=4, forward=True, verbose=2, scoring='neg_mean_squared_error')

In [43]:
# NOTE(review): the fit call below is commented out, so sfs1 is only configured
# and no feature subset is actually selected in this section.
# sfs1 = sfs1.fit(X,y)