In [18]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing

# Load the California housing regression dataset through scikit-learn's
# dataset loader. The returned object is indexed by key in the next cell
# ('data', 'feature_names', 'target').
# NOTE(review): the loader presumably downloads and caches the data on first
# use -- confirm network access is available on a fresh environment.
data = fetch_california_housing()

In [5]:
# Unpack the fetched dataset: feature matrix, feature names, and target values
X, colnames, y = data['data'], data['feature_names'], data['target']

In [7]:
# Show the feature names that will label the DataFrame columns
colnames

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [11]:
# Wrap the feature matrix in a DataFrame whose columns carry the feature names,
# then preview the first rows
df = pd.DataFrame(data=X, columns=colnames)
df.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [20]:
# Let's engineer an extra feature: the square root of the median income.
# It is derived directly from 'MedInc', so it gives the correlation analysis
# below a strongly correlated pair to look at.
df['MedInc_sqrt'] = np.sqrt(df['MedInc'])

# Pairwise Pearson correlation between every pair of columns
df_corr = df.corr(method='pearson')
df_corr

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc_sqrt
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.984329
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,-0.132797
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.326688
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.06691
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,0.018415
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,0.015266
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.084303
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.015569
MedInc_sqrt,0.984329,-0.132797,0.326688,-0.06691,0.018415,0.015266,-0.084303,-0.015569,1.0


In [None]:
# Insight: if two feature variables are very highly correlated, they carry nearly the same information, so one of them can be removed.

In [27]:
# Keep only the strict upper triangle of the correlation matrix (k=1 excludes
# the diagonal) so each feature pair appears exactly once; all other cells
# become NaN.
# The builtin `bool` is used instead of the `np.bool` alias, which NumPy
# deprecated in 1.20 and later removed (it raises AttributeError there).
df_corr.where(np.triu(np.ones(df_corr.shape), k=1).astype(bool))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc_sqrt
MedInc,,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.984329
HouseAge,,,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,-0.132797
AveRooms,,,,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.326688
AveBedrms,,,,,-0.066197,-0.006181,0.069721,0.013344,-0.06691
Population,,,,,,0.069863,-0.108785,0.099773,0.018415
AveOccup,,,,,,,0.002366,0.002476,0.015266
Latitude,,,,,,,,-0.924664,-0.084303
Longitude,,,,,,,,,-0.015569
MedInc_sqrt,,,,,,,,,


In [30]:
# Mask everything except the strict upper triangle (k=1 skips the diagonal),
# leaving one entry per feature pair and NaN elsewhere
corr_upper = df_corr.where(
    np.triu(np.ones(df_corr.shape, dtype=bool), k=1)
)
corr_upper

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc_sqrt
MedInc,,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.984329
HouseAge,,,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,-0.132797
AveRooms,,,,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.326688
AveBedrms,,,,,-0.066197,-0.006181,0.069721,0.013344,-0.06691
Population,,,,,,0.069863,-0.108785,0.099773,0.018415
AveOccup,,,,,,,0.002366,0.002476,0.015266
Latitude,,,,,,,,-0.924664,-0.084303
Longitude,,,,,,,,,-0.015569
MedInc_sqrt,,,,,,,,,


In [35]:
# Collect columns that are redundant with an earlier column: any column whose
# absolute correlation with another feature exceeds the threshold.
# The absolute value matters because a strongly negative correlation
# (e.g. -0.98) is just as redundant as a strongly positive one.
# NaN cells (lower triangle and diagonal) compare as False and are ignored.
CORR_THRESHOLD = 0.95
drop_cols = [
    column
    for column in corr_upper.columns
    if (corr_upper[column].abs() > CORR_THRESHOLD).any()
]
drop_cols

['MedInc_sqrt']