In [112]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import GridSearchCV

In [113]:
data = fetch_california_housing()
df = pd.DataFrame(data.data)

In [114]:
df.columns = data.feature_names

In [115]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [116]:
x = df.iloc[:,0:8]
y = df.iloc[:,-1]

In [117]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [118]:
rt = DecisionTreeRegressor(criterion='squared_error',max_depth=5)

In [119]:
rt.fit(x_train,y_train)

In [120]:
y_pred = rt.predict(x_test)

In [121]:
r2_score(y_test,y_pred)

0.9986908480396145

## HYPERPARAMETER TUNING

In [122]:
param_grid = {
    'max_depth': [2, 4, 8, 10, None],
    'criterion': ['squared_error', 'friedman_mse'], 
    'max_features': ['sqrt', 'log2', None],  
    'min_samples_split': [2, 5, 10],
}


In [123]:
reg = GridSearchCV(DecisionTreeRegressor(),param_grid=param_grid)

In [124]:
reg.fit(x_train,y_train)

In [125]:
reg.best_score_

0.9999964033496122

In [126]:
reg.best_params_

{'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'min_samples_split': 5}

## FEATURE IMPORTANCE

- feature importamce is used to find the most important features
- (feature_importances_) attribute helps in calculating importance value for the columns in the dataset
-  by watching how many times a column is used in the decision tree
- In case we perform dimensionality reduction we can reduce the number of features

In [127]:
for importance , name in sorted(zip(rt.feature_importances_,x_train.columns),reverse=True):
    print(name,importance)

Longitude 1.0
Population 0.0
MedInc 0.0
Latitude 0.0
HouseAge 0.0
AveRooms 0.0
AveOccup 0.0
AveBedrms 0.0
