In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [13]:
from sklearn.datasets import fetch_california_housing
dataset = fetch_california_housing()

In [4]:
print(dataset.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [25]:
df=pd.DataFrame(dataset.data, columns=dataset.feature_names)

In [26]:
df['Target']=dataset.target

In [27]:
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [28]:
df = df.sample(frac=0.25)
x=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [29]:
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
11138,4.0061,30.0,5.324153,1.008475,1611.0,3.413136,33.85,-117.97,1.829
409,10.0825,52.0,8.209016,1.024590,658.0,2.696721,37.90,-122.28,3.970
10648,2.2969,29.0,4.191740,1.362832,520.0,1.533923,33.54,-117.78,4.500
19914,3.0972,34.0,6.452915,1.134529,607.0,2.721973,36.31,-119.29,0.828
17580,2.8981,26.0,3.099432,1.000000,1638.0,2.326705,37.31,-121.93,2.298
...,...,...,...,...,...,...,...,...,...
1474,2.9861,26.0,4.310646,1.092496,1174.0,2.048866,37.97,-122.04,1.609
11049,3.4234,46.0,5.494048,1.139881,867.0,2.580357,33.79,-117.85,2.000
3862,4.0391,33.0,5.196141,1.035370,580.0,1.864952,34.16,-118.44,3.375
2283,2.1544,26.0,4.402410,1.008434,1976.0,2.380723,36.79,-119.76,0.728


In [30]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.33, random_state=42)

In [31]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()

In [32]:
regressor.fit(x_train,y_train)

DecisionTreeRegressor()

In [33]:
y_pred = regressor.predict(x_test)

In [34]:
df.shape

(5160, 9)

In [35]:
from sklearn.metrics import r2_score
print(r2_score(y_pred,y_test))

0.5072710687445583


# Hyperparameter Tuning

In [36]:
parameter={
 'criterion':['squared_error','friedman_mse','absolute_error','poisson'],
  'splitter':['best','random'],
  'max_depth':[1,2,3,4,5,6,7,8,10,11,12],
  'max_features':['auto', 'sqrt', 'log2']   
}

In [57]:
regressor = DecisionTreeRegressor()

In [58]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(regressor, param_grid=parameter, cv=2, scoring='neg_mean_squared_error')

In [59]:
clf.fit(x_train,y_train)

GridSearchCV(cv=2, estimator=DecisionTreeRegressor(),
             param_grid={'criterion': ['squared_error', 'friedman_mse',
                                       'absolute_error', 'poisson'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'splitter': ['best', 'random']},
             scoring='neg_mean_squared_error')

In [60]:
clf.best_params_

{'criterion': 'squared_error',
 'max_depth': 11,
 'max_features': 'auto',
 'splitter': 'random'}

In [85]:
regressor=DecisionTreeRegressor(criterion= 'squared_error', max_depth=10, max_features= 'auto', splitter= 'best')

In [86]:
regressor.fit(x_train,y_train)

DecisionTreeRegressor(max_depth=10, max_features='auto')

In [87]:
y_pred = regressor.predict(x_test)
print(r2_score(y_pred,y_test))

0.5277894880914966
