## Import libraries

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Load data in pandas frame

In [57]:
df = pd.read_csv('/content/drive/My Drive/machine learning/regression/data/housing.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Prepaire the Data for Train the Model

In [58]:
df.ocean_proximity.unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [59]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [60]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

here we can see that data is missing

so drop or fill with median value

In [61]:
mean = df.total_bedrooms.mean()

In [62]:
mean

537.8705525375618

In [63]:
df['total_bedrooms'] = df['total_bedrooms'].fillna(mean)

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [65]:
df1= df.loc[:,:].values

In [66]:
df1

array([[-122.23, 37.88, 41.0, ..., 8.3252, 452600.0, 'NEAR BAY'],
       [-122.22, 37.86, 21.0, ..., 8.3014, 358500.0, 'NEAR BAY'],
       [-122.24, 37.85, 52.0, ..., 7.2574, 352100.0, 'NEAR BAY'],
       ...,
       [-121.22, 39.43, 17.0, ..., 1.7, 92300.0, 'INLAND'],
       [-121.32, 39.43, 18.0, ..., 1.8672, 84700.0, 'INLAND'],
       [-121.24, 39.37, 16.0, ..., 2.3886, 89400.0, 'INLAND']],
      dtype=object)

In [67]:
df1.shape

(20640, 10)

In [68]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import  ColumnTransformer

In [69]:
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[9])],remainder='passthrough')

df1 = ct.fit_transform(df1)

In [70]:
df1

array([[0.0, 0.0, 0.0, ..., 126.0, 8.3252, 452600.0],
       [0.0, 0.0, 0.0, ..., 1138.0, 8.3014, 358500.0],
       [0.0, 0.0, 0.0, ..., 177.0, 7.2574, 352100.0],
       ...,
       [0.0, 1.0, 0.0, ..., 433.0, 1.7, 92300.0],
       [0.0, 1.0, 0.0, ..., 349.0, 1.8672, 84700.0],
       [0.0, 1.0, 0.0, ..., 530.0, 2.3886, 89400.0]], dtype=object)

## feature or target

In [71]:
df2 = pd.DataFrame(df1)

In [72]:
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0,0,0,1,0,-122.23,37.88,41,880,129,322,126,8.3252,452600
1,0,0,0,1,0,-122.22,37.86,21,7099,1106,2401,1138,8.3014,358500
2,0,0,0,1,0,-122.24,37.85,52,1467,190,496,177,7.2574,352100
3,0,0,0,1,0,-122.25,37.85,52,1274,235,558,219,5.6431,341300
4,0,0,0,1,0,-122.25,37.85,52,1627,280,565,259,3.8462,342200


In [73]:
df2 = df2.drop(columns=[5,6])

df2.head()

Unnamed: 0,0,1,2,3,4,7,8,9,10,11,12,13
0,0,0,0,1,0,41,880,129,322,126,8.3252,452600
1,0,0,0,1,0,21,7099,1106,2401,1138,8.3014,358500
2,0,0,0,1,0,52,1467,190,496,177,7.2574,352100
3,0,0,0,1,0,52,1274,235,558,219,5.6431,341300
4,0,0,0,1,0,52,1627,280,565,259,3.8462,342200


In [74]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       20640 non-null  object
 1   1       20640 non-null  object
 2   2       20640 non-null  object
 3   3       20640 non-null  object
 4   4       20640 non-null  object
 5   7       20640 non-null  object
 6   8       20640 non-null  object
 7   9       20640 non-null  object
 8   10      20640 non-null  object
 9   11      20640 non-null  object
 10  12      20640 non-null  object
 11  13      20640 non-null  object
dtypes: object(12)
memory usage: 1.9+ MB


In [75]:
X = df2.iloc[:,:-1].values
y = df2.iloc[:,-1].values

## Train or test split the Data

In [76]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)

In [77]:
X_train.shape

(16512, 11)

In [78]:
X_test.shape

(4128, 11)

##Scale the data

In [79]:

from sklearn.preprocessing import StandardScaler
std =StandardScaler()


In [80]:
X_train = std.fit_transform(X_train)
X_test = std.transform(X_test)

## Lets train train the model

In [81]:
from sklearn.tree import  DecisionTreeRegressor

In [82]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train,y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [83]:
y_pred = tree_reg.predict(X_test)

In [84]:
from sklearn.metrics import  mean_absolute_error,mean_squared_error,r2_score

In [85]:
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.4131689417311686
7689886874.660852
59828.82751937985


### improve accuracy of model

In [86]:
tree_reg = DecisionTreeRegressor(max_depth=50000,random_state=42,)
tree_reg.fit(X_train,y_train)
y_pred = tree_reg.predict(X_test)

In [87]:
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.4211515834717423
7585281620.59157
59859.27858527132


we have no idea about the what ar the best hyperparameter for decision tree model
so we are going to use RandamizedSearchcv

##RandomizedSearchCV

In [88]:
from sklearn.model_selection import RandomizedSearchCV

In [89]:

tree = DecisionTreeRegressor()

In [90]:
max_depth = [i for i in range(10000,100000,500)]



parameter = {
    'max_depth':max_depth,
    
}

In [91]:
randomsearch = RandomizedSearchCV(estimator=tree,param_distributions=parameter,cv=5,random_state=42,verbose=4)

In [92]:
randomsearch.fit(X,y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] max_depth=19500 .................................................
[CV] ..................... max_depth=19500, score=0.216, total=   0.2s
[CV] max_depth=19500 .................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] ..................... max_depth=19500, score=0.250, total=   0.2s
[CV] max_depth=19500 .................................................
[CV] ..................... max_depth=19500, score=0.492, total=   0.2s
[CV] max_depth=19500 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.6s remaining:    0.0s


[CV] ..................... max_depth=19500, score=0.229, total=   0.2s
[CV] max_depth=19500 .................................................
[CV] ..................... max_depth=19500, score=0.433, total=   0.2s
[CV] max_depth=31000 .................................................
[CV] ..................... max_depth=31000, score=0.236, total=   0.2s
[CV] max_depth=31000 .................................................
[CV] ..................... max_depth=31000, score=0.244, total=   0.2s
[CV] max_depth=31000 .................................................
[CV] ..................... max_depth=31000, score=0.501, total=   0.2s
[CV] max_depth=31000 .................................................
[CV] ..................... max_depth=31000, score=0.240, total=   0.2s
[CV] max_depth=31000 .................................................
[CV] ..................... max_depth=31000, score=0.450, total=   0.2s
[CV] max_depth=86500 .................................................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    8.7s finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=DecisionTreeRegressor(ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features=None,
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   presort='deprecated',
                                                   random_state=None,
                                                   splitter='best'),
                   iid='d

In [93]:
randomsearch.best_params_

{'max_depth': 17500}

In [94]:

randomsearch.best_score_

0.3375153837861472

In [95]:
from sklearn.model_selection import GridSearchCV

In [96]:
grid_cv = GridSearchCV(tree,param_grid=parameter,cv=10,return_train_score=True)

In [97]:
grid_cv.fit(X,y)

GridSearchCV(cv=10, error_score=nan,
             estimator=DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse',
                                             max_depth=None, max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort='deprecated',
                                             random_state=None,
                                             splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [10000, 10500, 11000, 11500, 12000, 12500,
                                       13000, 13500, 14000, 14500, 15000,

In [99]:
grid_cv.best_params_

{'max_depth': 42000}

In [100]:
grid_cv.best_score_

0.2556560097741056