### Importing Dependencies

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
%matplotlib inline


### Data Collection and Processing

In [28]:
house_price_dataset=pd.read_csv("house_price_prediction.csv")
print(house_price_dataset)

    Area (sqft)  Bedrooms  Bathrooms  Stories  Parking  Price
0           850         2          1        1        1     75
1           900         2          2        1        1     80
2          1200         3          2        2        2    120
3          1500         3          3        2        2    150
4           800         2          1        1        1     70
5          1100         3          2        2        2    110
6          1300         3          2        2        2    135
7           950         2          2        1        1     85
8          1400         4          3        2        2    145
9          1000         3          2        2        1     95
10         1250         3          2        2        2    125
11         1600         4          3        2        2    160
12         1700         4          3        3        2    175
13         1450         3          2        2        2    140
14         1150         3          2        2        2    115
15      

In [4]:
house_price_dataset.head()

Unnamed: 0,Area (sqft),Bedrooms,Bathrooms,Stories,Parking,Price
0,850,2,1,1,1,75
1,900,2,2,1,1,80
2,1200,3,2,2,2,120
3,1500,3,3,2,2,150
4,800,2,1,1,1,70


In [5]:
house_price_dataset.tail()

Unnamed: 0,Area (sqft),Bedrooms,Bathrooms,Stories,Parking,Price
15,980,2,1,1,1,90
16,1230,3,2,2,2,122
17,1550,4,3,3,2,155
18,850,2,1,1,1,78
19,1350,3,2,2,2,130


In [6]:
house_price_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Area (sqft)  20 non-null     int64
 1   Bedrooms     20 non-null     int64
 2   Bathrooms    20 non-null     int64
 3   Stories      20 non-null     int64
 4   Parking      20 non-null     int64
 5   Price        20 non-null     int64
dtypes: int64(6)
memory usage: 1.1 KB


In [7]:
house_price_dataset.isnull().sum()

Area (sqft)    0
Bedrooms       0
Bathrooms      0
Stories        0
Parking        0
Price          0
dtype: int64

### Splitting the data and Target

In [10]:
x=house_price_dataset.drop(["Price"],axis=1)
y=house_price_dataset["Price"]

In [11]:
print(x)

    Area (sqft)  Bedrooms  Bathrooms  Stories  Parking
0           850         2          1        1        1
1           900         2          2        1        1
2          1200         3          2        2        2
3          1500         3          3        2        2
4           800         2          1        1        1
5          1100         3          2        2        2
6          1300         3          2        2        2
7           950         2          2        1        1
8          1400         4          3        2        2
9          1000         3          2        2        1
10         1250         3          2        2        2
11         1600         4          3        2        2
12         1700         4          3        3        2
13         1450         3          2        2        2
14         1150         3          2        2        2
15          980         2          1        1        1
16         1230         3          2        2        2
17        

In [12]:
print(y)

0      75
1      80
2     120
3     150
4      70
5     110
6     135
7      85
8     145
9      95
10    125
11    160
12    175
13    140
14    115
15     90
16    122
17    155
18     78
19    130
Name: Price, dtype: int64


### Splitting Training and Test data

In [15]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.33,random_state=42)

### Model Training

In [20]:
regressor=DecisionTreeRegressor()

In [22]:
regressor.fit(x_train,y_train)

### Model Evaluation

In [23]:
y_pred=regressor.predict(x_test)

In [24]:
print(y_pred)

[ 78. 175.  85.  78. 130. 115. 150.]


### Performance Metrics

In [29]:
score=r2_score(y_pred,y_test)
print(score)

0.9092464626521882


### HyperParameter Tunning

In [30]:
parameter={
    "criterion":["squared_error","friedman_mse","absolute_error","poisson"],
    "splitter":["best","random"],
    "max_depth":[1,2,3,4,5,6,7,8,10,11,12],
    "max_features":["auto","sqrt","log2"]
}
regressor=DecisionTreeRegressor()

In [35]:
regressorcv=GridSearchCV(regressor,param_grid=parameter,cv=5,scoring="neg_mean_squared_error")

In [36]:
regressorcv.fit(x_train,y_train)

440 fits failed out of a total of 1320.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
440 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParam

In [37]:
regressorcv.best_params_

{'criterion': 'friedman_mse',
 'max_depth': 10,
 'max_features': 'log2',
 'splitter': 'random'}

In [38]:
y_pred=regressorcv.predict(x_test)

In [39]:
r2_score(y_pred,y_test)

0.9240988978254394