### Attribute Information:

    1. CRIM      per capita crime rate by town
    2. ZN        proportion of residential land zoned for lots over 
                 25,000 sq.ft.
    3. INDUS     proportion of non-retail business acres per town
    4. CHAS      Charles River dummy variable (= 1 if tract bounds 
                 river; 0 otherwise)
    5. NOX       nitric oxides concentration (parts per 10 million)
    6. RM        average number of rooms per dwelling
    7. AGE       proportion of owner-occupied units built prior to 1940
    8. DIS       weighted distances to five Boston employment centres
    9. RAD       index of accessibility to radial highways
    10. TAX      full-value property-tax rate per $10,000
    11. PTRATIO  pupil-teacher ratio by town
    12. B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks 
                 by town
    13. LSTAT    % lower status of the population
    14. MEDV     Median value of owner-occupied homes in $1000's


In [1]:
import pandas as pd
import numpy as np

In [2]:
house_data = pd.read_csv("data.csv")

In [3]:
house_data.head(10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9


In [4]:
house_data.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [5]:
house_data.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [6]:
from sklearn.model_selection import train_test_split
train_data,test_data = train_test_split(house_data,test_size = 0.2,random_state = 42)

In [7]:
len(train_data)

404

In [8]:
len(test_data)

102

In [9]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_index,test_index in split.split(house_data,house_data["CHAS"]):
    stratified_train_set = house_data.loc[train_index]
    stratified_test_set = house_data.loc[test_index]


In [10]:
stratified_train_set

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
254,0.04819,80.0,3.64,0,0.392,6.108,32.0,9.2203,1,315,16.4,392.89,6.57,21.9
348,0.01501,80.0,2.01,0,0.435,6.635,29.7,8.3440,4,280,17.0,390.94,5.99,24.5
476,4.87141,0.0,18.10,0,0.614,6.484,93.6,2.3053,24,666,20.2,396.21,18.68,16.7
321,0.18159,0.0,7.38,0,0.493,6.376,54.3,4.5404,5,287,19.6,396.90,6.87,23.1
326,0.30347,0.0,7.38,0,0.493,6.312,28.9,5.4159,5,287,19.6,396.90,6.15,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,3.53501,0.0,19.58,1,0.871,6.152,82.6,1.7455,5,403,14.7,88.01,15.02,15.6
423,7.05042,0.0,18.10,0,0.614,6.103,85.1,2.0218,24,666,20.2,2.52,23.29,13.4
98,0.08187,0.0,2.89,0,0.445,7.820,36.9,3.4952,2,276,18.0,393.53,3.57,43.8
455,4.75237,0.0,18.10,0,0.713,6.525,86.5,2.4358,24,666,20.2,50.92,18.13,14.1


In [11]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
imputer.fit(stratified_train_set)

SimpleImputer(strategy='median')

In [12]:
x = imputer.transform(stratified_train_set)

In [13]:
transform_training_data = pd.DataFrame(x,columns=stratified_train_set.columns)

In [14]:
transform_training_data.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0
mean,3.602814,10.836634,11.34495,0.069307,0.558064,6.279908,69.039851,3.74621,9.735149,412.341584,18.473267,353.392822,12.791609,22.509406
std,8.099383,22.150636,6.877817,0.25429,0.116875,0.712983,28.258248,2.099057,8.731259,168.672623,2.129243,96.069235,7.23574,9.385531
min,0.00632,0.0,0.74,0.0,0.389,3.561,2.9,1.1296,1.0,187.0,13.0,0.32,1.73,5.0
25%,0.086962,0.0,5.19,0.0,0.453,5.87875,44.85,2.035975,4.0,284.0,17.4,374.6175,6.8475,16.6
50%,0.286735,0.0,9.9,0.0,0.538,6.21,78.2,3.1222,5.0,337.0,19.0,390.955,11.57,21.15
75%,3.731923,12.5,18.1,0.0,0.631,6.63025,94.1,5.1004,24.0,666.0,20.2,395.63,17.1025,25.0
max,73.5341,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,36.98,50.0


In [15]:
model_x = transform_training_data.drop("MEDV",axis=1)
model_y = transform_training_data["MEDV"]

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
model_pipeline = Pipeline([
    ("imputer",SimpleImputer(strategy="median")),
    ("standard_scaller",StandardScaler())
])

In [25]:
model_data = model_pipeline.fit_transform(model_x)

In [26]:
model_data.shape

(404, 13)

In [27]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(model_data,model_y)

RandomForestRegressor()

In [35]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
score = cross_val_score(model,model_data,model_y,scoring="neg_mean_squared_error",cv = 10)
rmse_score = np.sqrt(-score)

In [29]:
rmse_score

array([2.79007654, 2.62310615, 4.28748424, 2.54549581, 3.58666911,
       2.71756429, 4.87219388, 3.31146274, 3.13832208, 3.27519144])

In [30]:
score.std()

5.290496166856724

In [31]:
rmse_score.mean()

3.314756628278618

In [38]:
x_test = stratified_test_set.drop("MEDV",axis=1)
y_test = stratified_test_set["MEDV"]
x_test = model_pipeline.transform(x_test)
model_preda = model.predict(x_test)
mse = mean_squared_error(model_preda,y_test)
root_mean_squared_error = np.sqrt(mse)

In [39]:
root_mean_squared_error

2.9596109823195778

In [40]:
model_preda

array([25.151, 11.242, 25.845, 21.558, 18.73 , 15.093, 19.772, 14.673,
       32.579, 42.102, 20.058, 11.923, 24.417, 28.278, 19.327, 10.967,
       31.866, 14.294, 23.514, 18.96 , 19.558, 18.164, 18.186, 21.859,
       18.105, 31.267, 16.033, 33.565,  8.834, 34.203, 23.683, 21.6  ,
       22.938, 10.5  , 21.329, 10.989, 43.841, 24.443, 23.698, 42.097,
       24.055, 29.831, 20.175, 20.796, 19.235, 33.575, 43.944, 20.018,
       20.414, 21.711, 21.356, 14.432, 21.252, 15.276, 25.373, 32.863,
       41.483, 29.665, 19.431, 20.64 , 46.867, 10.03 , 18.97 , 25.351,
       14.905, 33.264, 19.95 , 18.111, 19.128, 33.89 , 26.26 , 22.905,
       21.357, 22.399, 35.061, 13.062, 15.736, 20.03 , 20.878, 21.412,
       22.607, 20.82 , 14.52 , 22.573, 20.575, 21.547, 13.977, 20.931,
       21.523, 23.28 , 18.386, 27.162,  7.224, 26.351, 18.859, 29.696,
       19.949, 30.92 , 14.73 , 26.414, 21.239, 19.96 ])

In [41]:
y_test

342    16.5
379    10.2
223    30.1
219    23.0
48     14.4
       ... 
88     23.6
466    19.0
52     25.0
121    20.3
218    21.5
Name: MEDV, Length: 102, dtype: float64

In [51]:
print(model.score(x_test,y_test)*100,"%")

87.61791592293598 %


In [50]:
model_preda.shape

(102,)

In [46]:
y_test.shape

(102,)