In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Load the housing dataset from "data.csv" (path is relative to the working directory).

housing_df  = pd.read_csv("data.csv")

In [3]:
# Split the data into train (85%) and test (15%) sets, stratified on the CHAS
# column so both sets keep the same CHAS proportions.

split = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=42)

# split() yields *positional* row indices, so select rows with .iloc rather than
# the label-based .loc; the two only coincide while housing_df has the default
# RangeIndex. n_splits=1 means there is exactly one (train, test) pair to take.
train_index, test_index = next(split.split(housing_df, housing_df['CHAS']))
strat_train_set = housing_df.iloc[train_index]
strat_test_set = housing_df.iloc[test_index]

In [4]:
# Checking correlations: how each column of the training set relates to the
# target MEDV, listed from the largest coefficient to the smallest.

coor_matrix = strat_train_set.corr()
medv_correlations = coor_matrix['MEDV']
medv_correlations.sort_values(ascending=False)

MEDV       1.000000
RM         0.692316
B          0.356281
ZN         0.340606
DIS        0.246130
CHAS       0.191143
AGE       -0.360431
RAD       -0.376433
CRIM      -0.393317
NOX       -0.427752
TAX       -0.464745
INOUS     -0.478351
PTRATIO   -0.497006
LSTAT     -0.741615
Name: MEDV, dtype: float64

In [5]:
# Separate the target column (MEDV) from the predictor columns of the training set.
housing_labels = strat_train_set['MEDV'].copy()
housing = strat_train_set.drop(columns="MEDV")

SCIKIT-LEARN object types:
1. Estimators: estimate parameters from a dataset.
fit() - calculates the parameters and stores them on the estimator

2. Transformers: transform data using the parameters learned in fit()
fit_transform() - fits, then transforms, in a single call

3. Predictors: have fit(), predict() and score()

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Preprocessing steps, applied in order: fill missing values with the column
# median, then standardize each feature.
preprocessing_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
]
my_pipeline = Pipeline(steps=preprocessing_steps)

In [7]:
# Fit the preprocessing pipeline on the training features and transform them
# in one call (median imputation of missing values, then standard scaling).

housing_num = my_pipeline.fit_transform(housing)

In [8]:
#selecting the model

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Seed the forest so training, cross-validation and the saved model are
# reproducible across kernel restarts (same seed as the train/test split).
model = RandomForestRegressor(random_state=42)
model.fit(housing_num, housing_labels)

RandomForestRegressor()

In [9]:
# RMSE on the same data the model was fitted on, so this is a training error
# and an optimistic estimate of how the model performs on unseen data.
housing_predictions = model.predict(housing_num)
mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
rmse = np.sqrt(mse)
print(rmse)

1.2481814753267395


In [10]:
# Cross-validation: 10-fold estimate of the model's RMSE on the training set.

from sklearn.model_selection import cross_val_score

# The scorer returns *negative* MSE (higher is better), hence the sign flip
# before taking the square root.
scores = cross_val_score(
    estimator=model,
    X=housing_num,
    y=housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
rmse_scores = np.sqrt(-scores)

print(rmse_scores)

[2.90747998 2.8248946  2.97493697 2.70042158 2.99863486 2.74915219
 5.4183665  2.81154813 3.55632615 3.3851061 ]


In [11]:
def print_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    Parameters
    ----------
    scores : object exposing ``mean()`` and ``std()`` (e.g. a NumPy array)
        The per-fold scores to summarize.

    Returns
    -------
    None
        The three summary lines are written to standard output.
    """
    # Each entry pairs an output label with the statistic to show for it;
    # statistics are evaluated lazily so lines are emitted one at a time, in order.
    report = (
        ("Scores: ", lambda values: values),
        ("Mean: ", lambda values: values.mean()),
        ("Standard dev: ", lambda values: values.std()),
    )
    for label, statistic in report:
        print(label, statistic(scores))

In [12]:
# Summarize the cross-validation RMSEs: per-fold values, their mean and standard deviation.
print_scores(rmse_scores) 

Scores:  [2.90747998 2.8248946  2.97493697 2.70042158 2.99863486 2.74915219
 5.4183665  2.81154813 3.55632615 3.3851061 ]
Mean:  3.2326867077639405
Standard dev:  0.7741388017210186


In [None]:
# Recorded print_scores() output for each candidate model, kept as bare string
# literals for side-by-side comparison (lower RMSE is better).
# NOTE(review): the first two entries were presumably produced by swapping the
# estimator in the model-selection cell and re-running cross-validation; confirm.

# Linear Regression
'''
Scores:  [4.92076578 3.66209052 4.87432846 3.43131855 5.51149488 4.9301172
 6.63832359 4.90341979 4.81343358 5.43232885]
Mean:  4.911762121417897
Standard dev:  0.8588732867468863
'''

# Decision Tree Regressor
'''
Scores:  [4.90170829 3.92303869 3.97743636 3.58909848 3.4737889  3.74588146
 6.0882655  4.00572265 6.64531659 4.04661788]
Mean:  4.439687479573985
Standard dev:  1.0371755990423603
'''

# Random Forest Regressor (lowest mean RMSE of the three recorded runs)
'''
Scores:  [2.90747998 2.8248946  2.97493697 2.70042158 2.99863486 2.74915219
 5.4183665  2.81154813 3.55632615 3.3851061 ]
Mean:  3.2326867077639405
Standard dev:  0.7741388017210186
'''

In [17]:
# Saving the model: persist the fitted regressor to 'housing.joblib'.
# NOTE(review): only `model` is written here; the fitted `my_pipeline` is not,
# so whoever loads this file must supply features that were already
# preprocessed with the same fitted pipeline.

from joblib import dump, load
dump(model, 'housing.joblib')

['housing.joblib']

In [18]:
# Testing the model: evaluate on the held-out test set.

Y_test = strat_test_set['MEDV'].copy()
X_test = strat_test_set.drop(columns='MEDV')

# transform() only (no re-fitting): the test features are imputed and scaled
# with the statistics the pipeline learned from the training data.
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)

final_mse = mean_squared_error(y_true=Y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)

print("Final rmse: ", final_rmse)

Final rmse:  3.0830751288283604


In [20]:
# Using the model: reload the saved regressor and predict for one sample.

from joblib import dump, load
import numpy as np

model = load('housing.joblib')

# predict() expects a 2-D (samples x features) array, so the second prepared
# test row is copied into a one-row matrix.
features = np.array(X_test_prepared[1], ndmin=2)
model.predict(features)

array([23.334])