In [None]:
import pandas as pd

# Read the Boston housing CSV from the notebook's input directory and preview the first rows.
housing_price = pd.read_csv(filepath_or_buffer="../input/boston-housing-data/data.csv")
housing_price.head(n=5)

In [None]:
# (rows, columns) of the loaded dataset.
housing_price.shape

Checking for null values (if any)

In [None]:
# Number of missing entries in each column.
housing_price.isna().sum()

In [None]:
# Column dtypes and non-null counts for the loaded dataset.
housing_price.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for each numeric column.
housing_price.describe()

# Plotting Histograms of the features

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
housing_price.hist(bins = 50 , figsize = (20,15))

# Train Test Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Plain random split: 25% of the rows go to the test portion, seeded for repeatability.
X_train, X_test = train_test_split(housing_price, random_state=42, test_size=0.25)
(len(X_train), len(X_test))


# Stratified Shuffle Splitting

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Hold out 20% of the rows, stratifying on CHAS so the train and test sets
# keep (approximately) the same proportion of each CHAS value.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so there is exactly one (train, test) pair to take.
train_index, test_index = next(split.split(housing_price, housing_price['CHAS']))

# split() yields positional row indices, so rows must be selected with .iloc;
# the label-based .loc only happens to work while the frame has a default RangeIndex.
strat_train_set = housing_price.iloc[train_index]
strat_test_set = housing_price.iloc[test_index]
    

In [None]:
# CHAS class counts in the stratified training set (compare with the test set below).
strat_train_set['CHAS'].value_counts()

In [None]:
# CHAS class counts in the stratified test set; the ratio should mirror the training set.
strat_test_set['CHAS'].value_counts()

# Plotting Scatter Matrix

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots for MEDV and three selected columns.
attributes = ["MEDV", "RM", "ZN", "LSTAT"]
scatter_matrix(frame=housing_price[attributes], figsize=(10, 10))


# Heatmap Plotting

In [None]:
import seaborn as sns

In [None]:
# Annotated heatmap of the absolute pairwise correlations between columns.
plt.figure(figsize=(15, 10))
sns.heatmap(data=housing_price.corr().abs(), annot=True)

## Dropping "MEDV"

In [None]:
# Separate the stratified training set into features and the MEDV target.
# NOTE: this rebinds `housing_price` from the full dataset to the training features only;
# cells above that use `housing_price` must not be re-run after this one.
housing_price = strat_train_set.drop(columns="MEDV")
housing_price_labels = strat_train_set["MEDV"].copy()

# Imputing Missing Values with the Median Strategy

In [None]:
from sklearn.impute import SimpleImputer

# Learn each column's median from the training features; it is used to fill missing values.
imputer = SimpleImputer(strategy="median")
imputer.fit(X=housing_price)


In [None]:
# Fill value learned for each column (the medians, given strategy="median").
imputer.statistics_

Transforming the Data

In [None]:
# Replace missing values in the training features with the fitted medians.
# The result is wrapped back into a DataFrame in the next cell.
A = imputer.transform(housing_price)


In [None]:
# Wrap the imputed values back into a DataFrame carrying the original column names.
housing_price_tr = pd.DataFrame(data=A, columns=housing_price.columns)
housing_price_tr.describe()

## Creating a Model_1 (Linear Regression)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the imputed training features against the MEDV labels.
model_1 = LinearRegression()
model_1.fit(X=housing_price_tr, y=housing_price_labels)

In [None]:
# Predictions on the training features, i.e. the same data model_1 was fitted on.
model_1.predict(housing_price_tr)

## Evaluation of Linear Regression Model

In [None]:
import numpy as np

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE of model_1, measured on the same training data it was fitted on.
predictions = model_1.predict(X=housing_price_tr)
error = mean_squared_error(y_true=housing_price_labels, y_pred=predictions)
rmse = np.sqrt(error)

In [None]:
# Training-set RMSE of model_1 (Linear Regression).
rmse

## Creating another Model_2 (Random Forest Regressor)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Seed the forest so its randomised training, and therefore every score reported
# below, is identical on each Restart & Run All. 42 matches the seed used for the splits.
model_2 = RandomForestRegressor(random_state=42)
model_2.fit(housing_price_tr , housing_price_labels)

In [None]:
# Predictions on the training features, i.e. the same data model_2 was fitted on.
model_2.predict(housing_price_tr)

## Evaluation of Random Forest Regressor

In [None]:
# RMSE of model_2, measured on the same training data it was fitted on.
predictions = model_2.predict(X=housing_price_tr)
error = mean_squared_error(y_true=housing_price_labels, y_pred=predictions)
rmse = np.sqrt(error)


In [None]:
# Training-set RMSE of model_2 (Random Forest Regressor).
rmse

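Comparing the two models above, model_2 (the Random Forest Regressor) clearly shows a better (lower) root mean squared error (RMSE) than model_1 (Linear Regression). Note that both RMSE values were computed on the training data, so they cannot by themselves show that model_2 is not overfitted; that has to be confirmed on held-out data. We will therefore use model_2 for the evaluation on the test data.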

## Testing our model_2 (Random Forest Regressor)

In [None]:
# Separate the held-out stratified test set into features and the MEDV target.
X_test = strat_test_set.drop("MEDV", axis=1)
y_test = strat_test_set["MEDV"].copy()

# Apply the median imputer that was fitted on the training features, exactly as was done
# for the training data, so the model is never given NaNs or differently prepared inputs.
X_test_prepared = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# RMSE of model_2 on data it has not seen during fitting.
test_data_predictions = model_2.predict(X_test_prepared)
error_final = mean_squared_error(y_test , test_data_predictions)
rmse_final = np.sqrt(error_final)

In [None]:
# model_2's predicted MEDV values for the held-out test set.
test_data_predictions

In [None]:
# RMSE of model_2 on the held-out stratified test set.
rmse_final