# Housing Price Prediction

## Importing the data

In [8]:
import pandas as pd

# Read the housing dataset from the CSV file (path is relative to the
# notebook's working directory) into the frame used by every later cell.
raw_data = pd.read_csv(filepath_or_buffer='housing.csv')

In [9]:
# Display the loaded DataFrame to inspect its columns and a sample of rows.
raw_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


## Let's check the correlations!

In [10]:
# Pearson correlation between every pair of numeric columns.
# `ocean_proximity` holds strings, and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0, so restrict the frame to numeric dtypes first.
correlation = raw_data.select_dtypes(include='number').corr()

# Rank the columns by their correlation with the prediction target.
correlation['median_house_value'].sort_values(ascending=False)

median_house_value    1.000000
median_income         0.688075
total_rooms           0.134153
housing_median_age    0.105623
households            0.065843
total_bedrooms        0.049686
population           -0.024650
longitude            -0.045967
latitude             -0.144160
Name: median_house_value, dtype: float64

## Plotting a scatter matrix of some promising attributes from the correlations above!

In [None]:
from pandas.plotting import scatter_matrix

# The target plus the three columns with the highest positive correlation to it.
attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']

# Pairwise scatter plots of the selected columns.
scatter_matrix(raw_data.loc[:, attributes], figsize=(12, 8))

## Check for null values!

In [None]:
# Count the missing values in each column.
raw_data.isna().sum()

## Fill the null values with the median!

In [None]:
# Impute missing `total_bedrooms` entries with the column's median.
bedrooms = raw_data['total_bedrooms']
median = bedrooms.median()
raw_data['total_bedrooms'] = bedrooms.fillna(value=median)

# Re-count missing values per column to confirm the fill.
raw_data.isna().sum()

## Check if we could find some more insights about the data!

In [None]:
# Print the frame summary: column names, non-null counts, dtypes and memory usage.
raw_data.info()

## Adding more attributes that might prove useful!

In [None]:
# Derive ratio features from the raw count columns (added in place to raw_data).
raw_data['bedrooms_per_rooms'] = raw_data['total_bedrooms'].div(raw_data['total_rooms'])
raw_data['population_per_household'] = raw_data['population'].div(raw_data['households'])
raw_data['rooms_per_household'] = raw_data['total_rooms'].div(raw_data['households'])

# Display the frame with the three new columns appended.
raw_data

## Checking for correlations!

In [None]:
# Recompute the correlations now that the derived ratio columns exist.
# `ocean_proximity` holds strings, and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0, so restrict the frame to numeric dtypes first.
correlation = raw_data.select_dtypes(include='number').corr()

# Rank the columns by their correlation with the prediction target.
correlation['median_house_value'].sort_values(ascending=False)

## Handling categorical data

In [None]:
# List the distinct category labels in the `ocean_proximity` column.
raw_data.loc[:, 'ocean_proximity'].unique()

## Using Label Encoder on categorical column

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each `ocean_proximity` label to an integer code; the fitted encoder
# stays available as `l_encode` for later cells.
l_encode = LabelEncoder()
categorical = l_encode.fit_transform(raw_data['ocean_proximity'])

## OneHotEncoder would give better results

In [None]:
from sklearn.preprocessing import OneHotEncoder

o_encode = OneHotEncoder()

# Reshape the integer codes into a single-column 2-D array before encoding.
codes_column = categorical.reshape(-1, 1)
categorical_1 = o_encode.fit_transform(codes_column)

In [None]:
# Densify the one-hot matrix and wrap it in a DataFrame with *string* column
# names. The default integer labels (0, 1, ...) would mix with raw_data's
# string labels after concatenation, and scikit-learn >= 1.2 refuses to fit
# on a frame whose column names are of mixed types.
# The one-hot input is the LabelEncoder codes 0..k-1 and OneHotEncoder orders
# its categories ascending, so output column i corresponds to
# l_encode.classes_[i].
categorical = pd.DataFrame(
    categorical_1.toarray(),
    columns=[f'ocean_proximity_{label}' for label in l_encode.classes_],
    index=raw_data.index,  # keep rows aligned with raw_data for pd.concat
)

## Adding the encoded column to main data frame!

In [None]:
# Cast the one-hot indicator columns to integers, then append them to the
# right of raw_data (column-wise concatenation).
encoded_int = categorical.astype(int)
concatenated_1 = pd.concat([raw_data, encoded_int], axis='columns')

# Display the combined frame.
concatenated_1

In [None]:
# Build the modelling frame without the original string column, which the
# one-hot columns now represent.
# A non-mutating drop is used on purpose: `housing = concatenated_1` followed
# by an in-place drop would also strip the column from `concatenated_1`
# (same object) and make this cell raise KeyError when re-run.
housing = concatenated_1.drop(columns=['ocean_proximity'])

In [None]:
# Display the modelling frame after the `ocean_proximity` column was dropped.
housing

## Last check for any missing values!

In [None]:
# Count the missing values in each column of the modelling frame.
housing.isna().sum()

## Splitting the data into train and test

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the median house value.
X = housing.drop(columns=['median_house_value'])
y = housing['median_house_value']

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split identical on every run, so the error metrics computed below are
# reproducible and comparable across models.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split
# (fit() returns the estimator itself).
linear = LinearRegression().fit(train_X, train_y)

# Predict the held-out rows and wrap the predictions in a DataFrame.
linear_predictions = linear.predict(test_X)
predict_linear = pd.DataFrame(linear_predictions)

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Root-mean-squared error of the linear model on the test split.
linear_mse = mean_squared_error(test_y, predict_linear)
linear_error = np.sqrt(linear_mse)
linear_error

# Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seed the estimator so the fitted tree, and therefore its test error,
# is the same on every run.
tree = DecisionTreeRegressor(random_state=42)
tree.fit(train_X, train_y)

# Predictions for the held-out rows.
predict_tree = tree.predict(test_X)

In [None]:
# Root-mean-squared error of the decision tree on the test split.
tree_mse = mean_squared_error(test_y, predict_tree)
tree_error = np.sqrt(tree_mse)
tree_error

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the ensemble so bootstrapping and feature sampling are repeatable;
# otherwise the fitted forest and its test error change on every run.
forest = RandomForestRegressor(random_state=42)
forest.fit(train_X, train_y)

# Predictions for the held-out rows.
predict_forest = forest.predict(test_X)

In [None]:
# Root-mean-squared error of the random forest on the test split.
forest_mse = mean_squared_error(test_y, predict_forest)
forest_error = np.sqrt(forest_mse)
forest_error

In [None]:
# NOTE(review): `sklearn.externals` is imported but nothing in this cell uses
# it; the standalone `joblib` package performs the serialisation.
import sklearn.externals
import joblib

# Persist the fitted random forest to disk so it can be reloaded later.
joblib.dump(value=forest, filename='housing-price-prediction.joblib')