In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw housing data and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='../input/housing/housing.csv')
df.head(n=5)

In [None]:
# Dimensions of the raw frame as (rows, columns).
df.shape

In [None]:
# Summary statistics for the columns of the raw frame.
df.describe()

In [None]:
# How many rows fall into each ocean_proximity category.
df['ocean_proximity'].value_counts()

In [None]:
# One-hot encode the categorical ocean_proximity column into indicator columns.
df2 = pd.get_dummies(data=df, columns=['ocean_proximity'])
df2.head(n=5)

In [None]:
# Column dtypes and non-null counts of the encoded frame.
df2.info()

In [None]:
# Pairwise correlations between the numeric columns, annotated with their values.
# `df` still contains the string column `ocean_proximity`; recent pandas versions
# raise when `corr()` meets a non-numeric column instead of silently dropping it,
# so restrict the frame to numeric dtypes first.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Histograms of the columns of df (pandas DataFrame.hist), drawn on a 15x12 figure.
df.hist(figsize=(15,12))

In [None]:
# Number of missing values in each column.
df2.isna().sum()

In [None]:
# Fill missing total_bedrooms values with the column mean, then re-check null counts.
bedrooms_mean = df2['total_bedrooms'].mean()
df2['total_bedrooms'] = df2['total_bedrooms'].fillna(bedrooms_mean)
df2.isna().sum()

In [None]:
# Remove the geographic coordinate columns from the modelling frame.
df2 = df2.drop(columns=['longitude', 'latitude'])

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardization transformer used by the scaling cell further down.
sc = StandardScaler()

In [None]:
# Separate the regression target from the feature columns.
target_col = 'median_house_value'
y = df2[target_col]
x = df2.drop(columns=[target_col])

In [None]:
# Standardize the features and the target with *separate* scalers. Calling
# fit_transform on the same scaler for both refits it on the target, which discards
# the feature statistics: `sc` could then neither transform new feature rows nor
# be told apart from the target scaling.
x_scaled = sc.fit_transform(x.values)

# Dedicated target scaler; y_scaler.inverse_transform(...) maps scaled predictions
# back to the original target units. StandardScaler expects 2-D input, hence the
# reshape to a single column and the flatten back to 1-D afterwards.
y_scaler = StandardScaler()
y_scaled = y_scaler.fit_transform(y.values.reshape(-1, 1)).flatten()

# NOTE(review): both scalers are fit on the full dataset before the train/test
# split, so test-set statistics leak into preprocessing — consider fitting on the
# training rows only.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled,
    y_scaled,
    test_size=0.33,
    random_state=1,
)

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression estimator with default settings.
lr = LinearRegression()

In [None]:
# Fit the linear model on the training split.
lr.fit(X=x_train, y=y_train)

In [None]:
# Linear-model predictions for the held-out rows (in scaled target units).
y_pred = lr.predict(X=x_test)
y_pred

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
## Test score: R² of the linear model on the held-out split
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
## Train score: R² of the linear model on the rows it was fitted on
y_train_pred = lr.predict(x_train)
r2_score(y_true=y_train, y_pred=y_train_pred)

In [None]:
# Mean squared error of the linear model on the test split (scaled target units).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

In [None]:
# Mean absolute error of the linear model on the test split (scaled target units).
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mae

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation scores of the linear model on the training split.
cross_val_score(estimator=lr, X=x_train, y=y_train, cv=10)

In [None]:
## Test Score
# Despite its name, cv_score is the hold-out R² of the fitted linear model,
# not an aggregate of the cross-validation folds.
lr_test_pred = lr.predict(x_test)
cv_score = r2_score(y_true=y_test, y_pred=lr_test_pred)
cv_score

In [None]:
# Fitted coefficients of the linear model.
lr.coef_

In [None]:
# Linear-model coefficients labelled by feature name, largest first.
lr_coefs = pd.DataFrame({'Coefficients': lr.coef_}, index=x.columns)
lr_coefs.sort_values(by='Coefficients', ascending=False)

In [None]:
from sklearn.linear_model import Ridge
# Ridge regression with the library-default regularization strength spelled out.
rr = Ridge(alpha=1.0)

In [None]:
# Fit the ridge model on the training split.
rr.fit(X=x_train, y=y_train)

In [None]:
## Train score: R² of the ridge model on the rows it was fitted on
rr_train_pred = rr.predict(x_train)
r2_score(y_true=y_train, y_pred=rr_train_pred)

In [None]:
## Test score: R² of the ridge model on the held-out split
rr_test_pred = rr.predict(x_test)
ridge_score = r2_score(y_true=y_test, y_pred=rr_test_pred)
ridge_score

In [None]:
# Fitted coefficients of the ridge model.
rr.coef_

In [None]:
# Ridge coefficients labelled by feature name, largest first.
rr_coefs = pd.DataFrame({'Coefficients': rr.coef_}, index=x.columns)
rr_coefs.sort_values(by='Coefficients', ascending=False)

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Decision-tree regressor fitted on the training split.
# The tree builder shuffles candidate features at each split, so without a seed
# the fitted tree (and every score derived from it) can change between runs.
# Seed it with the same value used for the train/test split so results reproduce.
dtree = DecisionTreeRegressor(random_state=1)
dtree.fit(x_train, y_train)

In [None]:
## Train score: R² of the decision tree on the rows it was fitted on
dtree_train_pred = dtree.predict(x_train)
r2_score(y_true=y_train, y_pred=dtree_train_pred)

In [None]:
## The training score above was ~99%, which looks like overfitting,
## so check the tree with 10-fold cross-validation on the training split.
cross_val_score(estimator=dtree, X=x_train, y=y_train, cv=10)

In [None]:
## Test score: R² of the decision tree on the held-out split
dtree_test_pred = dtree.predict(x_test)
dtree_score = r2_score(y_true=y_test, y_pred=dtree_test_pred)
dtree_score

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random-forest regressor. Bootstrap sampling and per-split feature shuffling are
# random, so seed the estimator (same value as the train/test split) to make the
# fitted forest and its scores reproducible across runs.
rf = RandomForestRegressor(random_state=1)

In [None]:
# Fit the random forest on the training split.
rf.fit(X=x_train, y=y_train)

In [None]:
## Train score: R² of the random forest on the rows it was fitted on
rf_train_pred = rf.predict(x_train)
r2_score(y_true=y_train, y_pred=rf_train_pred)

In [None]:
## The training score above was ~94%, which looks like overfitting,
## so check the forest with 10-fold cross-validation on the training split.
cross_val_score(estimator=rf, X=x_train, y=y_train, cv=10)

In [None]:
## Test score: R² of the random forest on the held-out split
rf_test_pred = rf.predict(x_test)
rf_score = r2_score(y_true=y_test, y_pred=rf_test_pred)
rf_score

In [None]:
# Hold-out (test-set) R² of every fitted model, best first.
# `cv_score` holds the linear-regression test R² (r2_score on lr.predict(x_test)),
# not a cross-validation result, so its row is labelled by model like the others.
algorithm = [cv_score, ridge_score, dtree_score, rf_score]
index = ['Linear Regression', 'Ridge Regression', 'Decision Tree', 'Random Forest']
pd.DataFrame(algorithm, index=index, columns=['Scores']).sort_values(ascending=False, by=['Scores'])