In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import cv2
from pandas.plotting import scatter_matrix
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [None]:
# Directory that holds the dataset CSV; load_data() joins file names onto it.
# NOTE(review): looks like a Kaggle-style input mount — confirm when running elsewhere.
Data_PATH = "../input/california-housing-prices"

In [None]:
def load_data(path, data_dir=None):
    """Read a CSV file from the dataset directory into a DataFrame.

    Parameters
    ----------
    path : str
        File name (or relative path) of the CSV inside the dataset directory.
    data_dir : str, optional
        Directory to read from. When omitted, the notebook-level ``Data_PATH``
        is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Contents of the CSV as parsed by ``pandas.read_csv``.
    """
    # Resolve the default at call time so that re-assigning Data_PATH in its
    # cell takes effect without re-defining this function.
    base_dir = Data_PATH if data_dir is None else data_dir
    csv_path = os.path.join(base_dir, path)
    return pd.read_csv(csv_path)

In [None]:
# Load the housing table from the dataset directory.
housing = load_data("housing.csv")
# Show the first rows as the cell output for a quick look at the columns.
housing.head()

In [None]:
# Print each column's dtype and non-null count.
housing.info()

In [None]:
# Number of rows per category in the ocean_proximity column.
housing["ocean_proximity"].value_counts()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the table.
housing.describe()

In [None]:
# One histogram per numeric column to inspect the value distributions.
hist_options = {"bins": 50, "figsize": (15, 15)}
housing.hist(**hist_options)
plt.show()

In [None]:
# Scatter of each row's location: marker size follows population and
# marker colour follows the median house value.
housing.plot.scatter(
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"] / 100,
    label="population",
    figsize=(10, 7),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
plt.title("Geospatial Visualization")
plt.legend()
plt.show()

In [None]:
# Overlay the location scatter plot on a map image.
image_path = "../input/image/California.png"
raw_img = cv2.imread(image_path)
# cv2.imread returns None instead of raising when the file cannot be read;
# fail here with a clear message rather than deep inside cvtColor.
if raw_img is None:
    raise FileNotFoundError(f"Could not read map image: {image_path}")
# OpenCV loads images in BGR channel order; matplotlib expects RGB.
Position_img = cv2.cvtColor(raw_img, cv2.COLOR_BGR2RGB)

ax = housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
                  s=housing["population"]/100, label="population", figsize=(10, 7),
                  c="median_house_value", cmap=plt.get_cmap('jet'), colorbar=True)
# Draw the image on the scatter's own axes. `extent` places the image at
# [left, right, bottom, top] in the plot's longitude/latitude coordinates.
ax.imshow(Position_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5)
ax.set_title("Geospatial Visualization")
ax.legend()
plt.show()

In [None]:
# Grid of pairwise scatter plots to eyeball relationships between columns.
scatter_matrix(housing,figsize=(15,20))
plt.show()

In [None]:
# Pairwise correlation of the numeric columns only. The frame still contains
# the categorical ocean_proximity column at this point, and DataFrame.corr()
# in pandas >= 2.0 raises on non-numeric data instead of silently dropping it.
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix

In [None]:
# Annotated heatmap of the correlation matrix on a 15x15 inch figure.
fig, heat_ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, ax=heat_ax)
plt.show()

In [None]:
# Fill the missing total_bedrooms values with the column median.
housing_median = housing["total_bedrooms"].median()
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not update `housing` once Copy-on-Write is enabled.
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(housing_median)

# Equivalent with scikit-learn (SimpleImputer replaces the removed Imputer class):
# imputer = SimpleImputer(strategy="median")
# housing[["total_bedrooms"]] = imputer.fit_transform(housing[["total_bedrooms"]])
# housing.isnull().sum()

# Re-check the non-null counts after imputation.
housing.info()

In [None]:
# Bar chart of the number of rows in each ocean_proximity category.
count = housing["ocean_proximity"].value_counts()
plt.figure(figsize=(10, 6))
plt.bar(x=count.index, height=count)
plt.show()

In [None]:
# One-hot encode ocean_proximity into indicator columns; other columns pass through.
housing_prepared = pd.get_dummies(housing,columns=['ocean_proximity'])
housing_prepared.head()

In [None]:
# Separate the prediction target from the feature columns.
y = housing_prepared["median_house_value"]
X = housing_prepared.drop(columns=["median_house_value"])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scale = StandardScaler().fit(X_train)
X_train = scale.transform(X_train)
X_test = scale.transform(X_test)

In [None]:
# Fit a linear regression on the scaled training data and show its parameters.
lin_reg_model = LinearRegression()
lin_reg_model.fit(X_train, y_train)

print(f"Coefficients are: {lin_reg_model.coef_}")
# Label corrected from the misspelled "Intrception".
print(f"Intercept is: {lin_reg_model.intercept_}")

In [None]:
# Compare the first five test-set predictions with the true target values.
y_pred = lin_reg_model.predict(X_test)
print("Predicted Values are:", y_pred[:5], sep="\n")
print("Actual values are: ", y_test[:5], sep="\n")

In [None]:
# Root-mean-squared error of the linear model on both splits.
# lin_mse / lin_rmse are reused, so after this cell they hold the test values.
y_train_pred = lin_reg_model.predict(X_train)
y_test_pred = lin_reg_model.predict(X_test)

lin_mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
lin_rmse = np.sqrt(lin_mse)
print(f"Train RMSE: {lin_rmse}")

lin_mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
lin_rmse = np.sqrt(lin_mse)
print(f"Test RMSE: {lin_rmse}")

In [None]:
# score() on a regressor returns the coefficient of determination (R^2),
# not a classification accuracy, so the output is labelled accordingly.
print(f"Train R^2: {lin_reg_model.score(X_train, y_train)}")
print(f"Test R^2: {lin_reg_model.score(X_test, y_test)}")

## Underfitting

## Decision Tree Model

In [None]:
# Fit a decision tree with default settings and measure its training error.
# random_state is pinned (same seed as the train/test split) so the fitted
# tree, and every score derived from it, is identical on each run.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, y_train)
housing_pred = tree_reg.predict(X_train)
tree_mse = mean_squared_error(y_train, housing_pred)
tree_rmse = np.sqrt(tree_mse)
print(f"Train RMSE: {tree_rmse}")

In [None]:
# score() on a regressor returns the coefficient of determination (R^2),
# not a classification accuracy, so the output is labelled accordingly.
print(f"Train R^2: {tree_reg.score(X_train, y_train)}")
print(f"Test R^2: {tree_reg.score(X_test, y_test)}")

## Overfitting

## Cross Validation of Decision Tree

In [None]:
# 10-fold cross-validation of the decision tree on the training split.
# The scorer reports negated MSE, so flip the sign before taking the root.
scores = cross_val_score(
    estimator=tree_reg,
    X=X_train,
    y=y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array). Nothing is returned; the three lines go to stdout.
    """
    # Each entry pairs a label with a lazily evaluated value, so the values
    # are computed in the same order in which they are printed.
    report = (
        ("Scores: ", lambda s: s),
        ("Mean: ", lambda s: s.mean()),
        ("Standard deviation: ", lambda s: s.std()),
    )
    for label, compute in report:
        print(label, compute(scores))
    
# Report the per-fold RMSE values of the decision tree with their mean and spread.
display_scores(tree_rmse_scores)

## Cross Validation of Linear Regression

In [None]:
# 10-fold cross-validation of the linear model on the training split.
lin_scores = cross_val_score(lin_reg_model, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
# Take the root of lin_scores: the earlier code used `scores`, which holds the
# decision-tree results, so this cell re-printed the tree's RMSE values.
lin_rmse_scores = np.sqrt(-lin_scores)

display_scores(lin_rmse_scores)

## Random Forest Regressor

In [None]:
# Fit a random forest with default settings and measure its training error.
# random_state is pinned (same seed as the train/test split) because the
# forest is built from random bootstrap samples; without it every run
# produces different scores.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(X_train, y_train)
housing_pred = forest_reg.predict(X_train)
forest_mse = mean_squared_error(y_train, housing_pred)
forest_rmse = np.sqrt(forest_mse)
print(f"Train RMSE: {forest_rmse}")

In [None]:
# score() on a regressor returns the coefficient of determination (R^2),
# not a classification accuracy, so the output is labelled accordingly.
print(f"Train R^2: {forest_reg.score(X_train, y_train)}")
print(f"Test R^2: {forest_reg.score(X_test, y_test)}")

In [None]:
# 10-fold cross-validation of the random forest on the training split.
# The scorer reports negated MSE, so flip the sign before taking the root.
scores = cross_val_score(
    estimator=forest_reg,
    X=X_train,
    y=y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
forest_rmse_scores = np.sqrt(-scores)
display_scores(forest_rmse_scores)