In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Load California House Pricing Dataset

In [None]:
from sklearn.datasets import fetch_california_housing

In [None]:
# Load the California housing data; the returned object is inspected in the
# following cells through keys(), DESCR, data, target and feature_names.
california = fetch_california_housing()

In [None]:
# List the fields available on the loaded dataset object.
california.keys()

In [None]:
# Print the description text (DESCR) stored on the dataset object.
print(california["DESCR"])

In [None]:
# Print the raw feature matrix.
print(california["data"])

In [None]:
# Show the target values that are later stored in the 'Price' column.
california["target"]

In [None]:
# Show the feature names that are used as DataFrame column labels.
california["feature_names"]

 ## Preparing the Dataset

In [None]:
# Wrap the feature matrix in a DataFrame with one named column per feature.
dataset = pd.DataFrame(
    data=california.data,
    columns=california.feature_names,
)

In [None]:
# Preview the first five rows of the feature table.
dataset.head(n=5)

In [None]:
# Attach the regression target as a 'Price' column (modifies `dataset` in place).
dataset["Price"] = california.target

In [None]:
# Preview the first five rows again, now including the 'Price' column.
dataset.head(n=5)

In [None]:
# Print the DataFrame's structural summary (columns, dtypes, non-null counts).
dataset.info()

## Summarizing the stats of the Data

In [None]:
# Descriptive statistics for each column of the table.
dataset.describe()

## Check Missing Values

In [None]:
# Count the missing values in each column.
dataset.isnull().sum()

### Exploratory Data Analysis

## Correlation

In [None]:
# Pairwise Pearson correlation between all columns, including 'Price'.
dataset.corr(method="pearson")

In [None]:
import seaborn as sns

In [None]:
# Grid of pairwise plots across every column of the table.
sns.pairplot(data=dataset)

In [None]:
# Scatter of median income against the target price.
plt.scatter(x=dataset["MedInc"], y=dataset["Price"])
plt.xlabel("Median Income")
plt.ylabel("Price")

In [None]:
# Regression plot of 'MedInc' against 'Price'.
sns.regplot(x="MedInc", y="Price", data=dataset)

In [None]:
# Regression plot of 'Latitude' against 'Price'.
sns.regplot(x="Latitude", y="Price", data=dataset)

In [None]:
# Regression plot of 'Population' against the target 'Price', matching the
# other feature-vs-target plots in this section (the y-axis was previously
# 'Latitude', which is a feature rather than the value being predicted).
sns.regplot(x="Population", y="Price", data=dataset)

## Independent and Dependent Features

In [None]:
# Select features and target by column name rather than by position, so the
# split stays correct even if columns are added to or reordered in `dataset`.
X = dataset.drop(columns="Price")
y = dataset["Price"]

In [None]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

In [None]:
# Preview the first five target values.
y.head(n=5)

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Display the training features returned by train_test_split.
X_train

In [None]:
# Display the held-out test features returned by train_test_split.
X_test

In [None]:
# Display the training targets returned by train_test_split.
y_train

In [None]:
# Display the held-out test targets returned by train_test_split.
y_test

In [None]:
# Standardize the dataset: create the scaler that the next cells fit on the
# training split and then apply to the test split.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features, then replace X_train with the
# scaled values. NOTE: X_train is overwritten, so re-running this cell without
# re-running the split would fit the scaler on already-scaled data.
X_train = scaler.fit(X_train).transform(X_train)

In [None]:
# Scale the test features with the already-fitted scaler (transform only, no
# refit), then overwrite X_test with the scaled values.
X_test = scaler.transform(X_test)

In [None]:
import pickle

# Persist the fitted scaler to 'scaling.pkl'. The context manager guarantees
# the file is flushed and closed even if pickling raises.
with open('scaling.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Display X_train after it has been replaced by the scaler's output.
X_train

In [None]:
# Display X_test after it has been replaced by the scaler's output.
X_test

## Model Training

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Create the linear regression estimator with its default settings.
reg = LinearRegression()

In [None]:
# Train the regression on the scaled training features and their targets.
reg.fit(X=X_train, y=y_train)

In [None]:
# Print the fitted coefficients (the intercept is printed in the next cell).
print(reg.coef_)

In [None]:
# Print the fitted intercept term.
print(reg.intercept_)

In [None]:
# Show the estimator's configuration parameters as reported by get_params()
# (these are settings of the estimator object, not the learned coefficients).
reg.get_params(deep=True)

### Predictions with Test Data

In [None]:
# Predict prices for the scaled test features.
reg_pred = reg.predict(X=X_test)

In [None]:
# Display the predictions made for the test split.
reg_pred

## Plot a scatter plot for predictions

In [None]:
# Actual vs. predicted prices on the test split. Axis labels are set so the
# figure can be read on its own, as in the earlier MedInc/Price scatter.
plt.scatter(y_test, reg_pred)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')

## Residuals

In [None]:
# Residual = actual test target minus the model's prediction.
residuals = y_test - reg_pred

In [None]:
# Display the residuals computed in the previous cell.
residuals

## Plot the Residuals

In [None]:
# Kernel density estimate of the residual distribution.
sns.displot(data=residuals, kind="kde")

## Scatter plot with prediction and residuals

In [None]:
# Residuals against predicted values. Axis labels are set so the figure can
# be read on its own.
plt.scatter(reg_pred, residuals)
plt.xlabel('Predicted Price')
plt.ylabel('Residual (actual - predicted)')

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Compute each error metric once and print it with a label, so the three
# numbers in the output are unambiguous. RMSE reuses the MSE already computed.
mse = mean_squared_error(y_test, reg_pred)
mae = mean_absolute_error(y_test, reg_pred)
print('MSE :', mse)
print('MAE :', mae)
print('RMSE:', np.sqrt(mse))

## R-squared and Adjusted R-squared

In [None]:
from sklearn.metrics import r2_score

# R-squared of the predictions against the actual test targets.
score = r2_score(y_true=y_test, y_pred=reg_pred)

In [None]:
# Display the R-squared value computed in the previous cell.
score

In [None]:
# Adjusted R-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is the
# number of test samples and p is the number of feature columns.
n_samples = len(y_test)
n_features = X_test.shape[1]
1 - (1 - score) * (n_samples - 1) / (n_samples - n_features - 1)

## New Data Prediction

In [None]:
# Shape of the first sample once it is expanded into a single-row 2-D array.
california.data[0][np.newaxis, :].shape

In [None]:
# Scale the first sample as a one-row DataFrame carrying the same column names
# the scaler was fitted with; a bare ndarray makes scikit-learn emit a
# feature-name mismatch warning.
scaler.transform(
    pd.DataFrame(california.data[:1], columns=california.feature_names)
)

In [None]:
# Predict the price of the first sample. The sample is passed to the scaler as
# a one-row DataFrame with the fitted column names, which avoids scikit-learn's
# feature-name mismatch warning for bare ndarrays.
reg.predict(
    scaler.transform(
        pd.DataFrame(california.data[:1], columns=california.feature_names)
    )
)

## Pickling the model file for deployment

In [None]:
import pickle

In [None]:
# Persist the trained model to 'regmodel.pkl'. The context manager guarantees
# the file is flushed and closed even if pickling raises.
with open('regmodel.pkl', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [None]:
# Reload the model written in the previous cell; the context manager closes
# the file handle. NOTE: pickle.load can execute arbitrary code, so only load
# pickle files from a trusted source.
with open('regmodel.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

In [None]:
# Check the reloaded model on the first sample. The sample is passed to the
# scaler as a one-row DataFrame with the fitted column names, which avoids
# scikit-learn's feature-name mismatch warning for bare ndarrays.
pickled_model.predict(
    scaler.transform(
        pd.DataFrame(california.data[:1], columns=california.feature_names)
    )
)