In [1]:
from sklearn.datasets import fetch_california_housing
import numpy as np
import pandas as pd
%matplotlib inline 
import matplotlib.pyplot as plt
import seaborn as sns



In [6]:
# Load the California housing dataset bundle via scikit-learn.
california = fetch_california_housing()


In [17]:
california

In [8]:
california.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [16]:
print(california.DESCR)

In [11]:
california.target_names

['MedHouseVal']

In [15]:
california.feature_names

In [14]:
california.target

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [19]:
# Wrap the raw feature matrix in a DataFrame, labelling each column with the
# dataset's own feature names.
feature_columns = california.feature_names
dataset = pd.DataFrame(data=california.data, columns=feature_columns)
dataset.head()

In [21]:
# Attach the regression target as a 'price' column alongside the features.
dataset = dataset.assign(price=california.target)
dataset.head()

In [23]:
# Pairwise correlation of every feature and the target.
# An explicit, larger figure and a fixed two-decimal format keep the annotated
# cells legible; the title lets the figure stand on its own when skimmed.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(dataset.corr(), annot=True, fmt=".2f", ax=ax)
ax.set_title("Correlation matrix: features and price");  # ';' hides the repr

In [25]:
# Select by column name rather than by position, so the split stays correct
# even if the column order changes or more columns are added after 'price'.
X = dataset.drop(columns=["price"])  # independent features
y = dataset["price"]  # dependent feature (target)


In [29]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [30]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((13828, 8), (6812, 8), (13828,), (6812,))

In [33]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [36]:
X_train_scaled,X_test_scaled

# Model Training

In [37]:
from sklearn.linear_model import LinearRegression

# Linear regression estimator with default settings; the bare name on the
# last line echoes it as the cell output.
regression = LinearRegression()
regression

In [38]:
# Fit on the standardized training features.
regression.fit(X=X_train_scaled, y=y_train)

In [39]:
# Fitted coefficients (slopes), one per input feature (8 here), in the same
# order as the columns of X. The model was fit on standardized features, so
# each value is the change in the prediction per one standard deviation of
# that feature.
regression.coef_

array([ 8.46603472e-01,  1.20333548e-01, -2.98800785e-01,  3.47243173e-01,
       -8.81413334e-05, -4.17242067e-02, -8.94420371e-01, -8.70401317e-01])

In [40]:
# Intercept (constant term) of the fitted linear model.
regression.intercept_

2.0666362048018536

In [41]:
# Predict targets for the held-out, standardized test features.
y_pred_test = regression.predict(X_test_scaled)

In [42]:
y_pred_test

array([0.72563462, 1.76650223, 2.70545812, ..., 1.25803135, 1.66673014,
       2.25826279])

In [45]:
# Error metrics on the held-out test set.
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)  # square root of MSE puts the error back in target units


In [46]:
# Emit MSE, MAE and RMSE, one per line, in that order.
for metric_value in (mse, mae, rmse):
    print(metric_value)

0.5369686543372458
0.5295710106684451
0.7327814505957734


In [47]:
# Coefficient of determination (R^2) of the test-set predictions.
from sklearn.metrics import r2_score

score = r2_score(y_test, y_pred_test)
print(score)

0.5970494128783954


In [49]:
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1),
# where n is the number of test samples and p the number of predictors.
n_samples = len(y_test)
n_features = X_test.shape[1]
1 - (1 - score) * (n_samples - 1) / (n_samples - n_features - 1)

0.5965755624158093

In [53]:
import pickle

# Persist the fitted scaler and the fitted model to disk.
# The context managers guarantee each file is flushed and closed before any
# later cell reads it back.
# NOTE: the '.pk1' extension (digit one, probably meant as '.pkl') is kept
# unchanged so the paths match what the loading cells open.
with open('scaler.pk1', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
with open('regressor.pk1', 'wb') as regressor_file:
    pickle.dump(regression, regressor_file)


In [55]:
# Reload the persisted model and predict on the already-scaled test features
# to confirm it reproduces the in-memory model's output.
# NOTE: pickle.load can execute arbitrary code -- only load files you created
# yourself. The context manager closes the file handle as soon as loading ends.
with open('regressor.pk1', 'rb') as regressor_file:
    model_regressor = pickle.load(regressor_file)
model_regressor.predict(X_test_scaled)

array([0.72563462, 1.76650223, 2.70545812, ..., 1.25803135, 1.66673014,
       2.25826279])

In [56]:
# Reload the persisted scaler; the context manager closes the file handle
# as soon as loading ends.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('scaler.pk1', 'rb') as scaler_file:
    standard_scaler = pickle.load(scaler_file)


In [57]:
# End-to-end check: raw test features -> reloaded scaler -> reloaded model.
X_test_rescaled = standard_scaler.transform(X_test)
model_regressor.predict(X_test_rescaled)

array([0.72563462, 1.76650223, 2.70545812, ..., 1.25803135, 1.66673014,
       2.25826279])