In [126]:
#importing california housing dataset
import pandas as pd
from sklearn.datasets import fetch_california_housing
#fetch the California housing bunch, then build one frame holding the features plus a "target" column
cal=fetch_california_housing()
housing=pd.DataFrame(cal.data,columns=cal.feature_names).assign(target=cal.target)

In [127]:
#preview the first rows of the assembled frame (features plus target)
housing.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [128]:
#count missing values per column
housing.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
target        0
dtype: int64

In [129]:
#cleaning data
#dropping Latitude and Longitude on the assumption that they have little linear influence on the target
#NOTE(review): that assumption is not checked anywhere in this notebook - confirm before relying on it
#rebinding (instead of inplace=True) together with errors="ignore" keeps this cell safe to re-run
housing=housing.drop(columns=["Latitude","Longitude"],errors="ignore")

In [130]:
#confirm that Latitude and Longitude are gone
housing.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,3.422


In [131]:
#pairwise correlation matrix over all remaining columns, target included
housing.corr()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,target
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,0.688075
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.105623
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.151948
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,-0.046701
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.02465
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,-0.023737
target,0.688075,0.105623,0.151948,-0.046701,-0.02465,-0.023737,1.0


In [132]:
#function for finding the highly correlated features-
def correlation(df,threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    The correlation matrix of ``df`` is computed with ``df.corr()``. A column is
    reported when the absolute correlation between it and at least one column
    positioned before it in ``df`` is strictly greater than ``threshold``. Only
    the later column of each correlated pair is reported.

    Parameters
    ----------
    df : DataFrame whose columns are compared pairwise.
    threshold : cut-off applied to the absolute correlation value.

    Returns
    -------
    set of column names.
    """
    corr_matrix=df.corr()
    names=corr_matrix.columns
    flagged=set()
    for row,name in enumerate(names):
        # range(row) restricts the scan to the strictly lower triangle of the matrix
        if any(abs(corr_matrix.iloc[row,col])>threshold for col in range(row)):
            flagged.add(name)
    return flagged

In [133]:
#flag predictors (target excluded) whose absolute correlation with an earlier predictor exceeds 0.7
corr_features=correlation(df=housing.drop(columns="target"),threshold=0.7)
corr_features

{'AveBedrms'}

In [134]:
#dropping the features flagged by correlation() (absolute correlation above the 0.7 threshold)
#driven by corr_features instead of a hard-coded column name, so the drop follows the computed result;
#rebinding with errors="ignore" keeps the cell safe to re-run
housing=housing.drop(columns=list(corr_features),errors="ignore")

In [135]:
#confirm that the correlated column has been removed
housing.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,Population,AveOccup,target
0,8.3252,41.0,6.984127,322.0,2.555556,4.526
1,8.3014,21.0,6.238137,2401.0,2.109842,3.585
2,7.2574,52.0,8.288136,496.0,2.80226,3.521
3,5.6431,52.0,5.817352,558.0,2.547945,3.413
4,3.8462,52.0,6.281853,565.0,2.181467,3.422


In [136]:
#scaling the features with z-score standardisation (StandardScaler): each column is shifted to zero mean
#and scaled to unit variance; this does not change the shape of a distribution or make it gaussian
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [137]:
#standardise every predictor column; errors="ignore" lets this cell re-run after target has been removed from housing
#NOTE(review): the scaler is fitted on all rows before the train/test split further down, so test-set
#statistics influence the scaling - consider fitting on the training rows only
housing_numpy=scaler.fit_transform(housing.drop(columns="target",errors="ignore"))

In [138]:
#remove the target so housing.columns lines up with the scaled feature matrix;
#rebinding with errors="ignore" (instead of inplace=True) makes the cell safe to re-run
housing=housing.drop(columns="target",errors="ignore")

In [139]:
#wrap the scaled array in a DataFrame, reusing the feature names from housing (target must already be dropped)
hous=pd.DataFrame(housing_numpy,columns=housing.columns)

In [140]:
#preview the standardised features
hous.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,Population,AveOccup
0,2.344766,0.982143,0.628559,-0.974429,-0.049597
1,2.332238,-0.607019,0.327041,0.861439,-0.092512
2,1.782699,1.856182,1.15562,-0.820777,-0.025843
3,0.932968,1.856182,0.156966,-0.766028,-0.050329
4,-0.012881,1.856182,0.344711,-0.759847,-0.085616


In [144]:
#keep the target as a single-column DataFrame so it can be handed to a scaler as 2-D input
target=pd.DataFrame(cal.target)
#the target gets its own scaler in the next cell; calling fit_transform on `scaler` here would refit it
#and discard the statistics it learned from the features

In [145]:
#standardise the target with a dedicated scaler so the feature scaler's fitted statistics stay untouched
#NOTE(review): fitted on all rows before the train/test split, so test-set statistics influence the scaling
#later error metrics are therefore in standardised target units (second_scaler.inverse_transform maps back)
second_scaler=StandardScaler()
hous["target"]=second_scaler.fit_transform(target)

In [146]:
#preview the standardised features together with the standardised target
hous.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,Population,AveOccup,target
0,2.344766,0.982143,0.628559,-0.974429,-0.049597,2.129631
1,2.332238,-0.607019,0.327041,0.861439,-0.092512,1.314156
2,1.782699,1.856182,1.15562,-0.820777,-0.025843,1.258693
3,0.932968,1.856182,0.156966,-0.766028,-0.050329,1.1651
4,-0.012881,1.856182,0.344711,-0.759847,-0.085616,1.1729


In [147]:
#implementing linear regression from scratch
#all scaled feature columns are used as predictors, i.e. this is multiple (not single-feature) linear regression
#80/20 train/test split with a fixed random_state so the split is reproducible
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(hous.drop("target",axis=1),hous.target,test_size=0.2,random_state=100)

In [148]:
#sanity-check the split sizes
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((16512, 5), (4128, 5), (16512,), (4128,))

In [149]:
import numpy as np

In [79]:
class LinearRegression:
    """Linear model trained with batch gradient descent on the squared error.

    Attributes
    ----------
    lr : step size applied to every gradient update.
    n_iters : number of full-batch gradient steps performed by ``fit``.
    weights : per-feature coefficients (``None`` until ``fit`` is called).
    bias : intercept term (``None`` until ``fit`` is called).
    """

    def __init__(self, learning_rate=0.001, n_iters=1000):
        self.lr, self.n_iters = learning_rate, n_iters
        self.weights = self.bias = None

    def _linear(self, X):
        # Shared forward pass: X . weights + bias.
        return np.dot(X, self.weights) + self.bias

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` from ``X`` (n_samples x n_features) and ``y``."""
        sample_count, feature_count = X.shape
        inv_n = 1 / sample_count

        # Start from the all-zero model.
        self.weights = np.zeros(feature_count)
        self.bias = 0

        for _step in range(self.n_iters):
            residual = self._linear(X) - y

            # Averaged gradients of the squared error w.r.t. weights and bias.
            grad_w = inv_n * np.dot(X.T, residual)
            grad_b = inv_n * np.sum(residual)

            self.weights -= self.lr * grad_w
            self.bias -= self.lr * grad_b

    def predict(self, X):
        """Return the model output for every row of ``X``."""
        return self._linear(X)

In [150]:
class LinearRegression:
    """Multiple linear regression fitted with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float, default 0.001
        Step size applied to every gradient update.
    iterations : int, default 1000
        Number of full-batch gradient steps performed by ``fit``.

    Attributes
    ----------
    weights : numpy array of shape (n_features,), ``None`` before ``fit``.
    bias : float intercept, ``None`` before ``fit``.
    """
    def __init__(self,learning_rate=0.001,iterations=1000):
        self.learning_rate=learning_rate
        self.iterations=iterations
        self.weights=None
        self.bias=None

    def fit(self,X,y):
        """Learn ``weights`` and ``bias`` from the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) or (n_samples, 1)

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or ``y`` does not hold exactly one
            value per row of ``X``.
        """
        X=np.asarray(X,dtype=float)
        # Flatten the target: an (n, 1) column would otherwise broadcast the
        # residual to an (n, n) matrix and silently corrupt the weights.
        y=np.asarray(y,dtype=float).reshape(-1)

        if X.ndim!=2:
            raise ValueError("X must be 2-D with shape (n_samples, n_features)")
        n_samples,n_features=X.shape
        if n_samples==0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0]!=n_samples:
            raise ValueError("y must contain exactly one value per row of X")

        # Start from the all-zero model.
        self.weights=np.zeros(n_features)
        self.bias=0.0

        for _ in range(self.iterations):
            residual=np.dot(X,self.weights)+self.bias-y

            # Averaged gradients of the squared error w.r.t. weights and bias.
            dw=(1/n_samples)*np.dot(X.T,residual)
            db=(1/n_samples)*np.sum(residual)

            self.weights=self.weights-self.learning_rate*dw
            self.bias=self.bias-self.learning_rate*db

    def predict(self,X):
        """Return the predicted target for every row of ``X`` as a numpy array."""
        return np.dot(np.asarray(X,dtype=float),self.weights)+self.bias



In [151]:
#train the from-scratch model: learning rate 0.0001, 1000 gradient-descent iterations
#NOTE(review): this run looks under-converged - the R^2 reported below is ~0.09 versus ~0.54 for
#scikit-learn on the same split; presumably a larger learning rate or more iterations is needed
model=LinearRegression(0.0001,1000)
model.fit(X_train,y_train)

In [152]:
#predictions of the from-scratch model on the held-out rows
y_pred=model.predict(X_test)

In [153]:
from sklearn.metrics import mean_squared_error as mse

In [154]:
#test-set mean squared error of the from-scratch model (target is standardised, so this is in z-score units)
mse(y_test,y_pred)

0.9183647378762976

In [155]:
from sklearn.metrics import r2_score as rs

In [156]:
#test-set R^2 of the from-scratch model
rs(y_test,y_pred)

0.09243629768048933

In [157]:
#import scikit-learn's estimator under an alias so it does not overwrite the from-scratch
#LinearRegression class defined earlier in this notebook (with the plain name, re-running the
#earlier LinearRegression(0.0001,1000) cell would instantiate scikit-learn's class instead)
from sklearn.linear_model import LinearRegression as SklearnLinearRegression
regressor=SklearnLinearRegression()

In [158]:
#fit the scikit-learn baseline on the same training split
regressor.fit(X_train,y_train)

In [159]:
#test-set mean squared error of the scikit-learn baseline
mse(y_test,regressor.predict(X_test))

0.4695391522239564

In [160]:
#test-set R^2 of the scikit-learn baseline
rs(y_test,regressor.predict(X_test))

0.5359831733503054

In [161]:
#show the constructor settings the baseline estimator was created with
regressor.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}

In [162]:
# calculating coefficients
# one row per training column: the column name next to the coefficient the baseline fitted for it
coeff = pd.DataFrame(X_train.columns).assign(**{'Coefficient Estimate': pd.Series(regressor.coef_)})
coeff

Unnamed: 0,0,Coefficient Estimate
0,MedInc,0.725629
1,HouseAge,0.190825
2,AveRooms,-0.05268
3,Population,0.025745
4,AveOccup,-0.038423


In [163]:
#score() on the test split; the value shown matches the r2_score result computed above
regressor.score(X_test,y_test)

0.5359831733503054