In [2]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from statistics import mean

In [3]:
#Read input data
#The path is relative, so kc_house_data.csv must be in the notebook's working directory
df = pd.read_csv('kc_house_data.csv')

In [4]:
#Obtain first 10 rows to get a feel for the columns and the values they hold
df.head(10)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180.0,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170.0,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770.0,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050.0,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680.0,0,1987,0,98074,47.6168,-122.045,1800,7503
5,7237550310,20140512T000000,1230000.0,4,4.5,5420,101930,1.0,0,0,...,11,3890.0,1530,2001,0,98053,47.6561,-122.005,4760,101930
6,1321400060,20140627T000000,257500.0,3,2.25,1715,6819,2.0,0,0,...,7,1715.0,0,1995,0,98003,47.3097,-122.327,2238,6819
7,2008000270,20150115T000000,291850.0,3,1.5,1060,9711,1.0,0,0,...,7,1060.0,0,1963,0,98198,47.4095,-122.315,1650,9711
8,2414600126,20150415T000000,229500.0,3,1.0,1780,7470,1.0,0,0,...,7,1050.0,730,1960,0,98146,47.5123,-122.337,1780,8113
9,3793500160,20150312T000000,323000.0,3,2.5,1890,6560,2.0,0,0,...,7,1890.0,0,2003,0,98038,47.3684,-122.031,2390,7570


In [5]:
#Size of the raw data as (number of rows, number of columns)
df.shape

(21613, 21)

In [6]:
#Check whether the data is numerical
#Any column reported as 'object' holds non-numeric values and needs handling before modelling
df.dtypes

id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above       float64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

In [7]:
#Count the missing entries in every column
missing_mask = df.isna()
missing_mask.sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       2
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [8]:
#Drop the features that are not used for modelling:
#  id      - presumably just a record identifier; TODO confirm
#  date    - stored as a string (dtype object), so it cannot be fed to the regressors as-is
#  zipcode - presumably a categorical code rather than a quantity; TODO confirm
#df is re-bound instead of mutated in place, and errors='ignore' skips columns that are
#already gone, so re-running this cell no longer raises a KeyError.
df = df.drop(columns=['id', 'date', 'zipcode'], errors='ignore')

In [9]:
#Preview the data again to confirm which columns remain
df.head(10)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180.0,0,1955,0,47.5112,-122.257,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170.0,400,1951,1991,47.721,-122.319,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770.0,0,1933,0,47.7379,-122.233,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050.0,910,1965,0,47.5208,-122.393,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680.0,0,1987,0,47.6168,-122.045,1800,7503
5,1230000.0,4,4.5,5420,101930,1.0,0,0,3,11,3890.0,1530,2001,0,47.6561,-122.005,4760,101930
6,257500.0,3,2.25,1715,6819,2.0,0,0,3,7,1715.0,0,1995,0,47.3097,-122.327,2238,6819
7,291850.0,3,1.5,1060,9711,1.0,0,0,3,7,1060.0,0,1963,0,47.4095,-122.315,1650,9711
8,229500.0,3,1.0,1780,7470,1.0,0,0,3,7,1050.0,730,1960,0,47.5123,-122.337,1780,8113
9,323000.0,3,2.5,1890,6560,2.0,0,0,3,7,1890.0,0,2003,0,47.3684,-122.031,2390,7570


#### Only 2 rows contain missing values (both in `sqft_above`), so those rows are simply dropped

In [10]:
#Remove the rows with empty data
#axis=0 drops rows (not columns); how='any' drops a row as soon as one of its values is missing.
#The result is re-bound to df rather than using inplace=True, which keeps the step explicit.
df = df.dropna(axis=0, how='any')

In [11]:
#Verify that no missing values remain in any column
df.isnull().sum()

price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [12]:
#correlation matrix (pairwise correlation between all columns)
#The 'price' row/column shows how strongly each feature moves with the target
df.corr()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
price,1.0,0.308355,0.525146,0.702064,0.089656,0.256806,0.266332,0.397352,0.036409,0.667463,0.605565,0.323864,0.053977,0.126445,0.306925,0.021557,0.585372,0.082456
bedrooms,0.308355,1.0,0.515974,0.576763,0.03171,0.175418,-0.006581,0.079537,0.028433,0.356998,0.477616,0.303251,0.154248,0.018844,-0.00895,0.129516,0.39167,0.029252
bathrooms,0.525146,0.515974,1.0,0.754684,0.08773,0.500712,0.063743,0.187735,-0.124917,0.664981,0.685363,0.283737,0.505968,0.050733,0.024619,0.222987,0.568626,0.087163
sqft_living,0.702064,0.576763,0.754684,1.0,0.172841,0.354048,0.103829,0.284647,-0.058689,0.762727,0.876644,0.434925,0.318066,0.055377,0.05253,0.240187,0.75644,0.183301
sqft_lot,0.089656,0.03171,0.08773,0.172841,1.0,-0.005206,0.021602,0.074705,-0.008951,0.113617,0.183511,0.015301,0.053061,0.00764,-0.085673,0.229519,0.144605,0.718556
floors,0.256806,0.175418,0.500712,0.354048,-0.005206,1.0,0.023695,0.029432,-0.263808,0.458208,0.523899,-0.245634,0.489361,0.00633,0.049628,0.125446,0.279907,-0.011275
waterfront,0.266332,-0.006581,0.063743,0.103829,0.021602,0.023695,1.0,0.401857,0.016655,0.082775,0.072074,0.080618,-0.026172,0.092883,-0.01427,-0.041913,0.086463,0.030702
view,0.397352,0.079537,0.187735,0.284647,0.074705,0.029432,0.401857,1.0,0.045995,0.25132,0.167648,0.277051,-0.053474,0.103912,0.006172,-0.078408,0.28044,0.072569
condition,0.036409,0.028433,-0.124917,-0.058689,-0.008951,-0.263808,0.016655,0.045995,1.0,-0.144647,-0.158206,0.174273,-0.361384,-0.060617,-0.014965,-0.106453,-0.092795,-0.003397
grade,0.667463,0.356998,0.664981,0.762727,0.113617,0.458208,0.082775,0.25132,-0.144647,1.0,0.755924,0.168375,0.446958,0.014412,0.114102,0.198349,0.713197,0.119243


In [13]:
#Separate the target from the predictors
Y = df['price']                 #dependent variable: the sale price
X = df.drop(columns='price')    #independent variables: every remaining column

In [14]:
#split the data
#test_size=0.2 holds out 20% of the rows for testing; random_state=42 fixes the shuffle so the split is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [15]:
#Apply linear regression
#Ordinary least squares without any regularization; this is the baseline the Ridge and Lasso models are compared against
lr = LinearRegression()
lr.fit(X_train, Y_train)

In [16]:
#Returns r2 score for training data
#(for scikit-learn regressors, score() is the coefficient of determination R^2)
lr.score(X_train, Y_train)

0.691934916961433

In [17]:
#apply ridge regression with different values of alpha (the regularization strength)
alpha = []
scores = []
#Running a loop to evaluate the performance of the model for different values of alpha.
#The grid used here (0.01, 0.02, ..., 0.20) is arbitrary and can be modified. This isn't a rule.
for i in range(1, 21):
    alpha_value = i * 0.01
    rm = Ridge(alpha=alpha_value)
    #Perform 10-fold cross validation on the training data to estimate performance on unseen data.
    #cross_val_score clones the estimator and fits it on every fold itself, so a separate
    #rm.fit(...) call beforehand is unnecessary and has been removed.
    score = cross_val_score(rm, X_train, Y_train, cv=10)
    #converting the mean score across the folds into a percentage
    avg_score = mean(score) * 100
    scores.append(avg_score)
    alpha.append(alpha_value)

In [18]:
#Tabulate the mean cross-validation score (in %) obtained for each alpha
ridge_df = pd.DataFrame({'alpha':alpha, 'score':scores})
ridge_df

Unnamed: 0,alpha,score
0,0.01,68.858746
1,0.02,68.858748
2,0.03,68.85875
3,0.04,68.858752
4,0.05,68.858754
5,0.06,68.858756
6,0.07,68.858757
7,0.08,68.858759
8,0.09,68.858761
9,0.1,68.858763


In [19]:
#Refit ridge on the whole training set with alpha=0.2, the last value of the grid searched above (20 * 0.01)
#NOTE(review): the cross-validation scores differ only around the 6th decimal between alphas, so this choice barely matters
rm_best = Ridge(alpha=0.2)
rm_best.fit(X_train, Y_train)

In [20]:
#r2-score of the chosen ridge model on the training data
rm_best.score(X_train, Y_train)

0.6919348506648952

In [21]:
#Returns r2-score of the chosen ridge model on the held-out test data
rm_best.score(X_test, Y_test)

0.7063790389189295

In [22]:
#r2-score of plain linear regression on the same test data, for comparison with ridge
lr.score(X_test, Y_test)

0.7064058524234053

In [23]:
# Mean cross-validation score (in %) for each alpha tried
cross_val_scores_lasso = []

# List to maintain the different values of alpha
alpha = []

# Loop to compute the cross-validation scores for alpha = 0.25, 0.5, ..., 2.0
for i in range(1, 9):
    alpha_value = i * 0.25
    #tol - tolerance of the optimizer that fits the Lasso coefficients (it does not control the regularization itself).
    # Once the coefficient updates become smaller than tol, the solver checks the duality gap and stops when that is
    # below tol as well. 0.0925 is far looser than scikit-learn's default of 1e-4.
    lm = Lasso(alpha = alpha_value, tol = 0.0925)
    #cross_val_score clones the estimator and fits it on every fold itself, so a separate
    #lm.fit(...) call beforehand is unnecessary and has been removed.
    score = cross_val_score(lm, X_train, Y_train, cv = 10)
    #converting the mean score across the folds into a percentage
    avg_score = mean(score)*100
    cross_val_scores_lasso.append(avg_score)
    alpha.append(alpha_value)

In [24]:
#Tabulate the mean cross-validation score (in %) obtained for each alpha
lasso_df = pd.DataFrame({'alpha':alpha, 'score':cross_val_scores_lasso})
lasso_df

Unnamed: 0,alpha,score
0,0.25,68.858749
1,0.5,68.858754
2,0.75,68.858759
3,1.0,68.858764
4,1.25,68.858768
5,1.5,68.858773
6,1.75,68.858778
7,2.0,68.858782


In [25]:
#Best score is obtained at i=8 i.e alpha=2
#Refit on the whole training set with that alpha, using the same tol as in the search
lm_best = Lasso(alpha=2, tol=0.0925)
lm_best.fit(X_train, Y_train)

In [26]:
#Returns r2-score for lasso regression on the held-out test data
lm_best.score(X_test, Y_test)

0.706397012010829

In [27]:
#r2-score of plain linear regression on the test data, repeated here for comparison with lasso
lr.score(X_test, Y_test)

0.7064058524234053

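### Conclusion: Ridge and Lasso regression did not improve on plain linear regression (test R² ≈ 0.706 for all three models). A likely reason is that the features were not scaled: the regularization penalty acts on the coefficient magnitudes, which depend on each feature's scale. Try scaling the features and check whether the regularized models improve!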