In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score
import statsmodels.api as sm

In [2]:
original_df = pd.read_csv(r'C:\Users\USER\Documents\Python\Nareshit data analysis\stats and ML\ML\26th- mlr\MLR\House_data.csv')

In [3]:
df = original_df.copy()
df.head(2)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639


In [4]:
df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [5]:
df.shape

(21613, 21)

In [6]:
# Initialize two lists
unique_cols = []
repeated_cols = []

# Check each column
for col in df.columns:
    if df[col].is_unique:
        unique_cols.append(col)
    else:
        repeated_cols.append(col)

# Print the results
print("Columns with unique values only:", unique_cols)
print("Columns with repeated values:", repeated_cols)

Columns with unique values only: []
Columns with repeated values: ['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15']


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [8]:
df.isnull().sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [9]:
df_numeric = df.select_dtypes(include='number')
df_numeric.corr()['price'].sort_values(ascending=False)

price            1.000000
sqft_living      0.702035
grade            0.667434
sqft_above       0.605567
sqft_living15    0.585379
bathrooms        0.525138
view             0.397293
sqft_basement    0.323816
bedrooms         0.308350
lat              0.307003
waterfront       0.266369
floors           0.256794
yr_renovated     0.126434
sqft_lot         0.089661
sqft_lot15       0.082447
yr_built         0.054012
condition        0.036362
long             0.021626
id              -0.016762
zipcode         -0.053203
Name: price, dtype: float64

In [10]:
# Drop columns.
df = df.drop(['id','date'],axis=1)

In [11]:
# Segrigation of DV and IV
x = df.drop(columns='price',axis=1)
y=df[['price']]

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=0)

## 1. Linear Model

In [13]:

lin_model = LinearRegression()
lin_model.fit(x_train, y_train)

In [14]:
# Predict
y_pred = lin_model.predict(x_test)
actual = y_test.values.ravel()
predicted = y_pred.ravel()

test_pred = pd.DataFrame({
    'Actual price': actual,
    'Predicted price': predicted
})

test_pred['Error'] = test_pred['Actual price'] - test_pred['Predicted price']
test_pred['Absolute Error'] = test_pred['Error'].abs()

# Optional: Format to suppress scientific notation
pd.set_option('display.float_format', '{:,.2f}'.format)
print(test_pred.head())

   Actual price  Predicted price       Error  Absolute Error
0    297,000.00       378,448.22  -81,448.22       81,448.22
1  1,578,000.00     1,539,749.37   38,250.63       38,250.63
2    562,100.00       544,459.90   17,640.10       17,640.10
3    631,500.00       577,803.94   53,696.06       53,696.06
4    780,000.00       979,922.93 -199,922.93      199,922.93


In [15]:
# Evaluation
from sklearn.metrics import r2_score, mean_squared_error
print("R2 Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("Bias score:",lin_model.score(x_train, y_train))
print('Variance score:',lin_model.score(x_test, y_test))

R2 Score: 0.6949310095117811
MSE: 36280106778.877686
RMSE: 190473.3755118486
Bias score: 0.7005349311144583
Variance score: 0.6949310095117811


## OLS

In [16]:
cols = list(x_train.columns)
pmax = 1
while len(cols) > 0:
    X_1 = sm.add_constant(x_train[cols])
    model = sm.OLS(y_train, X_1).fit()
    p_values = model.pvalues.iloc[1:]  # exclude intercept
    pmax = p_values.max()
    feature_with_p_max = p_values.idxmax()
    if pmax > 0.05:
        cols.remove(feature_with_p_max)
    else:
        break

In [17]:
selected_features = cols
print("Selected Features:", selected_features)

Selected Features: ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15']


In [18]:
ols_model = sm.OLS(y_train, X_1[selected_features]).fit()
print(ols_model.summary())

                                 OLS Regression Results                                
Dep. Variable:                  price   R-squared (uncentered):                   0.904
Model:                            OLS   Adj. R-squared (uncentered):              0.904
Method:                 Least Squares   F-statistic:                          1.018e+04
Date:                Sat, 05 Apr 2025   Prob (F-statistic):                        0.00
Time:                        17:07:43   Log-Likelihood:                     -2.3590e+05
No. Observations:               17290   AIC:                                  4.718e+05
Df Residuals:                   17274   BIC:                                  4.720e+05
Df Model:                          16                                                  
Covariance Type:            nonrobust                                                  
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------

In [19]:
## Step 8: Build Final Model using selected features
final_model = LinearRegression()
final_model.fit(x_train[selected_features], y_train)

In [20]:
# Predict
y_pred = final_model.predict(x_test[selected_features])
actual = y_test.values.ravel()
predicted = y_pred.ravel()

test_pred = pd.DataFrame({
    'Actual price': actual,
    'Predicted price': predicted
})

test_pred['Error'] = test_pred['Actual price'] - test_pred['Predicted price']
test_pred['Absolute Error'] = test_pred['Error'].abs()
# Optional: Format to suppress scientific notation
pd.set_option('display.float_format', '{:,.2f}'.format)

print(test_pred.head())

   Actual price  Predicted price       Error  Absolute Error
0    297,000.00       368,769.93  -71,769.93       71,769.93
1  1,578,000.00     1,541,859.35   36,140.65       36,140.65
2    562,100.00       544,498.30   17,601.70       17,601.70
3    631,500.00       579,240.95   52,259.05       52,259.05
4    780,000.00       981,753.69 -201,753.69      201,753.69


In [21]:
# Evaluation
print("R2 Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("Bias score:",final_model.score(x_train[selected_features], y_train))
print('Variance score:',final_model.score(x_test[selected_features], y_test))

R2 Score: 0.694932184042933
MSE: 36279967098.6187
RMSE: 190473.00884539704
Bias score: 0.7004769312123038
Variance score: 0.694932184042933


## 2. Ridge

In [22]:
from sklearn.linear_model import RidgeCV

# Define a range of alpha values to test
alpha_values = np.linspace(0.01, 100, 10000)

# RidgeCV automatically selects the best alpha
ridge_cv = RidgeCV(alphas=alpha_values, store_cv_results=True)
ridge_cv.fit(x_train, y_train)

# Best alpha selected
best_alpha = ridge_cv.alpha_
print("Optimal alpha:", best_alpha)

Optimal alpha: 0.92


In [23]:
ridge_model1 = Ridge(alpha=0.92)
ridge_model1.fit(x_train,y_train)

In [24]:
# Predict
y_pred = ridge_model1.predict(x_test)
actual = y_test.values.ravel()
predicted = y_pred.ravel()

test_pred = pd.DataFrame({
    'Actual price': actual,
    'Predicted price': predicted
})

test_pred['Error'] = test_pred['Actual price'] - test_pred['Predicted price']
test_pred['Absolute Error'] = test_pred['Error'].abs()
# Optional: Format to suppress scientific notation
pd.set_option('display.float_format', '{:,.2f}'.format)
print(test_pred.head())

   Actual price  Predicted price       Error  Absolute Error
0    297,000.00       378,190.67  -81,190.67       81,190.67
1  1,578,000.00     1,539,504.83   38,495.17       38,495.17
2    562,100.00       544,310.62   17,789.38       17,789.38
3    631,500.00       577,701.45   53,798.55       53,798.55
4    780,000.00       979,948.79 -199,948.79      199,948.79


In [25]:
# Evaluation

print("R2 Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("Bias score:",ridge_model1.score(x_train, y_train))
print('Variance score:',ridge_model1.score(x_test, y_test))

R2 Score: 0.6949963493705137
MSE: 36272336283.92198
RMSE: 190452.97656881603
Bias score: 0.700533257015671
Variance score: 0.6949963493705137


In [26]:
ridge_model = Ridge(alpha=0.3)
ridge_model.fit(x_train,y_train)

In [27]:
# Predict
y_pred = ridge_model.predict(x_test)
actual = y_test.values.ravel()
predicted = y_pred.ravel()

test_pred = pd.DataFrame({
    'Actual price': actual,
    'Predicted price': predicted
})

test_pred['Error'] = test_pred['Actual price'] - test_pred['Predicted price']
test_pred['Absolute Error'] = test_pred['Error'].abs()
# Optional: Format to suppress scientific notation
pd.set_option('display.float_format', '{:,.2f}'.format)
print(test_pred.head())

   Actual price  Predicted price       Error  Absolute Error
0    297,000.00       378,364.06  -81,364.06       81,364.06
1  1,578,000.00     1,539,669.37   38,330.63       38,330.63
2    562,100.00       544,411.09   17,688.91       17,688.91
3    631,500.00       577,770.41   53,729.59       53,729.59
4    780,000.00       979,931.85 -199,931.85      199,931.85


In [28]:
# Evaluation

print("R2 Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("Bias score:",ridge_model.score(x_train, y_train))
print('Variance score:',ridge_model.score(x_test, y_test))

R2 Score: 0.6949527684433526
MSE: 36277519113.83988
RMSE: 190466.58266961132
Bias score: 0.7005347515224238
Variance score: 0.6949527684433526


## 3. Lasso

In [29]:
from sklearn.linear_model import LassoCV

# Generate a range of alpha values from 0.01 to 100
alpha_values = np.linspace(0.01, 100, 1000)

# LassoCV automatically selects the best alpha
lasso_cv = LassoCV(alphas=alpha_values, cv=5)  # 5-fold cross-validation
lasso_cv.fit(x_train, y_train.values.ravel())

# Best alpha selected
best_alpha = lasso_cv.alpha_
print("Optimal alpha for Lasso:", best_alpha)

Optimal alpha for Lasso: 9.118198198198197


In [30]:
lasso_model1 = Lasso(alpha=9.118198198198197)
lasso_model1.fit(x_train,y_train)

In [31]:
# Predict
y_pred = lasso_model1.predict(x_test)
actual = y_test.values.ravel()
predicted = y_pred.ravel()

test_pred = pd.DataFrame({
    'Actual price': actual,
    'Predicted price': predicted
})

test_pred['Error'] = test_pred['Actual price'] - test_pred['Predicted price']
test_pred['Absolute Error'] = test_pred['Error'].abs()
# Optional: Format to suppress scientific notation
pd.set_option('display.float_format', '{:,.2f}'.format)
print(test_pred.head())

   Actual price  Predicted price       Error  Absolute Error
0    297,000.00       378,316.94  -81,316.94       81,316.94
1  1,578,000.00     1,539,601.71   38,398.29       38,398.29
2    562,100.00       544,420.11   17,679.89       17,679.89
3    631,500.00       577,741.13   53,758.87       53,758.87
4    780,000.00       979,921.37 -199,921.37      199,921.37


In [32]:
# Evaluation

print("R2 Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("Bias score:",lasso_model1.score(x_train, y_train))
print('Variance score:',lasso_model1.score(x_test, y_test))

R2 Score: 0.6949504596903078
MSE: 36277793680.60872
RMSE: 190467.30344237227
Bias score: 0.7005347530491663
Variance score: 0.6949504596903078


In [33]:
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(x_train,y_train)

print(f'The coefficients are {lasso_model.coef_}')
print(f'The intercept is {lasso_model.intercept_}')

The coefficients are [-3.46741806e+04  3.94306711e+04  2.87966319e+02  1.75884796e-01
  7.42448422e+03  6.05586370e+05  5.27374002e+04  2.61691928e+04
  9.35904106e+04 -1.02568308e+02 -1.35532690e+02 -2.60714982e+03
  2.01053747e+01 -5.67454577e+02  6.04859517e+05 -2.22383006e+05
  2.42330713e+01 -4.67454659e-01]
The intercept is [4166172.13051429]


In [34]:
# Predict
y_pred = lasso_model.predict(x_test)
actual = y_test.values.ravel()
predicted = y_pred.ravel()

test_pred = pd.DataFrame({
    'Actual price': actual,
    'Predicted price': predicted
})

test_pred['Error'] = test_pred['Actual price'] - test_pred['Predicted price']
test_pred['Absolute Error'] = test_pred['Error'].abs()
# Optional: Format to suppress scientific notation
pd.set_option('display.float_format', '{:,.2f}'.format)
print(test_pred.head())

   Actual price  Predicted price       Error  Absolute Error
0    297,000.00       378,446.78  -81,446.78       81,446.78
1  1,578,000.00     1,539,747.75   38,252.25       38,252.25
2    562,100.00       544,459.47   17,640.53       17,640.53
3    631,500.00       577,803.25   53,696.75       53,696.75
4    780,000.00       979,922.91 -199,922.91      199,922.91


In [35]:
# Evaluation

print("R2 Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("Bias score:",lasso_model.score(x_train, y_train))
print('Variance score:',lasso_model.score(x_test, y_test))

R2 Score: 0.6949312247370665
MSE: 36280081183.36791
RMSE: 190473.30832263065
Bias score: 0.7005349310930411
Variance score: 0.6949312247370665


## New process by elimination of feature on the basis of .corr() function

In [36]:
df_numeric = df.select_dtypes(include='number')
corr_with_price=df_numeric.corr()['price'].sort_values(ascending=False)

In [37]:
# Exlcude feature having corr value less than 0.1
selected_features = corr_with_price[corr_with_price > 0.1].index.drop('price').tolist()
print("Selected Features:", selected_features)

Selected Features: ['sqft_living', 'grade', 'sqft_above', 'sqft_living15', 'bathrooms', 'view', 'sqft_basement', 'bedrooms', 'lat', 'waterfront', 'floors', 'yr_renovated']


In [38]:
# Taking only those feature having corr value mor than 0.1
x_selected =  x[selected_features]

In [39]:
X_train, X_test, Y_train, Y_test = train_test_split(x_selected,y,test_size=0.3, random_state=0)

## Linear model

In [40]:
lin_model_n = LinearRegression()
lin_model_n.fit(X_train, Y_train)

In [41]:
# Predict
Y_pred = lin_model_n.predict(X_test)
actual = Y_test.values.ravel()
predicted = Y_pred.ravel()

test_pred = pd.DataFrame({
    'Actual price': actual,
    'Predicted price': predicted
})

test_pred['Error'] = test_pred['Actual price'] - test_pred['Predicted price']
test_pred['Absolute Error'] = test_pred['Error'].abs()
# Optional: Format to suppress scientific notation
pd.set_option('display.float_format', '{:,.2f}'.format)
print(test_pred.head())

   Actual price  Predicted price       Error  Absolute Error
0    297,000.00       443,037.44 -146,037.44      146,037.44
1  1,578,000.00     1,393,667.47  184,332.53      184,332.53
2    562,100.00       464,322.24   97,777.76       97,777.76
3    631,500.00       471,155.86  160,344.14      160,344.14
4    780,000.00     1,076,630.46 -296,630.46      296,630.46


## Linear OLS model

In [42]:
import statsmodels.api as sm
cols = list(X_train.columns)
pmax = 1
while len(cols) > 0:
    X_1 = sm.add_constant(X_train[cols])
    model = sm.OLS(Y_train, X_1).fit()
    p_values = model.pvalues.iloc[1:]  # exclude intercept
    pmax = p_values.max()
    feature_with_p_max = p_values.idxmax()
    if pmax > 0.05:
        cols.remove(feature_with_p_max)
    else:
        break

In [43]:
selected_features = cols
print("Selected Features:", selected_features)

Selected Features: ['sqft_living', 'grade', 'sqft_above', 'sqft_living15', 'bathrooms', 'view', 'sqft_basement', 'bedrooms', 'lat', 'waterfront', 'floors', 'yr_renovated']


In [44]:
selected_features = ['sqft_living', 'grade', 'sqft_above', 'sqft_living15', 'bathrooms', 'view', 'sqft_basement', 'bedrooms', 'lat', 'waterfront', 'yr_renovated']
ols_model = sm.OLS(Y_train, X_train[selected_features]).fit()
print(ols_model.summary())

                                 OLS Regression Results                                
Dep. Variable:                  price   R-squared (uncentered):                   0.875
Model:                            OLS   Adj. R-squared (uncentered):              0.875
Method:                 Least Squares   F-statistic:                          1.062e+04
Date:                Sat, 05 Apr 2025   Prob (F-statistic):                        0.00
Time:                        17:09:56   Log-Likelihood:                     -2.0827e+05
No. Observations:               15129   AIC:                                  4.166e+05
Df Residuals:                   15119   BIC:                                  4.166e+05
Df Model:                          10                                                  
Covariance Type:            nonrobust                                                  
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------

In [45]:
lin_model_n_1 = LinearRegression()
lin_model_n_1.fit(X_train[selected_features], Y_train)

In [46]:
# Predict
Y_pred = lin_model_n_1.predict(X_test[selected_features])
actual = Y_test.values.ravel()
predicted = Y_pred.ravel()

test_pred = pd.DataFrame({
    'Actual price': actual,
    'Predicted price': predicted
})

test_pred['Error'] = test_pred['Actual price'] - test_pred['Predicted price']
test_pred['Absolute Error'] = test_pred['Error'].abs()
# Optional: Format to suppress scientific notation
pd.set_option('display.float_format', '{:,.2f}'.format)
print(test_pred.head())

   Actual price  Predicted price       Error  Absolute Error
0    297,000.00       477,815.63 -180,815.63      180,815.63
1  1,578,000.00     1,383,684.53  194,315.47      194,315.47
2    562,100.00       462,315.21   99,784.79       99,784.79
3    631,500.00       463,665.08  167,834.92      167,834.92
4    780,000.00     1,075,909.18 -295,909.18      295,909.18


In [47]:
# Evaluation

print("R2 Score:", r2_score(Y_test, Y_pred))
print("MSE:", mean_squared_error(Y_test, Y_pred))
print("RMSE:", np.sqrt(mean_squared_error(Y_test, Y_pred)))
print("Bias score:",lin_model_n_1.score(X_train[selected_features], Y_train))
print('Variance score:',lin_model_n_1.score(X_test[selected_features], Y_test))

R2 Score: 0.6478033736754233
MSE: 48353762187.976685
RMSE: 219894.88895373783
Bias score: 0.6660892593156131
Variance score: 0.6478033736754233


## Ridge

In [48]:
ridge_model_n = Ridge(alpha=0.92)
ridge_model_n.fit(X_train,Y_train)

In [49]:
# Predict
Y_pred = ridge_model_n.predict(X_test)
actual = Y_test.values.ravel()
predicted = Y_pred.ravel()

test_pred = pd.DataFrame({
    'Actual price': actual,
    'Predicted price': predicted
})

test_pred['Error'] = test_pred['Actual price'] - test_pred['Predicted price']
test_pred['Absolute Error'] = test_pred['Error'].abs()
# Optional: Format to suppress scientific notation
pd.set_option('display.float_format', '{:,.2f}'.format)
print(test_pred.head())

   Actual price  Predicted price       Error  Absolute Error
0    297,000.00       442,806.67 -145,806.67      145,806.67
1  1,578,000.00     1,393,514.55  184,485.45      184,485.45
2    562,100.00       464,097.08   98,002.92       98,002.92
3    631,500.00       470,978.36  160,521.64      160,521.64
4    780,000.00     1,076,627.50 -296,627.50      296,627.50


In [50]:
# Evaluation

print("R2 Score:", r2_score(Y_test, Y_pred))
print("MSE:", mean_squared_error(Y_test, Y_pred))
print("RMSE:", np.sqrt(mean_squared_error(Y_test, Y_pred)))
print("Bias score:",ridge_model_n.score(X_train, Y_train))
print('Variance score:',ridge_model_n.score(X_test, Y_test))

R2 Score: 0.6495193449859765
MSE: 48118173137.79818
RMSE: 219358.54926990691
Bias score: 0.6667117280829296
Variance score: 0.6495193449859765
