In [102]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt
import plotly.express as px
import plotly.graph_objects as go

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [57]:
# loading the data
train_data  = pd.read_csv("train.csv")
test_data  = pd.read_csv("test.csv")

In [58]:
# Understanding train data
train_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [59]:
# Understanding test data
test_data
# here we are missing with sales price which we have it in training data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [64]:
# getting more into what columns are
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [63]:
# we can drop ID as I think its of no use 
train_data.drop(columns=["Id"],inplace=True)

In [65]:
# describing data
train_data.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [None]:
# creating pair plot of data to understand it in more intuitive way
fig = px.histogram(
    train_data,
    x="SalePrice",
    nbins=100,
    title="Distribution of Sales Price of Houses",
    opacity=0.5)
fig.update_layout(
    xaxis_title="Values",
    yaxis_title="Frequency",
    height=500,
    bargap=0.1,
    template="plotly_white" )

fig.show()


In [112]:
# checking outliers 
box_plot = px.box(train_data, x = "SalePrice", title="Box plot to check Outliers")
box_plot.show()

In [133]:
# I need to check how different features are effecting the sales price for that I'll find correlation wrt the sales price
target = "SalePrice"
correlation_scores_numerical = {}
correlation_scores_categorical = {}
numerical_columns = train_data.select_dtypes(include=["int64","float64"]).columns
categorical_columns = train_data.select_dtypes(include=["object"]).columns
for x in numerical_columns:
    if x != target:
        correlation = train_data[[x,target]].corr().iloc[0,1]
        correlation_scores_numerical[x] = round(float(correlation),2)

for x in categorical_columns:
    if x != target:
        encode = pd.get_dummies(train_data[x])
        temp = pd.concat([encode, train_data[target]],axis=1)
        correlation = temp.corr()[target].drop(target)
        correlation_scores_categorical[x] = round(float(correlation.abs().max()),2)
correlation_scores_numerical = dict(sorted(correlation_scores_numerical.items(), reverse=True, key = lambda x : x[1]))
correlation_scores_categorical = dict(sorted(correlation_scores_categorical.items(), reverse=True, key = lambda x : x[1]))
print(correlation_scores_numerical)
print(correlation_scores_categorical)

{'OverallQual': 0.79, 'GrLivArea': 0.71, 'GarageCars': 0.64, 'GarageArea': 0.62, 'TotalBsmtSF': 0.61, '1stFlrSF': 0.61, 'FullBath': 0.56, 'TotRmsAbvGrd': 0.53, 'YearBuilt': 0.52, 'YearRemodAdd': 0.51, 'GarageYrBlt': 0.49, 'MasVnrArea': 0.48, 'Fireplaces': 0.47, 'BsmtFinSF1': 0.39, 'LotFrontage': 0.35, '2ndFlrSF': 0.32, 'WoodDeckSF': 0.32, 'OpenPorchSF': 0.32, 'HalfBath': 0.28, 'LotArea': 0.26, 'BsmtFullBath': 0.23, 'BsmtUnfSF': 0.21, 'BedroomAbvGr': 0.17, 'ScreenPorch': 0.11, 'PoolArea': 0.09, 'MoSold': 0.05, '3SsnPorch': 0.04, 'BsmtFinSF2': -0.01, 'BsmtHalfBath': -0.02, 'MiscVal': -0.02, 'LowQualFinSF': -0.03, 'YrSold': -0.03, 'MSSubClass': -0.08, 'OverallCond': -0.08, 'EnclosedPorch': -0.13, 'KitchenAbvGr': -0.14}
{'ExterQual': 0.59, 'BsmtQual': 0.55, 'KitchenQual': 0.52, 'Foundation': 0.5, 'BsmtFinType1': 0.43, 'HeatingQC': 0.43, 'GarageFinish': 0.42, 'Neighborhood': 0.4, 'SaleType': 0.36, 'GarageType': 0.35, 'SaleCondition': 0.35, 'FireplaceQu': 0.34, 'MasVnrType': 0.33, 'Exterior1

In [140]:
explained_variance_numerical = []
explained_variance_categorical = []
for x in correlation_scores_numerical.values():
    variance = round(x**2 * 100,2)
    explained_variance_numerical.append(variance)
for x in correlation_scores_categorical.values():
    variance = round(x**2 * 100,2)
    explained_variance_categorical.append(variance)
print("Numerical Values Variance :",explained_variance_numerical)
print("Categorical Values Variance :",explained_variance_categorical)

Numerical Values Variance : [62.41, 50.41, 40.96, 38.44, 37.21, 37.21, 31.36, 28.09, 27.04, 26.01, 24.01, 23.04, 22.09, 15.21, 12.25, 10.24, 10.24, 10.24, 7.84, 6.76, 5.29, 4.41, 2.89, 1.21, 0.81, 0.25, 0.16, 0.01, 0.04, 0.04, 0.09, 0.09, 0.64, 0.64, 1.69, 1.96]
Categorical Values Variance : [34.81, 30.25, 27.04, 25.0, 18.49, 18.49, 17.64, 16.0, 12.96, 12.25, 12.25, 11.56, 10.89, 9.61, 9.61, 9.61, 8.41, 7.84, 7.29, 6.25, 6.25, 5.76, 5.76, 5.76, 5.29, 2.89, 1.96, 1.96, 1.96, 1.96, 1.96, 1.96, 1.69, 1.44, 1.44, 1.44, 1.44, 0.81, 0.49, 0.25, 0.25, 0.16, 0.01]


##### The square of the correlation (R²) represents the proportion of variance in the target variable explained by each feature.
##### we can see here that numerical values itself can give more than 95% variance  

In [103]:
# checking if there are null values here
train_data.isnull().sum()

MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType        872
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinSF1          0
BsmtFinType2       38
BsmtFinSF2          0
BsmtUnfSF           0
TotalBsmtSF         0
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
1stFlrSF            0
2ndFlrSF            0
LowQualFinSF        0
GrLivArea 

##### lot of null values exist here and before doing anything lets see how model works around it

In [107]:
# dividing training data
X = train_data.drop(columns=["SalePrice"])
Y = train_data["SalePrice"]
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.25, random_state=1111)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(1095, 79)
(365, 79)
(1095,)
(365,)


In [141]:
LinearReg = LinearRegression()
LinearReg.fit(x_train, y_train)


ValueError: could not convert string to float: 'RL'

In [50]:
predict = LinearReg.predict(x_test)
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, predict))
print("Root Mean Squared Error (RMSE):", rmse)

# Calculate MAE
mae = mean_absolute_error(y_test, predict)
print("Mean Absolute Error (MAE):", mae)

# Calculate MSE
mse = mean_squared_error(y_test, predict)
print("Mean Squared Error (MSE):", mse)

# Calculate R² Score
r2 = r2_score(y_test, predict)
print("R² Score:", r2)

Root Mean Squared Error (RMSE): 8.935619222462396e-12
Mean Absolute Error (MAE): 6.471126985233572e-12
Mean Squared Error (MSE): 7.984529088883947e-23
R² Score: 1.0


In [47]:
y_test

924     1520.74
598     1071.47
353     1025.90
978     1326.80
1192    2055.03
         ...   
97       784.54
1085    1486.02
633     1028.71
863     1334.87
654     1098.26
Name: close, Length: 252, dtype: float64

In [44]:
pipe = Pipeline([("scaler",StandardScaler()), 
                 ("elasticnet",ElasticNet())])
parameters = {"elasticnet__alpha": [0.001, 0.01, 0.1, 1, 10, 100],
              "elasticnet__l1_ratio": [0.10, 0.50, 0.75, 0.95],
              "elasticnet__max_iter": [1000, 5000,10000]}
grid_search = GridSearchCV(pipe, parameters, cv = 5, scoring="r2")
grid_search.fit(x_train, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best Parameters: {'elasticnet__alpha': 0.001, 'elasticnet__l1_ratio': 0.75, 'elasticnet__max_iter': 10000}
Best Score: 0.9999719451945115
