In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the training data; the relative path is resolved against the current working directory.
df = pd.read_csv('train.csv')

In [3]:
# List every column name to see which fields are available.
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [4]:
# Regression target: the SalePrice column pulled out as a plain NumPy array.
y = df['SalePrice'].values

In [5]:
# Distinct values present in the SaleCondition column.
df['SaleCondition'].unique()

array(['Normal', 'Abnorml', 'Partial', 'AdjLand', 'Alloca', 'Family'],
      dtype=object)

In [6]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(1460, 81)

In [7]:
# Distinct column dtypes, to see which kinds of data the frame contains.
df.dtypes.unique()

array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

In [8]:
# Shape of the integer-typed subset; the second entry is the number of integer columns.
df.select_dtypes(include='int').shape

(1460, 35)

In [9]:
# Shape of the object-typed subset; the second entry is the number of object columns.
df.select_dtypes(include='object').shape

(1460, 43)

In [10]:
# Shape of the float64-typed subset; the second entry is the number of float64 columns.
df.select_dtypes(include='float64').shape

(1460, 3)

In [11]:
# Peek at the first rows of the object-dtype columns only.
df.select_dtypes(include='object').head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


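Build the feature matrix from the numeric columns that describe each house's size (lot, basement, floor, garage, porch and pool areas) and its room counts (bathrooms, bedrooms, kitchens and total rooms).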

In [12]:
# Numeric feature columns used by the model. The order matters: any hand-built
# input row passed to the fitted model later must list its values in this order.
X = df[
    [
        # lot / veneer / basement area columns
        'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
        # floor and living area columns
        '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
        # garage / deck / porch / pool area columns
        'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
        # bathroom / bedroom / kitchen / room count columns
        'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    ]
]

In [13]:
# Count missing values per selected feature before fitting.
X.isna().sum()

LotArea          0
MasVnrArea       8
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
dtype: int64

In [14]:
# Replace missing feature values with 0 (the count printed by the previous cell shows
# MasVnrArea is the only selected column with any), then confirm no NaNs remain.
X = X.fillna(0)
X.isna().sum()

LotArea          0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
dtype: int64

# Splitting the dataset

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Linear Regression

In [16]:
from sklearn.linear_model import LinearRegression
# Linear regression with default settings, fitted on the training split only.
reg = LinearRegression()
reg.fit(X_train, y_train)

In [17]:
# Score on the data the model was fitted on (R² for scikit-learn regressors), so it is
# optimistic; the validation metrics computed further down are the ones to trust.
reg.score(X_train, y_train)

0.7907042708833473

In [18]:
# Fitted constant (intercept) term of the linear model.
reg.intercept_

30299.629941625928

In [19]:
# Predicted prices for the held-out validation rows.
y_pred = reg.predict(X_valid)

In [20]:
# NOTE: `.size` is the total number of cells (rows x columns), not the number of rows;
# use `X_train.shape` to see each dimension separately.
X_train.size

26864

In [21]:
# Show predictions next to the true prices: column 0 is predicted, column 1 is actual.
np.set_printoptions(precision=2)  # two decimals keep the printed table compact
pred_vs_actual = np.column_stack((y_pred, y_valid))
print(pred_vs_actual)

[[254738.3  200624.  ]
 [130472.94 133000.  ]
 [107295.02 110000.  ]
 [230832.55 192000.  ]
 [108230.64  88000.  ]
 [107523.53  85000.  ]
 [231846.91 282922.  ]
 [142131.82 141000.  ]
 [512135.28 745000.  ]
 [133205.66 148800.  ]
 [183020.67 208900.  ]
 [156177.86 136905.  ]
 [246435.88 225000.  ]
 [118727.5  123000.  ]
 [137195.85 119200.  ]
 [151111.27 145000.  ]
 [224987.78 190000.  ]
 [ 93223.73 123600.  ]
 [127404.19 149350.  ]
 [142060.61 155000.  ]
 [163215.6  166000.  ]
 [143260.5  144500.  ]
 [120006.2  110000.  ]
 [147805.66 174000.  ]
 [208701.92 185000.  ]
 [162761.87 168000.  ]
 [165967.79 177500.  ]
 [ 67752.35  84500.  ]
 [286662.21 320000.  ]
 [138064.14 118500.  ]
 [192362.92 110000.  ]
 [195329.07 213000.  ]
 [139228.01 156000.  ]
 [281277.33 250000.  ]
 [295407.74 372500.  ]
 [170187.36 175000.  ]
 [251352.01 277500.  ]
 [106082.91 112500.  ]
 [213292.99 263000.  ]
 [330260.83 325000.  ]
 [217601.53 243000.  ]
 [124025.42 130000.  ]
 [163029.5  164990.  ]
 [279002.25

Mean Squared Error and R-squared on the validation set

In [22]:
from sklearn.metrics import mean_squared_error, r2_score

# Evaluate on the held-out split only, so the numbers reflect unseen data.
mse = mean_squared_error(y_true=y_valid, y_pred=y_pred)
r_squared = r2_score(y_true=y_valid, y_pred=y_pred)

# One print call, one metric per line.
print(
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"R-squared (R2): {r_squared:.2f}",
    sep="\n",
)

Mean Squared Error (MSE): 3261444666.53
R-squared (R2): 0.53


In [23]:
# Inspect a single training example; `iloc` selects by position (the 18th row of the
# split), not by the original index label, which appears as `Name` in the output.
X_train.iloc[17]

LotArea          9947.0
MasVnrArea          0.0
BsmtFinSF1        611.0
BsmtFinSF2          0.0
BsmtUnfSF         577.0
TotalBsmtSF      1188.0
1stFlrSF         1217.0
2ndFlrSF            0.0
LowQualFinSF        0.0
GrLivArea        1217.0
GarageArea        497.0
WoodDeckSF        168.0
OpenPorchSF        27.0
EnclosedPorch       0.0
3SsnPorch           0.0
ScreenPorch         0.0
PoolArea            0.0
BsmtFullBath        1.0
BsmtHalfBath        0.0
FullBath            2.0
BedroomAbvGr        3.0
KitchenAbvGr        1.0
TotRmsAbvGrd        6.0
Name: 186, dtype: float64

In [24]:
# Feature values for one house, listed in the same column order that `reg` was fitted on
# (they match the training row displayed by `X_train.iloc[17]`).
#
# `reg` was fitted on a DataFrame, so the row is wrapped in a DataFrame that carries the
# training column names. A bare ndarray drops the names: scikit-learn then warns that
# "X does not have valid feature names", and a value typed in the wrong position would
# go unnoticed.
new_data = pd.DataFrame(
    [[9947.0, 0.0, 611.0, 0.0, 577.0, 1188.0, 1217.0, 0.0, 0.0, 1217.0, 497.0, 168.0,
      27.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 2.0, 3.0, 1.0, 6.0]],
    columns=X_train.columns,
)
predicted_price = reg.predict(new_data)  # array with one prediction per input row
print(f"Predicted Price for new data: ${predicted_price[0]:,.2f}")

Predicted Price for new data: $178,129.33


