In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn import metrics
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

pd.options.display.float_format = '{:.4f}'.format
import statsmodels.api as sm

**For this model I used the same columns from Model 2 and 

In [113]:
clean_data = pd.read_csv('../datasets/clean_df.csv')

In [114]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 82 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       2051 non-null   int64  
 1   Id               2051 non-null   int64  
 2   PID              2051 non-null   int64  
 3   MS SubClass      2051 non-null   int64  
 4   MS Zoning        2051 non-null   object 
 5   Lot Frontage     1721 non-null   float64
 6   Lot Area         2051 non-null   int64  
 7   Street           2051 non-null   object 
 8   Alley            140 non-null    object 
 9   Lot Shape        2051 non-null   object 
 10  Land Contour     2051 non-null   object 
 11  Utilities        2051 non-null   object 
 12  Lot Config       2051 non-null   object 
 13  Land Slope       2051 non-null   object 
 14  Neighborhood     2051 non-null   object 
 15  Condition 1      2051 non-null   object 
 16  Condition 2      2051 non-null   object 
 17  Bldg Type     

In [115]:
Columns_to_include = ['SalePrice', 
                      'Overall Qual', 
                      'Gr Liv Area', 
                      'Garage Area', 
                      'Garage Cars', 
                      'Total Bsmt SF', 
                      '1st Flr SF', 
                      'Year Built', 
                      'Full Bath', 
                      'Fireplaces', 
                      'MS SubClass', 
                      'Neighborhood', 
                      'Condition 1', 
                      'Exter Qual', 
                      'Bsmt Exposure', 
                      'Kitchen Qual', 
                      'Garage Qual',
                      'Exterior 1st',
                      'Exter Cond',
                      'Bsmt Qual',
                      'Bsmt Cond',
                      'BsmtFin Type 1',
                      'Functional',
                      'Fireplace Qu',
                      'Paved Drive',
                      'Sale Type',
                      'Garage Cond']

In [116]:
clean_columns = clean_data[Columns_to_include]

In [117]:
# Create a list of string-type column names
string_columns = clean_data.select_dtypes(include=['object']).columns

column_to_exclude = 'Mas Vnr Type'

# Use a for loop to input NA in each column in dataframe that is in list except Mas Vnr Type
for column in string_columns:
    if column != column_to_exclude:
        clean_data[column].fillna('NA', inplace=True)

In [118]:
X1 = clean_data[Columns_to_include].drop(columns='SalePrice')
y = clean_data['SalePrice']

In [119]:
X1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Overall Qual    2051 non-null   int64  
 1   Gr Liv Area     2051 non-null   int64  
 2   Garage Area     2050 non-null   float64
 3   Garage Cars     2050 non-null   float64
 4   Total Bsmt SF   2051 non-null   float64
 5   1st Flr SF      2051 non-null   int64  
 6   Year Built      2051 non-null   int64  
 7   Full Bath       2051 non-null   int64  
 8   Fireplaces      2051 non-null   int64  
 9   MS SubClass     2051 non-null   int64  
 10  Neighborhood    2051 non-null   object 
 11  Condition 1     2051 non-null   object 
 12  Exter Qual      2051 non-null   object 
 13  Bsmt Exposure   2051 non-null   object 
 14  Kitchen Qual    2051 non-null   object 
 15  Garage Qual     2051 non-null   object 
 16  Exterior 1st    2051 non-null   object 
 17  Exter Cond      2051 non-null   o

In [120]:
X1['Garage Qual'].value_counts()

TA    1832
NA     114
Fa      82
Gd      18
Ex       3
Po       2
Name: Garage Qual, dtype: int64

In [121]:
y.shape

(2051,)

In [122]:
X1_train, X1_val, y_train, y_val = train_test_split(X1, y, test_size= 0.2, random_state=24)

In [123]:
X1_str = list(X1.select_dtypes(include=['object']).columns)

In [124]:
#I can either add imputer to the column transformer
#or if I don't add polynomical features back in, I can just remove ct & run each separate
#
ct = ColumnTransformer([('oh', OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first'), X1_str)
], remainder='passthrough')  

# Fit and transform the training data using the ColumnTransformer
X1_train_transformed = ct.fit_transform(X1_train)

# Transform the test data using the fitted ColumnTransformer
X1_val_transformed = ct.transform(X1_val)


#Double-check this... why would I need an imputer? I shouldn't have any numeric columns with missings
imputer = SimpleImputer(strategy='median')
X1_train_transformed = imputer.fit_transform(X1_train_transformed)
X1_val_transofrmed = imputer.transform(X1_val_transformed)


In [125]:
lr=LinearRegression()

In [126]:
lr.fit(X1_train_transformed, y_train)

In [127]:
r2_train = lr.score(X1_train_transformed, y_train)
r2_val = lr.score(X1_val_transformed, y_val)
print(r2_train)
print(r2_val)

0.8874442837871672
0.9053777402809525


In [128]:
y_preds_train = lr.predict(X1_train_transformed)
metrics.mean_squared_error(y_train, y_preds_train)

715730948.6393995

In [129]:
y_preds_val = lr.predict(X1_val_transformed)
metrics.mean_squared_error(y_val, y_preds_val)

563890073.929552

In [130]:
lr.coef_

array([-6.64358966e+03,  8.30579429e+03, -5.34708609e+03,  1.47075007e+04,
        3.56753474e+03,  1.97997891e+04, -1.24230104e+04,  2.25747635e+03,
        5.14127847e+03,  1.30274641e+05, -8.36608636e+03, -9.04403780e+02,
       -8.06395821e+03,  1.08511190e+02, -4.10072266e+03,  1.20977690e+03,
        1.68864061e+02,  5.27816466e+04,  3.96526259e+04, -1.14681775e+04,
       -2.14939121e+03, -1.17232845e+02,  3.35133857e+03,  1.83210397e+04,
        5.85146047e+04,  9.61640796e+03,  1.30209094e+04,  1.88455383e+03,
        1.18924681e+04,  2.63087807e+04,  2.44593883e+04, -5.80101916e+03,
        1.28081828e+04, -8.82424866e+03,  4.82911992e+03, -3.27870201e+04,
       -2.36219426e+04, -2.70059236e+04,  1.82965959e+04, -7.80991406e+03,
        7.91194436e+03, -1.00019270e+04, -3.62325082e+04, -2.61917663e+04,
       -3.20129704e+04, -1.65826056e+05, -1.19324930e+05,  8.96789447e+03,
       -2.06093876e+05, -1.64299350e+05,  3.41866304e+04, -1.13659228e+04,
        2.41595607e+04,  

*Credit to Susan for add_constant to simplify creating an intercept process*

In [131]:
ols = sm.OLS(y_train, sm.add_constant(X1_train_transformed)).fit()
summary_table = ols.summary()
summary_table

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.887
Model:,OLS,Adj. R-squared:,0.879
Method:,Least Squares,F-statistic:,101.6
Date:,"Thu, 05 Oct 2023",Prob (F-statistic):,0.0
Time:,18:13:06,Log-Likelihood:,-19046.0
No. Observations:,1640,AIC:,38330.0
Df Residuals:,1521,BIC:,38970.0
Df Model:,118,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-9583.0998,1.38e+05,-0.069,0.945,-2.8e+05,2.61e+05
x1,-6643.5897,1.48e+04,-0.448,0.654,-3.57e+04,2.24e+04
x2,8305.7943,1.13e+04,0.738,0.461,-1.38e+04,3.04e+04
x3,-5347.0861,9322.830,-0.574,0.566,-2.36e+04,1.29e+04
x4,1.471e+04,1.1e+04,1.334,0.182,-6915.018,3.63e+04
x5,3567.5347,8022.367,0.445,0.657,-1.22e+04,1.93e+04
x6,1.98e+04,8972.323,2.207,0.027,2200.354,3.74e+04
x7,-1.242e+04,8600.842,-1.444,0.149,-2.93e+04,4447.756
x8,2257.4763,8166.563,0.276,0.782,-1.38e+04,1.83e+04

0,1,2,3
Omnibus:,852.496,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,71537.404
Skew:,-1.534,Prob(JB):,0.0
Kurtosis:,35.21,Cond. No.,1.02e+16


In [132]:
ct.get_feature_names_out()

array(['oh__Neighborhood_Blueste', 'oh__Neighborhood_BrDale',
       'oh__Neighborhood_BrkSide', 'oh__Neighborhood_ClearCr',
       'oh__Neighborhood_CollgCr', 'oh__Neighborhood_Crawfor',
       'oh__Neighborhood_Edwards', 'oh__Neighborhood_Gilbert',
       'oh__Neighborhood_Greens', 'oh__Neighborhood_GrnHill',
       'oh__Neighborhood_IDOTRR', 'oh__Neighborhood_Landmrk',
       'oh__Neighborhood_MeadowV', 'oh__Neighborhood_Mitchel',
       'oh__Neighborhood_NAmes', 'oh__Neighborhood_NPkVill',
       'oh__Neighborhood_NWAmes', 'oh__Neighborhood_NoRidge',
       'oh__Neighborhood_NridgHt', 'oh__Neighborhood_OldTown',
       'oh__Neighborhood_SWISU', 'oh__Neighborhood_Sawyer',
       'oh__Neighborhood_SawyerW', 'oh__Neighborhood_Somerst',
       'oh__Neighborhood_StoneBr', 'oh__Neighborhood_Timber',
       'oh__Neighborhood_Veenker', 'oh__Condition 1_Feedr',
       'oh__Condition 1_Norm', 'oh__Condition 1_PosA',
       'oh__Condition 1_PosN', 'oh__Condition 1_RRAe',
       'oh__Condition

In [133]:
test = pd.read_csv('../datasets/test.csv')

In [134]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               878 non-null    int64  
 1   PID              878 non-null    int64  
 2   MS SubClass      878 non-null    int64  
 3   MS Zoning        878 non-null    object 
 4   Lot Frontage     718 non-null    float64
 5   Lot Area         878 non-null    int64  
 6   Street           878 non-null    object 
 7   Alley            58 non-null     object 
 8   Lot Shape        878 non-null    object 
 9   Land Contour     878 non-null    object 
 10  Utilities        878 non-null    object 
 11  Lot Config       878 non-null    object 
 12  Land Slope       878 non-null    object 
 13  Neighborhood     878 non-null    object 
 14  Condition 1      878 non-null    object 
 15  Condition 2      878 non-null    object 
 16  Bldg Type        878 non-null    object 
 17  House Style     

In [135]:
#For the columns I am using, the solution for missing values was to fill na with 
test.fillna('NA', inplace=True)

In [136]:
test_columns = ['Overall Qual', 
                'Gr Liv Area', 
                'Garage Area', 
                'Garage Cars', 
                'Total Bsmt SF', 
                '1st Flr SF', 
                'Year Built', 
                'Full Bath', 
                'Fireplaces', 
                'MS SubClass', 
                'Neighborhood', 
                'Condition 1', 
                'Exter Qual', 
                'Bsmt Exposure', 
                'Kitchen Qual', 
                'Garage Qual',
                'Exterior 1st',
                'Exter Cond',
                'Bsmt Qual',
                'Bsmt Cond',
                'BsmtFin Type 1',
                'Functional',
                'Fireplace Qu',
                'Paved Drive',
                'Sale Type',
                'Garage Cond']
X_test = test[test_columns]

In [137]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Overall Qual    878 non-null    int64 
 1   Gr Liv Area     878 non-null    int64 
 2   Garage Area     878 non-null    int64 
 3   Garage Cars     878 non-null    int64 
 4   Total Bsmt SF   878 non-null    int64 
 5   1st Flr SF      878 non-null    int64 
 6   Year Built      878 non-null    int64 
 7   Full Bath       878 non-null    int64 
 8   Fireplaces      878 non-null    int64 
 9   MS SubClass     878 non-null    int64 
 10  Neighborhood    878 non-null    object
 11  Condition 1     878 non-null    object
 12  Exter Qual      878 non-null    object
 13  Bsmt Exposure   878 non-null    object
 14  Kitchen Qual    878 non-null    object
 15  Garage Qual     878 non-null    object
 16  Exterior 1st    878 non-null    object
 17  Exter Cond      878 non-null    object
 18  Bsmt Qual 

In [138]:
#For Garage Qual & Bsmt Exposure, I can input NA for missing values as I did for training data

In [139]:

ct_full = ColumnTransformer([('oh', OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first'), X1_str)
], remainder='passthrough')  

imputer_full = SimpleImputer(strategy='median')

In [140]:
X_transformed= ct.fit_transform(X1)
X_transformed= imputer_full.fit_transform(X_transformed)

In [141]:
lrfull= LinearRegression()
lrfull.fit(X_transformed, y)

In [142]:
X_test_transformed = ct.transform(X_test)



In [143]:
saleprice = lrfull.predict(X_test_transformed)

In [144]:
saleprice.shape

(878,)

In [145]:
data = {'Id':test['Id'], 'SalePrice': saleprice}
test_sub = pd.DataFrame(data)

In [146]:
test_sub.shape

(878, 2)

In [147]:
test_sub.to_csv('../datasets/submission4.csv', index=False)