In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn import metrics
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

pd.options.display.float_format = '{:.4f}'.format
import statsmodels.api as sm

In [41]:
clean_data = pd.read_csv('../datasets/clean_df.csv')

In [42]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 82 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       2051 non-null   int64  
 1   Id               2051 non-null   int64  
 2   PID              2051 non-null   int64  
 3   MS SubClass      2051 non-null   int64  
 4   MS Zoning        2051 non-null   object 
 5   Lot Frontage     1721 non-null   float64
 6   Lot Area         2051 non-null   int64  
 7   Street           2051 non-null   object 
 8   Alley            140 non-null    object 
 9   Lot Shape        2051 non-null   object 
 10  Land Contour     2051 non-null   object 
 11  Utilities        2051 non-null   object 
 12  Lot Config       2051 non-null   object 
 13  Land Slope       2051 non-null   object 
 14  Neighborhood     2051 non-null   object 
 15  Condition 1      2051 non-null   object 
 16  Condition 2      2051 non-null   object 
 17  Bldg Type     

In [43]:
Columns_to_include = ['SalePrice', 
                      'Overall Qual', 
                      'Gr Liv Area', 
                      'Garage Area', 
                      'Garage Cars', 
                      'Total Bsmt SF', 
                      '1st Flr SF', 
                      'Year Built', 
                      'Full Bath', 
                      'Fireplaces', 
                      'MS SubClass', 
                      'Neighborhood', 
                      'Condition 1', 
                      'Exter Qual', 
                      'Bsmt Exposure', 
                      'Kitchen Qual', 
                      'Garage Qual',
                      'Exterior 1st',
                      'Exter Cond',
                      'Bsmt Qual',
                      'Bsmt Cond',
                      'BsmtFin Type 1',
                      'Functional',
                      'Fireplace Qu',
                      'Paved Drive',
                      'Sale Type',
                      'Garage Cond']

In [44]:
clean_columns = clean_data[Columns_to_include]

In [45]:
# Create a list of string-type column names
string_columns = clean_data.select_dtypes(include=['object']).columns

column_to_exclude = 'Mas Vnr Type'

# Use a for loop to input NA in each column in dataframe that is in list except Mas Vnr Type
for column in string_columns:
    if column != column_to_exclude:
        clean_data[column].fillna('NA', inplace=True)

In [46]:
X1 = clean_data[Columns_to_include].drop(columns='SalePrice')
y = clean_data['SalePrice']

In [47]:
X1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Overall Qual    2051 non-null   int64  
 1   Gr Liv Area     2051 non-null   int64  
 2   Garage Area     2050 non-null   float64
 3   Garage Cars     2050 non-null   float64
 4   Total Bsmt SF   2051 non-null   float64
 5   1st Flr SF      2051 non-null   int64  
 6   Year Built      2051 non-null   int64  
 7   Full Bath       2051 non-null   int64  
 8   Fireplaces      2051 non-null   int64  
 9   MS SubClass     2051 non-null   int64  
 10  Neighborhood    2051 non-null   object 
 11  Condition 1     2051 non-null   object 
 12  Exter Qual      2051 non-null   object 
 13  Bsmt Exposure   2051 non-null   object 
 14  Kitchen Qual    2051 non-null   object 
 15  Garage Qual     2051 non-null   object 
 16  Exterior 1st    2051 non-null   object 
 17  Exter Cond      2051 non-null   o

In [48]:
X1['Garage Qual'].value_counts()

TA    1832
NA     114
Fa      82
Gd      18
Ex       3
Po       2
Name: Garage Qual, dtype: int64

In [49]:
y.shape

(2051,)

In [50]:
X1_train, X1_val, y_train, y_val = train_test_split(X1, y, test_size= 0.2, random_state=24)

In [51]:
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import Pipeline

In [53]:
ct1 = make_column_transformer(
    (OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first'), make_column_selector(dtype_include=object)),
    remainder='passthrough'
)

ct2 = make_column_transformer(
    (SimpleImputer(strategy='median'), make_column_selector(dtype_exclude=object)),
    remainder='passthrough'
)

pipe = Pipeline([
    ('ct1', ct1),
    ('ct2', ct2),
    ('sc', StandardScaler()), 
    ('poly', PolynomialFeatures(include_bias=False)),
    ('lr', LinearRegression())
])

In [None]:
X1_train_transformed = ct.fit_transform(X1_train)
X1_val_transformed = ct.transform(X1_val)

In [None]:
lr=LinearRegression()

In [None]:
lr.fit(X1_train_transformed, y_train)

In [None]:
r2_train = lr.score(X1_train_transformed, y_train)
r2_val = lr.score(X1_val_transformed, y_val)
print(r2_train)
print(r2_val)

In [None]:
y_preds_train = lr.predict(X1_train_transformed)
metrics.mean_squared_error(y_train, y_preds_train)

In [None]:
y_preds_val = lr.predict(X1_val_transformed)
metrics.mean_squared_error(y_val, y_preds_val)

In [None]:
lr.coef_

*Credit to Susan for add_constant to simplify creating an intercept process*

In [None]:
ols = sm.OLS(y_train, sm.add_constant(X1_train_transformed)).fit()
summary_table = ols.summary()
summary_table

In [None]:
ct.get_feature_names_out()

In [57]:
X_transformed = pipe.fit(X1, y)

ValueError: make_column_selector can only be applied to pandas dataframes

In [None]:
test = pd.read_csv('../datasets/test.csv')

In [None]:
test.info()

In [None]:
#For the columns I am using, the solution for missing values was to fill na with 
#test.fillna('NA', inplace=True)

In [None]:
test_columns = ['Overall Qual', 
                'Gr Liv Area', 
                'Garage Area', 
                'Garage Cars', 
                'Total Bsmt SF', 
                '1st Flr SF', 
                'Year Built', 
                'Full Bath', 
                'Fireplaces', 
                'MS SubClass', 
                'Neighborhood', 
                'Condition 1', 
                'Exter Qual', 
                'Bsmt Exposure', 
                'Kitchen Qual', 
                'Garage Qual',
                'Exterior 1st',
                'Exter Cond',
                'Bsmt Qual',
                'Bsmt Cond',
                'BsmtFin Type 1',
                'Functional',
                'Fireplace Qu',
                'Paved Drive',
                'Sale Type',
                'Garage Cond']
X_test = test[test_columns]

In [None]:
X_test.info()

In [None]:
X = ct.fit_transform(X1)

In [None]:
lrfinal=LinearRegression()
lrfinal.fit(

In [None]:
#For Garage Qual & Bsmt Exposure, I can input NA for missing values as I did for training data

In [None]:
X_test_transformed = ct.transform(X_test)

In [None]:
saleprice = lr.predict(X_test_transformed)

In [None]:
saleprice.shape

In [None]:
data = {'Id':test['Id'], 'SalePrice': saleprice}
test_sub = pd.DataFrame(data)

In [None]:
test_sub.shape

In [None]:
test_sub.to_csv('datasets/submission3.csv', index=False)