In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn import metrics
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

**For this model I analayzed the heatmap/ correlations ofnumeric columns with sale price and the catterplots/ bargraphs of the categorical columns and sale price. I will start witha model with just these variables and then use stepwise regression to add and drop variables**

In [2]:
clean_data = pd.read_csv('datasets/clean_df.csv')

In [3]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 82 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       2051 non-null   int64  
 1   Id               2051 non-null   int64  
 2   PID              2051 non-null   int64  
 3   MS SubClass      2051 non-null   int64  
 4   MS Zoning        2051 non-null   object 
 5   Lot Frontage     1721 non-null   float64
 6   Lot Area         2051 non-null   int64  
 7   Street           2051 non-null   object 
 8   Alley            140 non-null    object 
 9   Lot Shape        2051 non-null   object 
 10  Land Contour     2051 non-null   object 
 11  Utilities        2051 non-null   object 
 12  Lot Config       2051 non-null   object 
 13  Land Slope       2051 non-null   object 
 14  Neighborhood     2051 non-null   object 
 15  Condition 1      2051 non-null   object 
 16  Condition 2      2051 non-null   object 
 17  Bldg Type     

In [4]:
Columns_to_include = ['SalePrice', 
                      'Overall Qual', 
                      'Gr Liv Area', 
                      'Garage Area', 
                      'Garage Cars', 
                      'Total Bsmt SF', 
                      '1st Flr SF', 
                      'Year Built', 
                      'Full Bath', 
                      'Fireplaces', 
                      'MS SubClass', 
                      'Neighborhood', 
                      'Condition 1', 
                      'Exter Qual', 
                      'Bsmt Exposure', 
                      'Kitchen Qual', 
                      'Garage Qual']

In [5]:
clean_columns = clean_data[Columns_to_include]

In [6]:
# Create a list of string-type column names
string_columns = clean_data.select_dtypes(include=['object']).columns

column_to_exclude = 'Mas Vnr Type'

# Use a for loop to input NA in each column in dataframe that is in list except Mas Vnr Type
for column in string_columns:
    if column != column_to_exclude:
        clean_data[column].fillna('NA', inplace=True)

In [7]:
X1 = clean_data[Columns_to_include].drop(columns='SalePrice')
y = clean_data['SalePrice']

In [8]:
X1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Overall Qual   2051 non-null   int64  
 1   Gr Liv Area    2051 non-null   int64  
 2   Garage Area    2050 non-null   float64
 3   Garage Cars    2050 non-null   float64
 4   Total Bsmt SF  2051 non-null   float64
 5   1st Flr SF     2051 non-null   int64  
 6   Year Built     2051 non-null   int64  
 7   Full Bath      2051 non-null   int64  
 8   Fireplaces     2051 non-null   int64  
 9   MS SubClass    2051 non-null   int64  
 10  Neighborhood   2051 non-null   object 
 11  Condition 1    2051 non-null   object 
 12  Exter Qual     2051 non-null   object 
 13  Bsmt Exposure  2051 non-null   object 
 14  Kitchen Qual   2051 non-null   object 
 15  Garage Qual    2051 non-null   object 
dtypes: float64(3), int64(7), object(6)
memory usage: 256.5+ KB


In [9]:
X1['Garage Qual'].value_counts()

TA    1832
NA     114
Fa      82
Gd      18
Ex       3
Po       2
Name: Garage Qual, dtype: int64

In [10]:
y.shape

(2051,)

In [11]:
X1_train, X1_val, y_train, y_val = train_test_split(X1, y, test_size= 0.2, random_state=24)

In [12]:
X1_str = list(X1.select_dtypes(include=['object']).columns)

In [13]:
#I can either add imputer to the column transformer
#or if I don't add polynomical features back in, I can just remove ct & run each separate
#
ct = ColumnTransformer([('oh', OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first'), X1_str)
], remainder='passthrough')  

# Fit and transform the training data using the ColumnTransformer
X1_train_transformed = ct.fit_transform(X1_train)

# Transform the test data using the fitted ColumnTransformer
X1_val_transformed = ct.transform(X1_val)


#Double-check this... why would I need an imputer? I shouldn't have any numeric columns with missings
imputer = SimpleImputer(strategy='mean')
X1_train_transformed = imputer.fit_transform(X1_train_transformed)
X1_val_transofrmed = imputer.transform(X1_val_transformed)


In [14]:
lr=LinearRegression()

In [15]:
lr.fit(X1_train_transformed, y_train)

In [16]:
r2_train = lr.score(X1_train_transformed, y_train)
r2_val = lr.score(X1_val_transformed, y_val)
print(r2_train)
print(r2_val)

0.8696229428912882
0.8996500011620334


In [17]:
y_preds_train = lr.predict(X1_train_transformed)
metrics.mean_squared_error(y_train, y_preds_train)

829055137.357763

In [18]:
y_preds_val = lr.predict(X1_val_transformed)
metrics.mean_squared_error(y_val, y_preds_val)

598023852.2266089

In [19]:
test = pd.read_csv('datasets/test.csv')

In [20]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               878 non-null    int64  
 1   PID              878 non-null    int64  
 2   MS SubClass      878 non-null    int64  
 3   MS Zoning        878 non-null    object 
 4   Lot Frontage     718 non-null    float64
 5   Lot Area         878 non-null    int64  
 6   Street           878 non-null    object 
 7   Alley            58 non-null     object 
 8   Lot Shape        878 non-null    object 
 9   Land Contour     878 non-null    object 
 10  Utilities        878 non-null    object 
 11  Lot Config       878 non-null    object 
 12  Land Slope       878 non-null    object 
 13  Neighborhood     878 non-null    object 
 14  Condition 1      878 non-null    object 
 15  Condition 2      878 non-null    object 
 16  Bldg Type        878 non-null    object 
 17  House Style     

In [21]:
#For the columns I am using, the solution for missing values was to fill na with 
test.fillna('NA', inplace=True)

In [22]:
test_columns = ['Overall Qual', 'Gr Liv Area', 'Garage Area', 'Garage Cars', 'Total Bsmt SF', '1st Flr SF', 'Year Built', 'Full Bath', 'Fireplaces', 'MS SubClass', 'Neighborhood', 'Exter Qual',  'Condition 1', 'Bsmt Exposure', 'Kitchen Qual', 'Garage Qual']
X_test = test[test_columns]

In [23]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Overall Qual   878 non-null    int64 
 1   Gr Liv Area    878 non-null    int64 
 2   Garage Area    878 non-null    int64 
 3   Garage Cars    878 non-null    int64 
 4   Total Bsmt SF  878 non-null    int64 
 5   1st Flr SF     878 non-null    int64 
 6   Year Built     878 non-null    int64 
 7   Full Bath      878 non-null    int64 
 8   Fireplaces     878 non-null    int64 
 9   MS SubClass    878 non-null    int64 
 10  Neighborhood   878 non-null    object
 11  Exter Qual     878 non-null    object
 12  Condition 1    878 non-null    object
 13  Bsmt Exposure  878 non-null    object
 14  Kitchen Qual   878 non-null    object
 15  Garage Qual    878 non-null    object
dtypes: int64(10), object(6)
memory usage: 109.9+ KB


In [24]:
X['Kitchen Qual'].value_counts()

NameError: name 'X' is not defined

In [None]:
#For Garage Qual & Bsmt Exposure, I can input NA for missing values as I did for training data

In [None]:
X_test_transformed = ct.transform(X_test)

In [None]:
X['Condition 1'].value_counts()

In [None]:
saleprice = lr.predict(X_test_transformed)

In [None]:
saleprice.shape

In [None]:
data = {'Id':test['Id'], 'SalePrice': saleprice}
test_sub = pd.DataFrame(data)

In [None]:
test_sub.shape

In [None]:
test_sub.to_csv('datasets/submission1.csv', index=False)