# Modeling - Model 2

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot  as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

## Load clean training and test data

In [2]:
train = pd.read_csv('../data/cleaned_data/train_clean.csv')
test = pd.read_csv('../data/cleaned_data/test_clean.csv')

## Predictors and target Variables

In [3]:
predictors = [
    'Overall Qual',
    'Year Built',
    'Year Remod/Add',
    'Mas Vnr Area',
    'Total Bsmt SF',
    '1st Flr SF',
    'Gr Liv Area',
    'Full Bath',
    'TotRms AbvGrd',
    'Garage Cars',
    'Garage Area',
    'Lot Area',
    'Fireplaces',
    'Wood Deck SF',
    'Open Porch SF',
    'Foundation',
    'Exter Qual',
    'Mas Vnr Type',
    'Kitchen Qual',
    'Bsmt Qual',
    'Garage Type',
    'Fireplace Qu'
]

In [4]:
X = train[predictors]
y = train['SalePrice']

## Split data into train and test sets

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 21)

In [6]:
X_train.shape, y_test.shape

((1640, 22), (411,))

## Column Transformer

In [7]:
categorical_cols = ['Foundation', 'Exter Qual', 'Mas Vnr Type', 'Kitchen Qual', 'Bsmt Qual', 'Garage Type', 'Fireplace Qu']

In [8]:
numerical_cols = ['Overall Qual', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area', 'Total Bsmt SF', '1st Flr SF', 'Gr Liv Area', 'Full Bath', 'TotRms AbvGrd', 'Garage Cars', 'Garage Area', 'Lot Area', 'Fireplaces', 'Wood Deck SF', 'Open Porch SF']

In [9]:
ct = ColumnTransformer([
    ('sc', StandardScaler(), numerical_cols),
    ('oh', OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore'), categorical_cols)
 ], remainder='passthrough')

# special thanks to Kaity Ellsweig and Qingxin Wei for assistance wite OneHotEncoder and ColumnTransformer

## Transform training and testing data

In [10]:
X_train_ct = ct.fit_transform(X_train)



In [11]:
X_test_ct = ct.transform(X_test)

## Initialize and train Linear Regression Model

In [12]:
model = LinearRegression()
model.fit(X_train_ct, y_train)

## Evaluate Model

In [13]:
# scoring model on training set
model.score(X_train_ct, y_train)

0.8604057444446076

In [14]:
#scoring model on testing set
model.score(X_test_ct, y_test)

0.7825556959280188

In [15]:
np.mean(cross_val_score(model, X_test_ct, y_test, cv=5))

0.7287663419728482

In [16]:
# setting baseline
y_pred_baseline = [np.mean(y_train)] * len(y_test)

In [17]:
# scoring mean baseline on testing set
r2_score(y_test, y_pred_baseline)

-0.014887741251467856

## Transform and Make Predictions on Unseen Testing Set

In [18]:
test_features = test[predictors]
test_features_ct = ct.transform(test_features)



In [19]:
predictions = model.predict(test_features_ct)

In [20]:
predictions_df = pd.DataFrame({'Id': test['Id'], 'SalePrice': predictions})

In [21]:
predictions_df

Unnamed: 0,Id,SalePrice
0,2658,151955.439285
1,2718,192236.548488
2,2414,205479.821330
3,1989,128695.176568
4,625,181204.676557
...,...,...
873,1662,180152.769880
874,1234,218791.931195
875,1373,132528.465931
876,1672,107776.465387


In [22]:
predictions_df.to_csv('../data/submission/model_2.csv', index=False)