# Modeling - Model 3

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot  as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer

## Load clean training and test data

In [2]:
train = pd.read_csv('../data/cleaned_data/train_clean.csv')
test = pd.read_csv('../data/cleaned_data/test_clean.csv')

## Predictors and target Variables

In [3]:
predictors = [
    '1st Flr SF',
    '2nd Flr SF',
    'Bedroom AbvGr',
    'Bldg Type',
    'Bsmt Cond',
    'Bsmt Exposure',
    'Bsmt Full Bath',
    'Bsmt Qual',
    'Bsmt Unf SF',
    'BsmtFin SF 1',
    'BsmtFin SF 2',
    'BsmtFin Type 1',
    'BsmtFin Type 2',
    'Central Air',
    'Electrical',
    'Enclosed Porch',
    'Exter Qual',
    'Exter Cond',
    'Fence',
    'Fireplace Qu',
    'Fireplaces',
    'Foundation',
    'Full Bath',
    'Garage Area',
    'Garage Cars',
    'Garage Cond',
    'Garage Finish',
    'Garage Qual',
    'Garage Type',
    'Garage Yr Blt',
    'Gr Liv Area',
    'Half Bath',
    'Heating QC',
    'House Style',
    'Kitchen Qual',
    'Land Contour',
    'Lot Area',
    'Lot Frontage',
    'Lot Shape',
    'MS SubClass',
    'MS Zoning',
    'Mas Vnr Area',
    'Mas Vnr Type',
    'Neighborhood',
    'Open Porch SF',
    'Overall Qual',
    'PID',
    'Paved Drive',
    'Screen Porch',
    'Street',
    'TotRms AbvGrd',
    'Total Bsmt SF',
    'Wood Deck SF',
    'Year Built',
    'Year Remod/Add',
]

In [4]:
X = train[predictors]
y = train['SalePrice']

## Split data into train and test sets

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 21)

In [6]:
X_train.shape, y_test.shape

((1640, 55), (411,))

## Column Transformer

In [7]:
categorical_cols = train[predictors].select_dtypes(exclude=['number']).columns.tolist()

In [8]:
categorical_cols

['Bldg Type',
 'Bsmt Cond',
 'Bsmt Exposure',
 'Bsmt Qual',
 'BsmtFin Type 1',
 'BsmtFin Type 2',
 'Central Air',
 'Electrical',
 'Exter Qual',
 'Exter Cond',
 'Fence',
 'Fireplace Qu',
 'Foundation',
 'Garage Cond',
 'Garage Finish',
 'Garage Qual',
 'Garage Type',
 'Heating QC',
 'House Style',
 'Kitchen Qual',
 'Land Contour',
 'Lot Shape',
 'MS Zoning',
 'Mas Vnr Type',
 'Neighborhood',
 'Paved Drive',
 'Street']

In [9]:
OHE = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore')

In [10]:
ct = make_column_transformer(
    (OHE, categorical_cols),
    remainder='passthrough'
)

## Transform training and testing data

In [11]:
X_train_ct = ct.fit_transform(X_train)



In [13]:
X_test_ct = ct.transform(X_test)



## Initialize and train Linear Regression Model

In [14]:
model = LinearRegression()
model.fit(X_train_ct, y_train)

## Evaluate Model

In [15]:
# scoring model on training set
model.score(X_train_ct, y_train)

0.9093321778070841

In [16]:
#scoring model on testing set
model.score(X_test_ct, y_test)

0.8404030525128556

In [17]:
np.mean(cross_val_score(model, X_test_ct, y_test, cv=5))

0.6511856290008295

In [18]:
# setting baseline
y_pred_baseline = [np.mean(y_train)] * len(y_test)

In [19]:
# scoring mean baseline on testing set
r2_score(y_test, y_pred_baseline)

-0.014887741251467856

## Transform and Make Predictions on Unseen Testing Set

In [20]:
test_features = test[predictors]
test_features_ct = ct.transform(test_features)



In [21]:
predictions = model.predict(test_features_ct)

In [22]:
predictions_df = pd.DataFrame({'Id': test['Id'], 'SalePrice': predictions})

In [23]:
predictions_df

Unnamed: 0,Id,SalePrice
0,2658,115083.046145
1,2718,159952.469697
2,2414,208830.830829
3,1989,112455.074630
4,625,187896.701558
...,...,...
873,1662,179010.373387
874,1234,204017.888083
875,1373,132692.589501
876,1672,120495.852895


In [24]:
predictions_df.to_csv('../data/submission/model_3.csv', index=False)