In [58]:
import os
if 'notebooks' in os.getcwd():
    os.chdir('..')
    
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, roc_auc_score,\
    mean_squared_error

## Visualizing data and columns

In [5]:
os.getcwd()

'd:\\Desafios Programação\\house-prices'

In [6]:
train_df = pd.read_csv("data/train.csv")
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [11]:
target_df = train_df[["Id", "SalePrice"]]

In [22]:
numerical_columns = train_df.dtypes == ('int64' or 'float64')

In [28]:
numerical_df = train_df[train_df.columns[numerical_columns]]
numerical_df.head()

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,8450,7,5,2003,2003,706,0,150,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,9600,6,8,1976,1976,978,0,284,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,11250,7,5,2001,2002,486,0,434,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,9550,7,5,1915,1970,216,0,540,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,14260,8,5,2000,2000,655,0,490,...,192,84,0,0,0,0,0,12,2008,250000


In [33]:
target_correlation = abs(numerical_df.corrwith(target_df["SalePrice"]))\
    .sort_values()

In [63]:
target_correlation.tail(11)

YearRemodAdd    0.507101
YearBuilt       0.522897
TotRmsAbvGrd    0.533723
FullBath        0.560664
1stFlrSF        0.605852
TotalBsmtSF     0.613581
GarageArea      0.623431
GarageCars      0.640409
GrLivArea       0.708624
OverallQual     0.790982
SalePrice       1.000000
dtype: float64

In [64]:
most_correlated_columns = target_correlation\
    .tail(11)\
    .index[:-1]

In [65]:
X = train_df[most_correlated_columns]

In [66]:
X

Unnamed: 0,YearRemodAdd,YearBuilt,TotRmsAbvGrd,FullBath,1stFlrSF,TotalBsmtSF,GarageArea,GarageCars,GrLivArea,OverallQual
0,2003,2003,8,2,856,856,548,2,1710,7
1,1976,1976,6,2,1262,1262,460,2,1262,6
2,2002,2001,6,2,920,920,608,2,1786,7
3,1970,1915,7,1,961,756,642,3,1717,7
4,2000,2000,9,2,1145,1145,836,3,2198,8
...,...,...,...,...,...,...,...,...,...,...
1455,2000,1999,7,2,953,953,460,2,1647,6
1456,1988,1978,7,2,2073,1542,500,2,2073,6
1457,2006,1941,9,2,1188,1152,252,1,2340,7
1458,1996,1950,5,1,1078,1078,240,1,1078,5


In [67]:
y = target_df["SalePrice"]

In [68]:
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

## Using Linear Models

In [69]:
model = LinearRegression()

In [70]:
model.fit(X,y)

LinearRegression()

In [71]:
model.coef_

array([ 2.96481161e+02,  2.68240707e+02,  3.31050771e+01, -6.79087146e+03,
        1.41737355e+01,  1.98650991e+01,  1.49475334e+01,  1.04179010e+04,
        5.12971178e+01,  1.96045898e+04])

In [72]:
model.predict(X)

array([214741.72468546, 169346.70949012, 220816.47032609, ...,
       227093.91227879, 126047.07735367, 136640.78777021])

In [73]:
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [74]:
y_pred = model.predict(X)

In [75]:
r2_score(y, y_pred)

0.7736928402773303

In [76]:
np.sqrt(mean_squared_error(y,y_pred))

37779.245780213154

In [83]:
test_df = pd.read_csv("data/test.csv")

In [84]:
test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [85]:
test_df.shape

(1459, 80)

In [86]:
train_df.shape

(1460, 81)

In [87]:
test_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [88]:
most_correlated_columns

Index(['YearRemodAdd', 'YearBuilt', 'TotRmsAbvGrd', 'FullBath', '1stFlrSF',
       'TotalBsmtSF', 'GarageArea', 'GarageCars', 'GrLivArea', 'OverallQual'],
      dtype='object')

In [89]:
X_test = test_df[most_correlated_columns]

In [98]:
X_test_replace = X_test.replace(np.nan, 0)

In [100]:
y_pred_test = model.predict(X_test_replace)

In [104]:
submission = pd.Series(
    data = y_pred_test,
    index = test_df["Id"],
    name = "SalePrice"
)

In [105]:
submission.to_csv("data/submission.csv")