# Housing Prices Analysis and Predictions with Python

**Importing modules**

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder # Machine learning
import seaborn as sns # data visualization
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['test.csv', 'kernel.ipynb', '.ipynb_checkpoints', 'train.csv']


**Importing Data and creating DataFrames**

In [2]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

**Using Head(), Describe(), Info() to collect general information**

In [3]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [5]:
train.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


# Categorical Features

In [6]:
train = pd.get_dummies(train)
test = pd.get_dummies(test)

print('Training Features shape: ', train.shape)
print('Testing Features shape: ', test.shape)

Training Features shape:  (1460, 290)
Testing Features shape:  (1459, 271)


# Replacing NaN Values in the dataset

In [7]:
train = train.apply(lambda x: x.fillna(x.mean()),axis=0)
test = test.apply(lambda x: x.fillna(x.mean()),axis=0)

In [8]:
train.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0


In [9]:
test.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,0,0,0,1,0,0,0,0,1,0
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,0,0,0,1,0,0,0,0,1,0
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,0,0,0,1,0,0,0,0,1,0
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,0,0,0,1,0,0,0,0,1,0
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,0,0,0,1,0,0,0,0,1,0


# Aligning the dataframes

In [10]:
train_labels = train['SalePrice']

# Align the training and testing data, keep only columns present in both dataframes
train, test = train.align(test, join = 'inner', axis = 1)

print('Training Features shape: ', train.shape)
print('Testing Features shape: ', test.shape)

Training Features shape:  (1460, 271)
Testing Features shape:  (1459, 271)


# Machine Learning

In [11]:
X_train = train
Y_train = train_labels
X_test  = test
X_train.shape, Y_train.shape, X_test.shape

((1460, 271), (1460,), (1459, 271))

# GridSearch for Logistic Regression

I used grid search to find best parameters for Logistic Regression.

Below it is commented because otherwise the Kernel will require a lot of time to be launched.

The best parameters found are the following:

{'C': 0.001, 'max_iter': 5000}

In [12]:
param_grid = {'C': [0.001, 0.01, 0.1, 1], 'max_iter' : [5000, 10000, 15000, 20000]}
clf = GridSearchCV(LogisticRegression(penalty='l2'), param_grid)
clf = clf.fit(X_train, Y_train)
Y_pred = regr.predict(X_test)

In [13]:
print(clf.best_params_)

# Logistic Regression (with best params found with GridSearch)

In [14]:
from sklearn.linear_model import LogisticRegression
regr = LogisticRegression(max_iter=5000, C=0.001)
regr.fit(X_train, Y_train)
Y_pred = regr.predict(X_test)

# XGBoost with GridSearch

In [15]:
from xgboost import XGBRegressor

param_grid = {
    'colsample_bytree':[0.6],
    'gamma':[0,1],
    'min_child_weight':[5,10],
    'learning_rate':[0.01, 0.05],
    'max_depth':[1, 2],
    'n_estimators':[10000, 12000, 15000],
    'subsample':[0.6]  
}

xgb = XGBRegressor() 
grid = GridSearchCV(xgb, param_grid, verbose=2)
grid = grid.fit(X_train, Y_train)
Y_pred = grid.predict(X_test)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=1, min_child_weight=5, n_estimators=10000, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=1, min_child_weight=5, n_estimators=10000, subsample=0.6, total=  17.2s
[CV] colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=1, min_child_weight=5, n_estimators=10000, subsample=0.6 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   17.6s remaining:    0.0s


[CV]  colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=1, min_child_weight=5, n_estimators=10000, subsample=0.6, total=  20.4s
[CV] colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=1, min_child_weight=5, n_estimators=10000, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=1, min_child_weight=5, n_estimators=10000, subsample=0.6, total=  18.7s
[CV] colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=1, min_child_weight=5, n_estimators=12000, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=1, min_child_weight=5, n_estimators=12000, subsample=0.6, total=  21.8s
[CV] colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=1, min_child_weight=5, n_estimators=12000, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=1, min_child_weight=5, n_estimators=12000, subsample=0.6, total=  22.3s
[CV] colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=1, min

[CV]  colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=2, min_child_weight=10, n_estimators=15000, subsample=0.6, total=  47.5s
[CV] colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=2, min_child_weight=10, n_estimators=15000, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=2, min_child_weight=10, n_estimators=15000, subsample=0.6, total=  47.1s
[CV] colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=2, min_child_weight=10, n_estimators=15000, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=2, min_child_weight=10, n_estimators=15000, subsample=0.6, total=  47.8s
[CV] colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=1, min_child_weight=5, n_estimators=10000, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=1, min_child_weight=5, n_estimators=10000, subsample=0.6, total=  18.3s
[CV] colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=1

[CV]  colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=2, min_child_weight=10, n_estimators=10000, subsample=0.6, total=  25.5s
[CV] colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=2, min_child_weight=10, n_estimators=12000, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=2, min_child_weight=10, n_estimators=12000, subsample=0.6, total=  30.4s
[CV] colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=2, min_child_weight=10, n_estimators=12000, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=2, min_child_weight=10, n_estimators=12000, subsample=0.6, total=  30.4s
[CV] colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=2, min_child_weight=10, n_estimators=12000, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=2, min_child_weight=10, n_estimators=12000, subsample=0.6, total=  30.4s
[CV] colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth

[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=2, min_child_weight=5, n_estimators=15000, subsample=0.6, total=  42.4s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=2, min_child_weight=5, n_estimators=15000, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=2, min_child_weight=5, n_estimators=15000, subsample=0.6, total=  40.7s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=2, min_child_weight=10, n_estimators=10000, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=2, min_child_weight=10, n_estimators=10000, subsample=0.6, total=  27.1s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=2, min_child_weight=10, n_estimators=10000, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=2, min_child_weight=10, n_estimators=10000, subsample=0.6, total=  27.2s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=2,

[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.05, max_depth=2, min_child_weight=5, n_estimators=12000, subsample=0.6, total=  32.9s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.05, max_depth=2, min_child_weight=5, n_estimators=12000, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.05, max_depth=2, min_child_weight=5, n_estimators=12000, subsample=0.6, total=  32.9s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.05, max_depth=2, min_child_weight=5, n_estimators=12000, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.05, max_depth=2, min_child_weight=5, n_estimators=12000, subsample=0.6, total=  30.5s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.05, max_depth=2, min_child_weight=5, n_estimators=15000, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.05, max_depth=2, min_child_weight=5, n_estimators=15000, subsample=0.6, total=  38.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.05, max_depth=2, min

[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed: 67.5min finished


In [16]:
print(grid.best_params_)

{'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 2, 'min_child_weight': 10, 'n_estimators': 12000, 'subsample': 0.6}


In [None]:
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': Y_pred})
# you could use any filename. We choose submission here
my_submission.to_csv('submission.csv', index=False)