## Housing Price Prediction

#### Import dependencies

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree,ensemble
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression

#### Import the data

In [6]:
data = pd.read_csv(r'train.csv',encoding='ISO-8859-1')

#### Analyze the data

In [49]:
print(data.head())
print(data.describe())
print(data.info())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities    ...     PoolArea PoolQC Fence MiscFeature MiscVal  \
0         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
1         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
2         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
3         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
4         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   

  MoSold YrSold  SaleType  SaleCondition  SalePrice  
0      2   2008     

#### Null values in dataset

In [5]:
print(data.isnull().sum())

Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
                 ... 
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         0
TotRmsAbvGrd        0
Functional          0
Fireplaces          0
FireplaceQu       690
GarageType         81
GarageYrBlt        81
GarageFinish       81
GarageCars          0
GarageArea          0
GarageQual         81
GarageCond         81
PavedDrive

#### Impute missing data

In [6]:
for col in data.columns:
    if data[col].isnull().sum()>0.3*len(data[col]):
        data.drop(col,axis=1,inplace=True)
    elif data[col].isnull().sum()>1:
        data.fillna(method='ffill',inplace=True)
        data.fillna(method='bfill',inplace=True)

#### Analyze Categorical Columns

In [50]:
data_cat = data.select_dtypes(include='object')
data_cat.describe()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
count,1460,1460,91,1460,1460,1460,1460,1460,1460,1460,...,1379,1379,1379,1379,1460,7,281,54,1460,1460
unique,5,2,2,4,4,2,5,3,25,9,...,6,3,5,5,3,3,4,4,9,6
top,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,Attchd,Unf,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal
freq,1151,1454,50,925,1311,1459,1052,1382,225,1260,...,870,605,1311,1326,1340,3,157,49,1267,1198


#### Drop Neighbourhood column due to more number of unique values

In [8]:
data_cat.drop('Neighborhood',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


#### Encoding categorical variables

In [51]:
for i in data_cat.columns:
    temp = pd.get_dummies(data_cat[i],prefix=str(i)+'_')
    data_cat = pd.concat([data_cat,temp],axis=1)
    data_cat.drop(i,axis=1,inplace=True)

In [52]:
data_num = data.select_dtypes(include=['int64','float64'])
# data_num.describe()
df=pd.concat([data_cat,data_num],axis=1)

#### Splitting data into train and test set 

In [53]:
train_x,test_x,train_y,test_y = train_test_split(df.iloc[:,:-1],df.iloc[:,-1],random_state=0)

#### Linear Regression

In [12]:
linear_reg = LinearRegression()
linear_reg.fit(train_x,train_y)
linear_reg.score(test_x,test_y)

-58738801.44414648

#### Decision Tree 

In [13]:
dtreg = tree.DecisionTreeRegressor()
dtreg.fit(train_x,train_y)
dtreg.score(test_x,test_y)

0.686405246356532

#### Random Forest

In [14]:
rfclass = ensemble.RandomForestRegressor()
rfclass.fit(train_x,train_y)
rfclass.score(test_x,test_y)



0.8278960213204741

#### XGBoost

In [19]:
import xgboost as xgb
model = xgb.XGBRegressor()
model.fit(train_x,train_y)
model.score(test_x,test_y)

0.886605843132648

In [33]:
from sklearn.metrics import mean_squared_error
import numpy as np
pred=model.predict(test_x)
np.sqrt(mean_squared_error(pred,test_y))

16204.24691780822

#### Plot actual vs predicted data

In [7]:
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set(title="Actual vs Predicted")
plt.plot(x=range(len(test_y)),y=test_y)
plt.plot(x=range(len(test_y)),y=pred)
plt.legend(['Actual','Predicted'])