In [60]:
import  numpy as np
import pandas as pd
from pprint import pprint # To output the tree in a nice format

In [61]:
# Load in the datasets
df_train = pd.read_csv('housing_price_train.csv')
test = pd.read_csv('housing_price_test.csv')

### Preprocessing

#### Information of the datasets

### Cleaning the Dataset

#### Problems found in the datasets
1. The ID column is irrelevant for decision making
2. NaN values in many columns.
3. Some columns(e.g: 'PoolQC','MiscFeature') seems to have almost no data compared to the size of the dataset.
    - Might make the decision tree inaccurate.


In [62]:
# Drop ID column in df_train and test because it is not relevant to making the Decisions
# Dropping 'PoolQC', 'Fence' and 'MiscFeature' columns because they have too little data.
#     - Need to be discussed

df_train = df_train.drop(columns=['Id'])
df_train = df_train.drop(columns=['PoolQC', 'Fence', 'MiscFeature'])

#test = test.drop(columns=['Id'])
#test = test.drop(columns=['PoolQC', 'Fence', 'MiscFeature'])

In [63]:
df_train = pd.read_csv('housing_price_train.csv')
df_test = pd.read_csv('housing_price_test.csv')
df_train["LotFrontage"] = df_train.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    df_train[col] = df_train[col].fillna('None')
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    df_train[col] = df_train[col].fillna(0)
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    df_train[col] = df_train[col].fillna(0)
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    df_train[col] = df_train[col].fillna('None')
df_train["MasVnrType"] = df_train["MasVnrType"].fillna("None")
df_train["MasVnrArea"] = df_train["MasVnrArea"].fillna(0)
df_train['MSZoning'] = df_train['MSZoning'].fillna(df_train['MSZoning'].mode()[0])
# df_train = df_train.drop(['Utilities'], axis=1)
df_train["Functional"] = df_train["Functional"].fillna("Typ")
df_train['Electrical'] = df_train['Electrical'].fillna(df_train['Electrical'].mode()[0])
df_train['KitchenQual'] = df_train['KitchenQual'].fillna(df_train['KitchenQual'].mode()[0])
df_train['Exterior1st'] = df_train['Exterior1st'].fillna(df_train['Exterior1st'].mode()[0])
df_train['Exterior2nd'] = df_train['Exterior2nd'].fillna(df_train['Exterior2nd'].mode()[0])
df_train['SaleType'] = df_train['SaleType'].fillna(df_train['SaleType'].mode()[0])
df_train['MSSubClass'] = df_train['MSSubClass'].fillna("None")
df_train.drop(['PoolQC','MiscFeature','Alley','Fence','FireplaceQu', 'FullBath'], axis=1, inplace=True)
# test.drop(['PoolQC','MiscFeature','Alley','Fence','FireplaceQu'], axis=1, inplace=True)

#### After Cleaning Dataset info

In [64]:
df_train.isnull().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
MasVnrArea       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
                ..
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
KitchenQual      0
TotRmsAbvGrd     0
Functional       0
Fireplaces       0
GarageType       0
GarageYrBlt      0
GarageFinish     0
GarageCars       0
GarageArea       0
GarageQual       0
GarageCond       0
PavedDrive       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch   

In [65]:
test.isnull().sum()

Id                  0
MSSubClass          0
MSZoning            4
LotFrontage       237
LotArea             0
Street              0
Alley            1443
LotShape            0
LandContour         0
Utilities           2
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         1
Exterior2nd         1
MasVnrType         16
MasVnrArea         15
ExterQual           0
ExterCond           0
Foundation          0
                 ... 
HalfBath            0
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         1
TotRmsAbvGrd        0
Functional          2
Fireplaces          0
FireplaceQu       780
GarageType         80
GarageYrBlt        82
GarageFinish       82
GarageCars          1
GarageArea          1
GarageQual         82
GarageCond

In [79]:
std_sale = np.std(df_train.SalePrice)
size = df_train.shape[0]

In [80]:
def compute_sdr(df_train):
    SDR = {}

    for col in df_train:

        x = []
        if df_train[col].dtypes == 'O':
#             print(col)
            for i in df_train[col].unique():
                df_cat = df_train
                df_cat = df_cat.loc[df_cat[col] == i]
                freq = df_cat.shape[0]
                prob = freq/size
                std_attr = np.std(df_cat.SalePrice)
                x.append(prob*std_attr)
            SDR[col] = std_sale - sum(x)

        if df_train[col].dtypes == 'int64' or df_train[col].dtypes == 'float64':
#             print(col)
            over, less = [],[]
            over = df_train.SalePrice.loc[df_train[col] > df_train[col].median()]
            less = df_train.SalePrice.loc[df_train[col] <= df_train[col].median()]
            std_over = np.std(over)
            std_less = np.std(less)
            p_over = over.count()/size
            p_less = less.count()/size
            result = p_over*std_over + p_less*std_less
            result = std_sale - result
            SDR[col] = result
            
    del SDR['SalePrice']
    SDR = sorted(SDR.items(), key=lambda y: y[1],reverse=True)
    print(SDR[0][0])
    return SDR

Neighborhood


[('Neighborhood', 30795.845574663843),
 ('KitchenQual', 24987.299485683092),
 ('ExterQual', 24830.046605054114),
 ('BsmtQual', 24611.99368787081),
 ('OverallQual', 24169.639457317156),
 ('GarageCars', 18655.977486273157),
 ('GrLivArea', 18442.815198341486),
 ('GarageFinish', 16667.33154062002),
 ('YearBuilt', 14193.29356392668),
 ('GarageArea', 13759.676934338233),
 ('Foundation', 13614.002788923463),
 ('GarageType', 13492.906684791596),
 ('GarageYrBlt', 13174.050373039761),
 ('1stFlrSF', 12427.210763854717),
 ('TotalBsmtSF', 12327.533408498653),
 ('BsmtFinType1', 11596.788683461185),
 ('TotRmsAbvGrd', 11091.35232613662),
 ('HeatingQC', 10834.109114619307),
 ('YearRemodAdd', 10270.809470160559),
 ('OpenPorchSF', 9478.212680369354),
 ('Exterior2nd', 8836.394858898915),
 ('MasVnrType', 8797.947762402619),
 ('BsmtExposure', 8716.83846630578),
 ('LotArea', 8319.899363360659),
 ('Exterior1st', 8071.210534326776),
 ('MasVnrArea', 6519.980581424956),
 ('SaleType', 6458.825495552912),
 ('SaleC


### Building the Decision Tree

In [96]:
def decision_tree(dataset, size_limit = 40, sd_limit = 4000):
    tree = {
        'attributes': '',
        'type': '', 
        'branch': []
    }
    

    
    std_red = compute_sdr(dataset)
    column = std_red[0][0]

    if (std_red[1][1] <= sd_limit or dataset.shape[0] < size_limit): 
            
        return np.mean(dataset['SalePrice'])
    

    elif dataset[column].dtypes == 'O':
        unique_values = dataset[column].unique()
        
       
        tree['attributes'] = column
        tree['type'] = 'obj'
        
    
        
        for val in unique_values:
            br_data = dataset.loc[dataset[column] == val].copy()
            br_data = br_data.drop(columns=[column])
            
            tree['branch'].append({
                'condition': val,
                'branch': decision_tree(br_data)
            })
            
    
    else:
        limit = 0
        br_data = [0,0]
        
        if(dataset[column].median() == dataset[column].min()):
            limit = (dataset[column].min() + dataset[column].max())/2
            
            
        else:
            limit = dataset[column].median()
            
        
        
        br_data_0 = dataset.loc[dataset[column] <= limit].copy()
        br_data_0 = br_data_0.drop(columns=[column])
        
        br_data_1 = dataset.loc[dataset[column] > limit].copy()
        br_data_1 = br_data_1.drop(columns=[column])
        
        #print("Left :", br_data_0.shape, "Right :", br_data_1.shape)
        
        
        tree['attributes'] = column
        tree['type'] = 'num'
        tree['limit'] = limit
        
        #print(tree)
        
        tree['branch'] = [
            decision_tree(br_data_0),
            decision_tree(br_data_0)
        ]
    
    return tree

tree = decision_tree(df_train)

Neighborhood
BsmtQual
OverallQual
Foundation
MSZoning
MasVnrType
LotConfig
MSSubClass
HouseStyle
MSSubClass
MSSubClass
Id
MSSubClass
HouseStyle
Foundation
MSZoning
MasVnrType
LotConfig
MSSubClass
HouseStyle
MSSubClass
MSSubClass
Id
MSSubClass
HouseStyle
LotConfig
OverallQual
Exterior1st
Exterior1st
MSSubClass
MSSubClass
Exterior2nd
Id
Id
BsmtQual
Id
Id
MasVnrType
Id
MSSubClass
OverallQual
Exterior1st
Exterior1st
OverallQual
Exterior1st
Exterior1st
Exterior1st
LotConfig
BsmtExposure
OverallQual
Id
Id
Id
LotArea
MSSubClass
LotConfig
LotConfig
MSSubClass
LotConfig
LotConfig
YearRemodAdd
Condition1
Condition1
GarageQual
BsmtFinType1
HouseStyle
Exterior1st
Foundation
BsmtFinType1
MasVnrArea
BsmtFinType1
Id
Id
Id
Id
Id
Id
Id
Id
Id
HouseStyle
HouseStyle
Exterior1st
Exterior1st
MSSubClass
MSSubClass
Id
BsmtFinType2
Exterior1st
HouseStyle
BsmtFinType1
LotShape
OverallQual
BsmtFinType1
Id
Id
Id
HeatingQC
HouseStyle
MSSubClass
OverallCond
BsmtQual
YearRemodAdd
Exterior1st
Exterior1st
BldgType
Bsm

In [97]:
# Predict 
def predict(row, tree):
    #print(type(tree))
    
    # Recursive Path        
    if type(tree) == float:
       
        return tree 
    
    if tree['type'] == 'num':
        checking_attr = tree['attributes'] 
    
        
        t_value = row.iloc[0][checking_attr]
        
        if(t_value <= tree['limit']):
            left_branch = tree['branch'][0]
            if(type(left_branch) != dict):
                return left_branch
            return predict(row, left_branch)
            
        else:
            right_branch = tree['branch'][1]
            if type(right_branch) != dict:
                return right_branch
            return predict(row, right_branch)
        
    elif tree['type'] == 'obj':
        checking_attr = tree['attributes'] 
        
        
        t_value = row.iloc[0][checking_attr]
    
        
        for branch in tree['branch']:
            if type(branch) != dict:
                return branch
            elif t_value == branch['condition']:
                return predict(row, branch['branch'])
    
    return 'Unknown'

In [98]:
predict(test.loc[[4]], tree)

310499.0

In [100]:
sub = pd.DataFrame()
ids = []
predictions = []

for i, row in test.iterrows():
    ids.append(row['Id'])
    predictions.append(predict(test.iloc[[i]], tree))
    
sub['id'] = ids
sub['SalePrice'] = predictions
sub.to_csv('submission.csv', index=False)
