In [1]:
import  numpy as np
import pandas as pd
from pprint import pprint # To output the tree in a nice format

In [2]:
# Load in the datasets
training = pd.read_csv('data/housing_price_train.csv')
test = pd.read_csv('data/housing_price_test.csv')

### Preprocessing

#### Information of the datasets

In [3]:
# Training Dataset
print(training.info(), '\n\n')
print("Unique Data Types:",training.dtypes.unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [4]:
# Test Dataset
print(test.info(), '\n\n')
print("Unique Data Types:",test.dtypes.unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 80 columns):
Id               1560 non-null int64
MSSubClass       1560 non-null int64
MSZoning         1556 non-null object
LotFrontage      1323 non-null float64
LotArea          1560 non-null int64
Street           1560 non-null object
Alley            117 non-null object
LotShape         1560 non-null object
LandContour      1560 non-null object
Utilities        1558 non-null object
LotConfig        1560 non-null object
LandSlope        1560 non-null object
Neighborhood     1560 non-null object
Condition1       1560 non-null object
Condition2       1560 non-null object
BldgType         1560 non-null object
HouseStyle       1560 non-null object
OverallQual      1560 non-null int64
OverallCond      1560 non-null int64
YearBuilt        1560 non-null int64
YearRemodAdd     1560 non-null int64
RoofStyle        1560 non-null object
RoofMatl         1560 non-null object
Exterior1st      1559 non-

### Cleaning the Dataset

#### Problems found in the datasets
1. The ID column is irrelevant for decision making
2. NaN values in many columns.
3. Some columns(e.g: 'PoolQC','MiscFeature') seems to have almost no data compared to the size of the dataset.
    - Might make the decision tree inaccurate.


In [5]:
# Drop ID column in training and test because it is not relevant to making the Decisions
# Dropping 'PoolQC', 'Fence' and 'MiscFeature' columns because they have too little data.
#     - Need to be discussed

training = training.drop(columns=['Id'])
training = training.drop(columns=['PoolQC', 'Fence', 'MiscFeature'])

#test = test.drop(columns=['Id'])
#test = test.drop(columns=['PoolQC', 'Fence', 'MiscFeature'])

In [6]:
# Function for replacing Nan Values
def replace_NaN_for(dataset):
    numeric_cols = dataset.loc[:, dataset.columns != 'SalePrice'].select_dtypes(include=['int64','float64']).columns
    non_numeric_cols = dataset.loc[:, dataset.columns != 'SalePrice'].select_dtypes(include=['object']).columns
    
    # if the column is numeric replace empty values with the median
    for col in dataset.loc[:, numeric_cols]:
        dataset[col] = dataset[col].fillna(dataset[col].median())
        
    # if the column is non-numeric replace empty values with the mode
    for col in dataset.loc[:, non_numeric_cols]:
        #print(col, ":", dataset[col].mode()[0], type(dataset[col].mode()[0]))
        dataset[col] = dataset[col].fillna(dataset[col].mode()[0])
        
    return dataset

training = replace_NaN_for(training)
test = replace_NaN_for(test)

#### After Cleaning Dataset info

In [7]:
training.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,69.863699,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.117123,443.639726,46.549315,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,42.300571,22.027677,9981.264932,1.382997,1.112799,30.202904,20.645407,180.731373,456.098091,161.319273,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,20.0,60.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,70.0,79.0,11601.5,7.0,6.0,2000.0,2004.0,164.25,712.25,0.0,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [8]:
test.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1560.0,1560.0,1560.0,1560.0,1560.0,1560.0,1560.0,1560.0,1560.0,1560.0,...,1560.0,1560.0,1560.0,1560.0,1560.0,1560.0,1560.0,1560.0,1560.0,1560.0
mean,2240.5,57.182692,68.623718,9956.119231,6.089744,5.560256,1971.344231,1983.726282,100.357692,441.667949,...,473.767949,92.945513,47.946795,24.348077,1.816667,16.935897,1.63141,55.782692,6.103205,2007.769231
std,450.477524,42.765174,20.567293,7147.794552,1.428976,1.110431,30.359659,21.134168,177.392138,454.634885,...,216.683306,126.361822,68.619824,67.372439,20.286428,56.37416,29.490533,611.047964,2.727333,1.311841
min,1461.0,20.0,21.0,1470.0,1.0,1.0,1879.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,1850.75,20.0,60.0,7398.0,5.0,5.0,1953.75,1963.0,0.0,0.0,...,318.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,2240.5,50.0,68.0,9431.0,6.0,5.0,1973.0,1992.0,0.0,360.0,...,480.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,2630.25,70.0,78.0,11516.25,7.0,6.0,2001.0,2004.0,162.25,747.25,...,576.25,168.0,70.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,3020.0,190.0,200.0,215245.0,10.0,9.0,2010.0,2010.0,1290.0,4010.0,...,1488.0,1424.0,742.0,1012.0,360.0,576.0,800.0,17000.0,12.0,2010.0


### Building the Decision Tree

In [9]:
def build_tree(dataset, size_limit = 50, standard_deviation_limit = 5000):
    node = {
        'attribute': '',
        'type': '', # numeric(num) / non-numeric(obj)
        'branches': []
    }
    
    std_dev_red = {}
    sale_price_std = np.std(dataset['SalePrice'])
    if sale_price_std == 0:
        return np.mean(dataset['SalePrice'])
    
    numeric_cols = dataset.loc[:, dataset.columns != 'SalePrice'].select_dtypes(include=['int64','float64']).columns
    non_numeric_cols = dataset.loc[:, dataset.columns != 'SalePrice'].select_dtypes(include=['object']).columns    
    
    # For all Numeric columns - Split based on Mean or Median
    for col in dataset.loc[:, numeric_cols]:
        threshold = 0
        #if(dataset[col].min() == dataset[col].max()):
            
        if(dataset[col].median() == dataset[col].min() or dataset[col].median() == dataset[col].max()):
            threshold = (dataset[col].min() + dataset[col].max())/2
            #print(col,":", dataset[col].min(), "-", dataset[col].max(), "; Mean:", threshold)
        else:
            threshold = dataset[col].median()
            #print(col,":", dataset[col].min(), "-", dataset[col].max(), "; Median:", threshold)
            
        left = dataset['SalePrice'].loc[dataset[col] <= threshold]
        right = dataset['SalePrice'].loc[dataset[col] > threshold]
        
        left_std = np.std(left)
        right_std = np.std(right)
        
        left_prob = left.count() / dataset.shape[0]
        right_prob = right.count() / dataset.shape[0]
        
        std_dev_red[col] = sale_price_std - (left_prob*left_std + right_prob*right_std)
    
    # For all Non-numeric columns - Split based on Unique Values
    for col in dataset.loc[:, non_numeric_cols]:
        unique_values = dataset[col].unique()
        col_stds = []
        for val in unique_values:
            temp = dataset.loc[dataset[col] == val]
            prob = temp.shape[0] / dataset.shape[0]
            std = np.std(temp['SalePrice'])
            col_stds.append(prob*std)
        
        std_dev_red[col] = sale_price_std - sum(col_stds)

        
    # Find Best splitting column
    top_std_dev_red = sorted(std_dev_red.items(), key=lambda y: y[1], reverse=True)[0]
    
    column = top_std_dev_red[0]
    
    # Recursive Paths
    if (top_std_dev_red[1] <= standard_deviation_limit or dataset.shape[0] < size_limit): # Leaf Node
        #if(top_std_dev_red[1] <= standard_deviation_limit):
            #print("SDR Limit reached, value returned:", np.mean(dataset['SalePrice']))

        #if(dataset.shape[0] < size_limit):
            #print("Size Limit reached, value returned:", np.mean(dataset['SalePrice']))
            
        return np.mean(dataset['SalePrice'])
    
    # If the column is non-numeric
    elif dataset[column].dtypes == 'O':
        unique_values = dataset[column].unique()
        
        #print("\nBranching by:", column, "Type: Obj",
              #"\nSDR Result:",top_std_dev_red[1],
              #"\nUnique Values:", unique_values)
        
        node['attribute'] = column
        node['type'] = 'obj'
        
        #print(node)
        
        for val in unique_values:
            branch_data = dataset.loc[dataset[column] == val].copy()
            branch_data = branch_data.drop(columns=[column])
            
            node['branches'].append({
                'condition': val,
                'branch': build_tree(branch_data)
            })
            
    # Else the column is numeric
    else:
        threshold = 0
        branch_data = [0,0]
        
        if(dataset[column].median() == dataset[column].min()):
            threshold = (dataset[column].min() + dataset[column].max())/2
            
            #print("\nBranching by:", column, "Type: Num",
                  #"\nSDR Result:", top_std_dev_red[1],
                  #"\nValues(Min-Max)", (dataset[column].min(), dataset[column].max()), 
                  #"\nThreshold(Mid):", threshold,
                  #"\nDataset Shape:", dataset.shape,
                  #"\nLeft :", dataset.loc[dataset[column] <= threshold].shape, "Right :", dataset.loc[dataset[column] > threshold].shape
                 #)
        else:
            threshold = dataset[column].median()
            
            #print("\nBranching by:", column, "Type: Num",
                  #"\nSDR Result:", top_std_dev_red[1],
                  #"\nValues(Min-Max)", (dataset[column].min(), dataset[column].max()), 
                  #"\nThreshold(Median):", threshold,
                  #"\nDataset Shape:", dataset.shape,
                 #)
            
        
        
        branch_data_0 = dataset.loc[dataset[column] <= threshold].copy()
        branch_data_0 = branch_data_0.drop(columns=[column])
        
        branch_data_1 = dataset.loc[dataset[column] > threshold].copy()
        branch_data_1 = branch_data_1.drop(columns=[column])
        
        #print("Left :", branch_data_0.shape, "Right :", branch_data_1.shape)
        
        
        node['attribute'] = column
        node['type'] = 'num'
        node['threshold'] = threshold
        
        #print(node)
        
        node['branches'] = [
            build_tree(branch_data_0),
            build_tree(branch_data_0)
        ]
    
    return node

tree = build_tree(training)

In [10]:
# Representation of the Tree
pprint(dict(tree))

{'attribute': 'Neighborhood',
 'branches': [{'branch': {'attribute': 'OverallQual',
                          'branches': [{'attribute': 'LowQualFinSF',
                                        'branches': [{'attribute': 'FullBath',
                                                      'branches': [{'attribute': 'GrLivArea',
                                                                    'branches': [{'attribute': 'YearBuilt',
                                                                                  'branches': [137059.375,
                                                                                               137059.375],
                                                                                  'threshold': 1996.0,
                                                                                  'type': 'num'},
                                                                                 {'attribute': 'YearBuilt',
                                          

In [23]:
# Predict 
def predict(row, tree):
    #print(type(tree))
    
    # Recursive Path        
    if type(tree) == float: # Leaf Node
        #print("-------------------------------------------------------------")
        return tree   # Return Prediction
    
    if tree['type'] == 'num':
        checking_attr = tree['attribute'] # Checking with this Attribute
        #print("Checking with:", checking_attr)
        
        temp_val = row.iloc[0][checking_attr]
        #print(temp_val, type(temp_val))
        if(temp_val <= tree['threshold']):
            left_branch = tree['branches'][0]
            if(type(left_branch) != dict):
                return left_branch
            return predict(row, left_branch)
            
        else:
            right_branch = tree['branches'][1]
            if type(right_branch) != dict:
                return right_branch
            return predict(row, right_branch)
        
    elif tree['type'] == 'obj':
        checking_attr = tree['attribute'] # Checking with this Attribute
        #print("Checking with:", checking_attr)
        
        temp_val = row.iloc[0][checking_attr]
        #print(temp_val, type(temp_val))
        
        for branch in tree['branches']:
            if type(branch) != dict:
                return branch
            elif temp_val == branch['condition']:
                return predict(row, branch['branch'])
    
    return 'Unknown'

In [24]:
predict(test.loc[[4]], tree)

310499.0

In [31]:
sumbission_csv = pd.DataFrame()
ids = []
predictions = []

for i, row in test.iterrows():
    ids.append(row['Id'])
    predictions.append(predict(test.iloc[[i]], tree))
    
sumbission_csv['id'] = ids
sumbission_csv['SalePrice'] = predictions
sumbission_csv.to_csv('sub.csv', index=False)
#print(id, prediction)
