In [85]:
# Demonstrating the use of DecisionTreeRegressor Algorithm
# Importing Required Libraries and Modules
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
import graphviz
from sklearn.metrics import mean_absolute_error

# Import the dataset in csv format using pandas
df=pd.read_csv('train.csv')

# Selecting features that will be used for model creation and prediction
predictors=['LotArea','YearBuilt','1stFlrSF','2ndFlrSF','FullBath','BedroomAbvGr','TotRmsAbvGrd']

# Applying random splitting function to obtain different training and testing data
train_data,test_data,train_target,test_target=train_test_split(df[predictors],df['SalePrice'],train_size=0.8)

'''
# To be used when manually splitting the dataset into training and testing data
x=df[predictors].loc[0:1400]
y=df['SalePrice'].loc[0:1400]

'''

# Calling the DecisionTreeRegressor module for modelling
reg=DecisionTreeRegressor()

# Fitting the training data into the model
trained=reg.fit(train_data,train_target)

# Predicting the values using the trained model
prediction=trained.predict(test_data)

print(prediction)
print('Actual Values are :')
print(test_target)

# Creating and displaying graph to visualise training of model
data=tree.export_graphviz(reg, out_file=None, feature_names=predictors, class_names=df.SalePrice, filled=True, rounded=True, special_characters=True)
print(graphviz.Source(data))

# Calculating the Mean Absolute Error in our prediction using the above model
print(mean_absolute_error(test_target,output))



[282922. 217500. 126000. 116000.  90350. 281213. 134900. 189000. 175000.
  80000. 147000. 191000. 167000. 185900. 135000. 254000. 148000. 164990.
 189000. 110000. 128500. 184100. 181000. 205000. 173500. 319000. 204000.
 280000. 333168. 143500. 179900. 178000. 184900. 217500. 179000. 135875.
 451950. 270000. 109500. 135000. 315000. 122500. 128000. 133000. 183000.
  87000. 238000. 266000. 165000. 175500. 185000. 184900. 113000. 157000.
 200500. 136000. 179400. 171750. 187500. 188000. 118500. 215000. 191000.
 159500. 145250. 325300. 138500. 140000. 217500. 279500. 118964. 109500.
 132250. 225000. 213500. 207500. 163000. 182000. 142500. 119000. 254900.
 244600. 200000. 254000. 179000. 187500. 173733. 122000.  72500. 176000.
 135750. 162900. 160000. 119000. 345000. 146000. 129000. 155000. 117000.
 140000. 282922. 370878. 207500. 180500. 190000. 248000. 275500. 185000.
 130000. 207500. 239686.  94500. 197500. 153500. 193000. 213500. 227000.
 128500. 133000. 132500. 129900. 132000. 190000. 17

In [80]:
'''
Underfitting and Overfitting.
This section of code highlights the effect of variable no. of max_leaf_nodes parameter in the DecisiontreeRegressor() function on the Mean Absolute Error Value.
'''

# Importing required modules
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
from sklearn.metrics import mean_absolute_error

# Loading the dataset in csv format
df=pd.read_csv('train.csv')

# Selecting features that will be used for model creation and prediction
predictors=['LotArea','YearBuilt','1stFlrSF','2ndFlrSF','FullBath','BedroomAbvGr','TotRmsAbvGrd']

# Creating a function to calculate the mean absolute error for different nmber of leaf nodes values
def mae(max_leaf_nodes,train_data,train_target,test_data,test_target):
    reg=DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes)
    trained=reg.fit(train_data,train_target)  # Fitting the data into the model
    prediction=trained.predict(test_data)
    mae_score=mean_absolute_error(test_target,prediction)  #  Calculating the mean abolute error for current number of leaf nodes
    return mae_score

for max_leaf_nodes in [5,10,20,40,50,100,500,1000,2500,5000]:
    train_data,test_data,train_target,test_target=train_test_split(df[predictors],df['SalePrice'],train_size=0.8)
    my_mae=mae(max_leaf_nodes,train_data,train_target,test_data,test_target)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  37297
Max leaf nodes: 10  		 Mean Absolute Error:  33920
Max leaf nodes: 20  		 Mean Absolute Error:  29673
Max leaf nodes: 40  		 Mean Absolute Error:  27950
Max leaf nodes: 50  		 Mean Absolute Error:  24623
Max leaf nodes: 100  		 Mean Absolute Error:  29934
Max leaf nodes: 500  		 Mean Absolute Error:  31361
Max leaf nodes: 1000  		 Mean Absolute Error:  33244
Max leaf nodes: 2500  		 Mean Absolute Error:  30260
Max leaf nodes: 5000  		 Mean Absolute Error:  30187




In [87]:
'''
Demonstrating the use of Random Forest Algorithm

'''

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

df=pd.read_csv('train.csv')

predictors=['LotArea','YearBuilt','1stFlrSF','2ndFlrSF','FullBath','BedroomAbvGr','TotRmsAbvGrd']

train_data,test_data,train_target,test_target=train_test_split(df[predictors],df['SalePrice'],train_size=0.8)
forest_model=RandomForestRegressor()
trained=forest_model.fit(train_data,train_target)

prediction=trained.predict(test_data)

print(prediction)
mae_score=mean_absolute_error(test_target,prediction)
print(mae_score)

[127460.         194670.         125770.         132010.
 237500.          83690.         135390.         177974.
 496690.4        120600.         164285.         152135.
 135290.         197200.         124430.         363973.3
 144960.         327463.8        259550.         142600.
 381250.         293540.2        107354.3        131243.2
 375306.3        193507.9        105080.         313000.
 221750.         105790.         124000.         256422.8
 186293.2        256529.9        113060.         156970.
 211439.9        122450.         144400.         203810.
 117900.         119400.         233477.         226241.
 142783.33333333 102140.         221378.         427550.
 259990.         119850.         126100.         194604.9
 247450.         162277.5        143143.33333333 185033.2
 270150.         131895.         146985.8        120020.
  70620.         116950.         141195.         157603.2
 133490.         190964.5        217810.         220381.
 169600.         143495.9

