In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn import tree

In [2]:
train_data = pd.read_csv('./train_data.csv') 
y = train_data.PRICE
train_data.describe()

Unnamed: 0,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN_DIST,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH_DIST,NEAREST_SCH_RANK
count,23656.0,23656.0,23656.0,21958.0,23656.0,23656.0,21495.0,23656.0,23656.0,23656.0,23656.0,23656.0,23656.0,16050.0
mean,636890.3,3.658142,1.823301,2.196785,2641.845367,183.488333,1989.704582,19727.496069,4515.986811,6088.947497,-31.960326,115.878999,1.812894,72.808411
std,355998.7,0.751589,0.583628,1.360486,16319.059933,71.963579,20.919355,11355.709306,4464.853127,61.388239,0.177393,0.118019,1.731669,40.679776
min,51000.0,1.0,1.0,1.0,61.0,1.0,1868.0,681.0,46.0,6003.0,-32.467416,115.58361,0.070912,1.0
25%,410000.0,3.0,1.0,2.0,502.0,130.0,1978.0,11100.0,1800.0,6038.0,-32.067406,115.789505,0.88582,39.0
50%,535000.0,4.0,2.0,2.0,682.0,172.0,1995.0,17400.0,3200.0,6069.0,-31.932749,115.853882,1.348658,69.5
75%,760000.0,4.0,2.0,2.0,836.0,222.0,2005.0,26500.0,5300.0,6149.0,-31.84468,115.970145,2.094873,105.0
max,2440000.0,10.0,7.0,99.0,999999.0,870.0,2017.0,59400.0,34900.0,6558.0,-31.462553,116.31764,23.254372,139.0


In [3]:
train_data["DATE_SOLD"]

0        09-2018\r
1        02-2019\r
2        06-2015\r
3        07-2018\r
4        11-2016\r
           ...    
23651    02-2016\r
23652    02-2015\r
23653    05-2018\r
23654    03-2018\r
23655    07-2005\r
Name: DATE_SOLD, Length: 23656, dtype: object

In [19]:
features = ["BEDROOMS","BATHROOMS","GARAGE","LAND_AREA","FLOOR_AREA","BUILD_YEAR","CBD_DIST","NEAREST_STN_DIST","LATITUDE","LONGITUDE","NEAREST_SCH_DIST","NEAREST_SCH_RANK"]

X = train_data[features]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

train_X.GARAGE.fillna(0, inplace=True)
val_X.GARAGE.fillna(0, inplace=True)



train_X["Rank_was_missing"] = train_X["NEAREST_SCH_RANK"].isnull()
val_X["Rank_was_missing"] = val_X["NEAREST_SCH_RANK"].isnull()

train_X.NEAREST_SCH_RANK.fillna(0, inplace=True)
val_X.NEAREST_SCH_RANK.fillna(0, inplace=True)

my_imputer = SimpleImputer(strategy="most_frequent")

imputed_train_X = pd.DataFrame(my_imputer.fit_transform(train_X))
imputed_val_X = pd.DataFrame(my_imputer.transform(val_X))

imputed_train_X.columns = train_X.columns
imputed_val_X.columns = val_X.columns

imputed_train_X.describe()

Unnamed: 0,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN_DIST,LATITUDE,LONGITUDE,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Rank_was_missing
count,17742.0,17742.0,17742.0,17742.0,17742.0,17742.0,17742.0,17742.0,17742.0,17742.0,17742.0,17742.0,17742.0
mean,3.653083,1.822342,2.037763,2618.025251,183.430053,1990.684703,19736.608274,4550.141923,-31.96008,115.879306,1.823634,49.659227,0.31975
std,0.753315,0.583872,1.472762,15562.153047,72.074137,20.10218,11316.350902,4497.266003,0.177138,0.118261,1.747527,47.714451,0.466393
min,1.0,1.0,0.0,61.0,1.0,1868.0,693.0,46.0,-32.467416,115.58361,0.070912,0.0,0.0
25%,3.0,1.0,2.0,503.0,130.0,1980.0,11200.0,1800.0,-32.067602,115.789625,0.892037,0.0,0.0
50%,4.0,2.0,2.0,683.0,172.5,1998.0,17400.0,3300.0,-31.932883,115.854255,1.350224,40.0,0.0
75%,4.0,2.0,2.0,836.0,222.0,2004.0,26500.0,5300.0,-31.844943,115.97057,2.101619,92.0,1.0
max,10.0,7.0,99.0,999999.0,870.0,2017.0,59400.0,34900.0,-31.462553,116.31764,23.254372,139.0,1.0


In [31]:
results = []
leaf_counts = [300,350,400,450,500,550,600,650,700]
for leafs in leaf_counts:
    model = tree.DecisionTreeRegressor(criterion="absolute_error",random_state=0,max_depth=10,max_leaf_nodes=leafs)
    model.fit(imputed_train_X,train_y)
    val_prediction = model.predict(imputed_val_X)

    MAE_Score = mean_absolute_error(val_y,val_prediction)
    results.append([leafs,MAE_Score])

#print("Validation F1 Score for Random Forest Model: {}".format(MAE_Score))
print(results)

[[300, 118011.33496787284], [350, 117601.84731146433], [400, 117461.54041258032], [450, 117624.21203922895], [500, 117409.19851200542], [550, 117663.45958741968], [600, 117670.35847142374], [650, 117753.56459249239], [700, 117844.88112952317]]


In [27]:
for i in range(30):
    print(i+1,results[i])

print(min(results))

1 214014.10297598917
2 188114.5639161312
3 168093.08572877917
4 156482.05089617855
5 145484.43473114644
6 133414.68921203923
7 127382.73537368955
8 123014.22184646601
9 120354.45840378762
10 116787.21559012512
11 118301.18709841055
12 118482.03703077443
13 119626.17382482246
14 123484.33065607033
15 123968.00811633412
16 126636.2530436253
17 129241.55343253298
18 131810.93329387894
19 131510.19665201218
20 131785.75650997632
21 132683.10492052755
22 134071.4549374366
23 134256.30825160636
24 136306.8900067636
25 135204.12250591817
26 135167.82473791004
27 135110.45857287792
28 137041.34697328374
29 136416.8530605343
30 136404.16723030098
116787.21559012512


In [34]:
X=train_data[features]

X.GARAGE.fillna(0, inplace=True)

X["Rank_was_missing"] = X["NEAREST_SCH_RANK"].isnull()

X.NEAREST_SCH_RANK.fillna(0, inplace=True)

my_imputer_full = SimpleImputer(strategy="most_frequent")

imputed_X = pd.DataFrame(my_imputer.fit_transform(X))

imputed_X.columns = X.columns

imputed_X.describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Rank_was_missing"] = X["NEAREST_SCH_RANK"].isnull()


Unnamed: 0,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN_DIST,LATITUDE,LONGITUDE,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Rank_was_missing
count,23656.0,23656.0,23656.0,23656.0,23656.0,23656.0,23656.0,23656.0,23656.0,23656.0,23656.0,23656.0,23656.0
mean,3.658142,1.823301,2.039102,2641.845367,183.488333,1990.645079,19727.496069,4515.986811,-31.960326,115.878999,1.812894,49.398673,0.321525
std,0.751589,0.583628,1.428147,16319.059933,71.963579,20.160342,11355.709306,4464.853127,0.177393,0.118019,1.731669,47.741036,0.467072
min,1.0,1.0,0.0,61.0,1.0,1868.0,681.0,46.0,-32.467416,115.58361,0.070912,0.0,0.0
25%,3.0,1.0,2.0,502.0,130.0,1980.0,11100.0,1800.0,-32.067406,115.789505,0.88582,0.0,0.0
50%,4.0,2.0,2.0,682.0,172.0,1998.0,17400.0,3200.0,-31.932749,115.853882,1.348658,39.0,0.0
75%,4.0,2.0,2.0,836.0,222.0,2004.0,26500.0,5300.0,-31.84468,115.970145,2.094873,92.0,1.0
max,10.0,7.0,99.0,999999.0,870.0,2017.0,59400.0,34900.0,-31.462553,116.31764,23.254372,139.0,1.0


In [35]:
model = tree.DecisionTreeRegressor(criterion="absolute_error",random_state=0,max_depth=10,max_leaf_nodes=500)
model.fit(imputed_X,y)

In [37]:
test_data = pd.read_csv('./test_data.csv') 

# OBROBKA WASZYCH DANYCH
test_y = test_data.PRICE
test_X = test_data[features]

test_X.GARAGE.fillna(0, inplace=True)

test_X["Rank_was_missing"] = test_X["NEAREST_SCH_RANK"].isnull()

test_X.NEAREST_SCH_RANK.fillna(0, inplace=True)


imputed_test_X = pd.DataFrame(my_imputer.transform(test_X))

imputed_test_X.columns = test_X.columns
# KONIEC OBROBKI
imputed_test_X.describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_X["Rank_was_missing"] = test_X["NEAREST_SCH_RANK"].isnull()


Unnamed: 0,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN_DIST,LATITUDE,LONGITUDE,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Rank_was_missing
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,3.6614,1.8225,2.0352,2974.3621,183.5328,1990.7336,19895.3668,4540.8407,-31.961462,115.879893,1.820882,48.1383,0.3346
std,0.753131,0.596347,1.44823,17546.167012,72.435286,20.233224,11384.682528,4565.913004,0.178696,0.118418,1.779517,47.530429,0.471874
min,1.0,1.0,0.0,72.0,18.0,1880.0,747.0,46.0,-32.472979,115.58273,0.097609,0.0,0.0
25%,3.0,1.0,2.0,506.0,130.0,1981.0,11300.0,1800.0,-32.072158,115.790347,0.86765,0.0,0.0
50%,4.0,2.0,2.0,682.0,172.0,1998.0,17700.0,3200.0,-31.934325,115.854939,1.335849,39.0,0.0
75%,4.0,2.0,2.0,845.0,223.0,2004.0,26800.0,5400.0,-31.841951,115.971992,2.103752,92.0,1.0
max,8.0,16.0,50.0,999999.0,840.0,2017.0,59800.0,35500.0,-31.45745,116.343201,17.908836,139.0,1.0


In [38]:
#PREDYKCJA MODELU
prediction = model.predict(imputed_test_X)

MAE_Score = mean_absolute_error(test_y,prediction)

print("Validation F1 Score for Random Forest Model: {}".format(MAE_Score))

Validation F1 Score for Random Forest Model: 118424.03755
