In [55]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

### Part 1: Data processing

In [2]:
df = pd.read_csv('./house_sale_data_with_geocodes.csv')

In [3]:
df.head()

Unnamed: 0,address,zip_code,price,sell_date,sell_type,price_per_sq_m,no_rooms,housing_type,size_in_sq_m,year_of_construction,price_change_in_pct,zip_code_num,long,lat
0,Nykøbingvej 23,4460 Snertinge,150000,05-04-2017,Alm. Salg,1209.0,4.0,Villa,124.0,1934.0,0.0,4460,11.383189,55.714074
1,Bakkekammen 5,4460 Snertinge,1145000,24-03-2017,Alm. Salg,9015.0,3.0,Villa,127.0,1971.0,-4.0,4460,11.381936,55.717096
2,Æblekjærvej 11,4460 Snertinge,700000,15-02-2017,Alm. Salg,6666.0,3.0,Villa,105.0,1946.0,-12.0,4460,11.381427,55.71984
3,Bakkekammen 7,4460 Snertinge,920000,30-01-2017,Alm. Salg,8518.0,4.0,Villa,108.0,1966.0,-3.0,4460,11.382017,55.716914
4,Tværvej 4,4460 Snertinge,300000,01-12-2016,Alm. Salg,3061.0,3.0,Villa,98.0,1964.0,0.0,4460,11.381899,55.710771


We are going to make a classifier which can predict if the house price of a house will increase or decrease og stay the same. We are going to use the following features :  

- zip_code_num
- price
- price_per_sq_m
- no_rooms
- housing_type
- size_in_sq_m
- year_of_construction
- long
- lat

In [4]:
df['housing_type'].unique()

array(['Villa', 'Stuehus', 'Rækkehus', 'Lejlighed', 'Døgninstitution',
       'Sommerhus', 'Detailhandel', 'Undervisning o.l.', 'Anden beboelse',
       'Engroshandel', 'Ukendt', 'BygningTilLager', 'Servicevirksomhed',
       'Erhverv', 'Fritid', 'Kolonihavehus', 'Industri o.l.',
       'Dobbelthus', 'BygningTilHandel', 'Kollegium', 'Biograf o.l.',
       'BygningTilKontor', 'Ferie, andet', 'Anden handel/transport',
       'Landbrug o.l.', 'Idrætsudøvelse', 'Daginstitution', 'Landbrug',
       'Hospital', 'AndenBygningTilKontorHandelLager',
       'Anden Institution', 'Museum o.l.', 'Transport, garage',
       'Tankstation', 'Pengeinstitut o.l.', 'Off. Administration'],
      dtype=object)

We change the housing_type to numbers and save it in a new column called housing_type_bin

In [5]:
le = LabelEncoder()
df['housing_type_bin'] = le.fit_transform(df['housing_type'])

In [6]:
df.head()

Unnamed: 0,address,zip_code,price,sell_date,sell_type,price_per_sq_m,no_rooms,housing_type,size_in_sq_m,year_of_construction,price_change_in_pct,zip_code_num,long,lat,housing_type_bin
0,Nykøbingvej 23,4460 Snertinge,150000,05-04-2017,Alm. Salg,1209.0,4.0,Villa,124.0,1934.0,0.0,4460,11.383189,55.714074,35
1,Bakkekammen 5,4460 Snertinge,1145000,24-03-2017,Alm. Salg,9015.0,3.0,Villa,127.0,1971.0,-4.0,4460,11.381936,55.717096,35
2,Æblekjærvej 11,4460 Snertinge,700000,15-02-2017,Alm. Salg,6666.0,3.0,Villa,105.0,1946.0,-12.0,4460,11.381427,55.71984,35
3,Bakkekammen 7,4460 Snertinge,920000,30-01-2017,Alm. Salg,8518.0,4.0,Villa,108.0,1966.0,-3.0,4460,11.382017,55.716914,35
4,Tværvej 4,4460 Snertinge,300000,01-12-2016,Alm. Salg,3061.0,3.0,Villa,98.0,1964.0,0.0,4460,11.381899,55.710771,35


### Part 2: ML model selection and construction

Next step is to generate labels for the data. We have 3 labels : 

- house price will increase = 2
- house price stay the same = 1
- house price will decrease = 0

Let's generate the labels.

In [7]:
def price_change_label(price_change):
    if price_change > 0.0:
        return 2
    elif price_change < 0.0:
        return 0
    else: 
        return 1

In [8]:
df['price_change_label'] = df.apply(lambda x : price_change_label(x['price_change_in_pct']),axis=1)

Lets see if the dataset is balanced.

In [9]:
count_class_1 , count_class_0 , count_class_2 = df.price_change_label.value_counts()
print(count_class_1,count_class_0,count_class_2)

922902 342021 16915


We can see that the dataset is very unbalanced. so we need to balance it

In [28]:
#Lets make a mask for all of the labels.
df_class_0 = df[df['price_change_label'] == 0]
df_class_1 = df[df['price_change_label'] == 1]
df_class_2 = df[df['price_change_label'] == 2]

In [29]:
# now we take 16915 random values from price_change_label where label is 0 and 1.
df_class_0 = df_class_0.sample(count_class_2)
df_class_1 = df_class_1.sample(count_class_2)

df_class_0_50 = df_class_0[:-50]
df_class_0 = df_class_0[:16865]

df_class_1_50 = df_class_1[:-50]
df_class_1 = df_class_1[:16865]

df_class_2_50 = df_class_2[:-50]
df_class_2 = df_class_2[:16865]

# combine all of the new classes to one dataFrame
df_balanced = pd.concat([df_class_0, df_class_1, df_class_2], axis=0)
df_val_data = pd.concat([df_class_0_50, df_class_1_50, df_class_2_50], axis=0)

Below we can see that we have one balanced dataFrame

In [30]:
df_balanced.price_change_label.value_counts()

2    16865
1    16865
0    16865
Name: price_change_label, dtype: int64

Now we can make our feature set

In [13]:
X = df_balanced[['zip_code_num','price','price_per_sq_m','no_rooms','housing_type_bin','size_in_sq_m','year_of_construction',
      'long','lat']]
y = df_balanced.price_change_label

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1,shuffle=True)

In [50]:
model = KNeighborsClassifier(3)
model.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [51]:
y_pred = model.predict(X_test)

Let calculate the acuracy score.

In [52]:
accuracy_score(y_test,y_pred)

0.5355207409597005

In [53]:
model = DecisionTreeClassifier(max_depth=5)
model.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [54]:
y_pred = model.predict(X_test)
accuracy_score(y_test,y_pred)

0.5596610503497882

In [56]:
metrics.confusion_matrix(y_test,y_pred)

array([[1480, 1008,  979],
       [ 696, 2073,  511],
       [ 842,  433, 2127]])

Lets calculate the precesion and recall

In [62]:
metrics.recall_score(y_test,y_pred, average='macro')

0.5613715614165754

In [63]:
metrics.precision_score(y_test,y_pred,average='macro')

0.5561244659951273

### Part 4: Report your findings (BI)