In [1]:
# Import all necessary libraries and modules
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib
# scikit-learn: machine-learning library used for training and testing the model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit, cross_val_score
# for exporting the model into binary format
import pickle
# for setting header info into JSON structure that can be used later
import json

In [2]:
# Default canvas for every figure in this notebook: 20 x 10 inches
plt.rcParams['figure.figsize'] = (20, 10)

In [3]:
# Load the real estate listings from disk and preview the first five rows
data_frame = pd.read_csv(filepath_or_buffer=r"estate_dataset.csv")
data_frame.head(n=5)

Unnamed: 0,cijena,stanje,lokacija,broj_kvadrata,sprat,namjesten,broj_soba,grijanje
0,35000.0,0,Novo Sarajevo,40.0,-1.0,0.0,1.0,1.0
1,180000.0,0,Opcina Centar,65.0,-1.0,0.0,2.0,1.0
2,73000.0,0,Novo Sarajevo,38.0,-1.0,0.0,2.0,2.0
3,88000.0,0,Novo Sarajevo,68.0,-1.0,0.0,3.0,2.0
4,95000.0,0,Novo Sarajevo,68.0,-1.0,0.0,3.0,2.0


In [4]:
# Report the (rows, columns) shape of the freshly loaded data
frame_shape = data_frame.shape
print(f"Shape of our data frame: {frame_shape}")

Shape of our data frame: (2238, 8)


# Data Preprocessing

In [5]:
# Count the missing values per column and show them under a heading
na_counts = data_frame.isna().sum()
print("\n# NA values in dataset:", na_counts, sep="\n")
# Discard every row that has at least one missing value
data_frame = data_frame.dropna(axis='index', how='any')


# NA values in dataset:
cijena           2
stanje           0
lokacija         0
broj_kvadrata    2
sprat            4
namjesten        2
broj_soba        4
grijanje         3
dtype: int64


In [6]:
# Number of listings per location and per heating type, most frequent first
location_stats = (
    data_frame.groupby('lokacija')['lokacija']
    .count()
    .sort_values(ascending=False)
)
heating_stats = (
    data_frame.groupby('grijanje')['grijanje']
    .count()
    .sort_values(ascending=False)
)

print("\n# Locations and listings by location: ")
print(location_stats)
print("\n# Types of heating system and number of listings using it: ")
print(heating_stats)


# Locations and listings by location: 
lokacija
Opcina Centar    606
Novi Grad        492
Ilidza           442
Novo Sarajevo    376
Stari Grad       202
Vogosca           90
Hadzici           14
Trnovo             7
Ilijas             5
Name: lokacija, dtype: int64

# Types of heating system and number of listings using it: 
grijanje
1.0    1003
2.0     689
3.0     240
4.0     233
5.0      69
Name: grijanje, dtype: int64


In [7]:
# Listings priced below 10 000 KM are treated as rentals; show them before removal
too_cheap = data_frame['cijena'] < 10000
print("\n# Properties for rent: ")
print(data_frame[too_cheap])
# Remove the rentals together with listings above 500 000 KM, which are too
# expensive to be classified as residential
too_expensive = data_frame['cijena'] > 500000
data_frame = data_frame[~(too_cheap | too_expensive)]


# Properties for rent: 
      cijena  stanje       lokacija  broj_kvadrata  sprat  namjesten  \
1430  1200.0       0  Opcina Centar           89.0    1.0        1.0   
2027  1000.0       0      Novi Grad           74.0    5.0        1.0   
2162   500.0       0         Ilidza           88.0    9.0        1.0   

      broj_soba  grijanje  
1430        4.0       1.0  
2027        3.0       2.0  
2162        4.0       3.0  


In [8]:
# Locations with 50 or fewer listings are collected here;
# they will all be relabelled as a single "Other" category
is_rare = location_stats.le(50)
other_locations = location_stats.loc[is_rare]
print(f"Locations that we will group as 'Other': {other_locations}")

Locations that we will group as 'Other': lokacija
Hadzici    14
Trnovo      7
Ilijas      5
Name: lokacija, dtype: int64


In [9]:
# Relabel every location listed in other_locations as 'Ostalo' ("Other").
# other_locations is a Series indexed by location name, so membership is tested
# against its index explicitly rather than through the implicit `in` lookup.
# assign() builds a new frame instead of writing a column into the
# boolean-filtered data_frame, which avoids pandas' SettingWithCopyWarning.
is_rare_location = data_frame['lokacija'].isin(other_locations.index)
data_frame = data_frame.assign(
    lokacija=data_frame['lokacija'].where(~is_rare_location, 'Ostalo')
)
print(f"\n# New unique locations: {data_frame.lokacija.unique()}")


# New unique locations: ['Novo Sarajevo' 'Opcina Centar' 'Stari Grad' 'Novi Grad' 'Vogosca'
 'Ilidza' 'Ostalo']


In [10]:
# Add the price per square metre (cijena / broj_kvadrata) for every listing.
# assign() returns a new frame, so no column is written into the
# boolean-filtered data_frame (avoids pandas' SettingWithCopyWarning).
data_frame = data_frame.assign(
    cijena_po_m2=data_frame['cijena'] / data_frame['broj_kvadrata']
)
print(f"\n# Data frame with new column (price/m2): \n {data_frame.head()}")

print(f"\n# Description of price/m2:\n{data_frame['cijena_po_m2'].describe()}")


# Data frame with new column (price/m2): 
      cijena  stanje       lokacija  broj_kvadrata  sprat  namjesten  \
0   35000.0       0  Novo Sarajevo           40.0   -1.0        0.0   
1  180000.0       0  Opcina Centar           65.0   -1.0        0.0   
2   73000.0       0  Novo Sarajevo           38.0   -1.0        0.0   
3   88000.0       0  Novo Sarajevo           68.0   -1.0        0.0   
4   95000.0       0  Novo Sarajevo           68.0   -1.0        0.0   

   broj_soba  grijanje  cijena_po_m2  
0        1.0       1.0    875.000000  
1        2.0       1.0   2769.230769  
2        2.0       2.0   1921.052632  
3        3.0       2.0   1294.117647  
4        3.0       2.0   1397.058824  

# Description of price/m2:
count    2200.000000
mean     2414.307087
std       705.806065
min       446.735395
25%      1916.666667
50%      2323.055028
75%      2818.993506
max      5697.674419
Name: cijena_po_m2, dtype: float64


In [11]:
# Outlier removal: keep listings whose price per m2 lies within one standard
# deviation of the mean price per m2 of their own location.
def price_per_msq_outlier(df):
    """Drop price-per-m2 outliers separately for every location.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'lokacija' and 'cijena_po_m2'.

    Returns
    -------
    pandas.DataFrame
        Rows whose 'cijena_po_m2' is in the half-open interval
        (mean - std, mean + std] of their 'lokacija' group, where std is the
        population standard deviation (ddof=0). Groups appear in the sorted
        order produced by groupby and the index is renumbered from 0.
        When no group exists (empty input) an empty frame with the same
        columns as `df` is returned.
    """
    kept_groups = []
    for _, group_df in df.groupby('lokacija'):
        prices = group_df['cijena_po_m2']
        group_mean = prices.mean()
        group_std = prices.std(ddof=0)
        # Lower bound is exclusive, upper bound inclusive
        within_one_std = (prices > (group_mean - group_std)) & (prices <= (group_mean + group_std))
        kept_groups.append(group_df[within_one_std])

    if not kept_groups:
        # pd.concat rejects an empty list; keep the schema for downstream cells
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of growing a frame inside the loop
    return pd.concat(kept_groups, ignore_index=True)

In [12]:
# Apply the per-location outlier filter and report how many rows remain
data_frame = data_frame.pipe(price_per_msq_outlier)
remaining_shape = data_frame.shape
print(f"New shape of data: {remaining_shape}")

New shape of data: (1598, 9)


In [13]:
# The helper column was only needed for outlier removal; remove it again
data_frame = data_frame.drop(columns=['cijena_po_m2'])

In [14]:
# One-hot encode the location: one indicator column per distinct location
location_column = data_frame['lokacija']
dummies = pd.get_dummies(location_column)
print(f"\n# Dummies:\n{dummies.head()}")


# Dummies:
   Ilidza  Novi Grad  Novo Sarajevo  Opcina Centar  Ostalo  Stari Grad  \
0       1          0              0              0       0           0   
1       1          0              0              0       0           0   
2       1          0              0              0       0           0   
3       1          0              0              0       0           0   
4       1          0              0              0       0           0   

   Vogosca  
0        0  
1        0  
2        0  
3        0  
4        0  


In [15]:
# Append the indicator columns, then remove the original string location column
data_frame = (
    pd.concat([data_frame, dummies], axis=1)
    .drop(columns=['lokacija'])
)
print(f"\n# Data frame ready:\n{data_frame.head()}")


# Data frame ready:
     cijena  stanje  broj_kvadrata  sprat  namjesten  broj_soba  grijanje  \
0  106000.0       0           51.0    0.0        0.0        2.0       1.0   
1  113351.0       0           54.0    0.0        0.0        3.0       1.0   
2  117562.0       1           68.0    0.0        0.0        3.0       1.0   
3  169000.0       0           83.0    0.0        0.0        4.0       1.0   
4  190210.5       1           88.0    0.0        0.0        4.0       1.0   

   Ilidza  Novi Grad  Novo Sarajevo  Opcina Centar  Ostalo  Stari Grad  \
0       1          0              0              0       0           0   
1       1          0              0              0       0           0   
2       1          0              0              0       0           0   
3       1          0              0              0       0           0   
4       1          0              0              0       0           0   

   Vogosca  
0        0  
1        0  
2        0  
3        0  
4     

In [16]:
# Independent variables: every column except the price
X = data_frame.drop(columns='cijena')
print(f"\n# Independent variables:\n{X.head()}")


# Independent variables:
   stanje  broj_kvadrata  sprat  namjesten  broj_soba  grijanje  Ilidza  \
0       0           51.0    0.0        0.0        2.0       1.0       1   
1       0           54.0    0.0        0.0        3.0       1.0       1   
2       1           68.0    0.0        0.0        3.0       1.0       1   
3       0           83.0    0.0        0.0        4.0       1.0       1   
4       1           88.0    0.0        0.0        4.0       1.0       1   

   Novi Grad  Novo Sarajevo  Opcina Centar  Ostalo  Stari Grad  Vogosca  
0          0              0              0       0           0        0  
1          0              0              0       0           0        0  
2          0              0              0       0           0        0  
3          0              0              0       0           0        0  
4          0              0              0       0           0        0  


In [17]:
# Dependent variable: the listing price
y = data_frame['cijena']
print(f"\n# Dependent variables:\n{y.head()}")


# Dependent variables:
0    106000.0
1    113351.0
2    117562.0
3    169000.0
4    190210.5
Name: cijena, dtype: float64


In [18]:
# Hold out 20% of the listings for testing; the fixed random_state makes the
# split reproducible
split_parts = train_test_split(X, y, random_state=10, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [19]:
# Create the linear regression model and fit it on the training split;
# the trailing expression displays the fitted estimator
estate_model = LinearRegression().fit(X_train, y_train)
estate_model

LinearRegression()

In [20]:
# Score the fitted model on the held-out test split
test_score = estate_model.score(X_test, y_test)
print(f"Model score: {test_score}")

Model score: 0.902984556195822


In [21]:
# Cross-validate a fresh LinearRegression on 10 random 80/20 shuffles of the data
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
fold_scores = cross_val_score(LinearRegression(), X, y, cv=cv)
print(fold_scores)

[0.89245862 0.89085821 0.88753011 0.89673202 0.89669454 0.89953088
 0.89378961 0.89318804 0.88669317 0.90325162]


In [22]:
# Function that predicts price based on info we provide
def predict_price(mymodel, lokacija, stanje, broj_kvadrata, sprat, namjesten, broj_soba, grijanje, columns=None):
    """Predict the price of a single listing with a fitted model.

    A one-row feature frame is built in the layout of the training matrix:
    the six numeric features first, followed by the one-hot location columns.

    Parameters
    ----------
    mymodel : object
        Fitted estimator; only its ``predict`` method is used.
    lokacija : str
        Location name. A name that is not among the one-hot location columns
        is mapped to 'Ostalo' when that column exists (the same relabelling
        the training data received); otherwise no location flag is set.
    stanje, broj_kvadrata, sprat, namjesten, broj_soba, grijanje : number
        Written to feature positions 0-5, in this order.
    columns : sequence of str, optional
        Feature names in training order. Defaults to the notebook-level
        ``X.columns``.

    Returns
    -------
    The first element of ``mymodel.predict(...)`` for the constructed row.
    """
    feature_names = list(X.columns if columns is None else columns)
    numeric_values = [stanje, broj_kvadrata, sprat, namjesten, broj_soba, grijanje]

    x = np.zeros(len(feature_names))
    x[:len(numeric_values)] = numeric_values

    # Only the columns after the numeric features are location flags; searching
    # just these prevents a location string such as 'sprat' from overwriting a
    # numeric feature.
    location_names = feature_names[len(numeric_values):]
    if lokacija not in location_names and 'Ostalo' in location_names:
        lokacija = 'Ostalo'
    if lokacija in location_names:
        x[len(numeric_values) + location_names.index(lokacija)] = 1

    # Predict from a named frame so the feature names match those seen in fit()
    features = pd.DataFrame([x], columns=feature_names)
    return mymodel.predict(features)[0]

In [23]:
# Novo Sarajevo, used apartment, 52 square metres, 3rd floor, unfurnished, 2 rooms, gas heating
predicted_price = predict_price(
    estate_model,
    lokacija='Novo Sarajevo',
    stanje=0,
    broj_kvadrata=52,
    sprat=3,
    namjesten=0,
    broj_soba=2,
    grijanje=1,
)
print(f"Prediction: {predicted_price}")

Prediction: 124975.98445784647


In [None]:
# Ask whether the trained model should be exported; input() already returns a str
print("\n\n Export model?\n")
exp_choice = input("Type 'y' or 'yes' to export or 'n' or 'no' to skip: ")



You can export the model if you're satisfied with its performance.



In [None]:
# Normalise the answer so that inputs such as "Y", "Yes" or " yes " are accepted too
normalized_choice = exp_choice.strip().lower()

if normalized_choice in ('y', 'yes'):

    # Export the model into binary pickle file
    with open('estateai_model_v1.pickle', 'wb') as f:
        pickle.dump(estate_model, f)

    # Store the lower-cased feature names, in training order, so the column
    # layout can be reused later
    columns = {
        'podaci_kolona': [col.lower() for col in X.columns]
    }
    with open("columns.json", "w") as f:
        json.dump(columns, f)

elif normalized_choice not in ('n', 'no'):
    # Anything other than an explicit yes/no is reported; nothing is exported
    print("Wrong! Try Again.!")