**Picking the data (columns) for the models**

In [158]:
# Importing required packages and modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import OrdinalEncoder



In [159]:
# Set options to show all columns
pd.set_option('display.max_columns', None)

# Load .pkl file and generate pandas dataframe
input_pkl = r'..\data\clean\after_step_3b_outliers_cat.pkl' # Fill your path to file
df_1 = pd.read_pickle(input_pkl)

df_1_shape = df_1.shape # Pass the shape of the dataframe to a variable for summary at end of the outlier detection and removal part
df_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4180 entries, 0 to 5923
Data columns (total 36 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   id                         4180 non-null   Int64   
 1   locality_name              4180 non-null   category
 2   Postal_code                4180 non-null   object  
 3   Price                      4180 non-null   Int64   
 4   Subtype                    4180 non-null   object  
 5   Number_of_bedrooms         4180 non-null   Int64   
 6   Living_area                4180 non-null   Int64   
 7   street                     4180 non-null   category
 8   number                     4165 non-null   category
 9   latitude                   4180 non-null   float64 
 10  longitude                  4180 non-null   float64 
 11  Open_fire                  4180 non-null   bool    
 12  Swimming_Pool              4180 non-null   bool    
 13  hasTerrace                 2813 non-nu

**This selection is made based on the correlation matrix in Team_6_Step_4**

In [160]:
selected_columns = ['Price','Number_of_bedrooms','Living_area','Number_of_facades','State_of_building','epc','landSurface','Has_Assigned_City','Province'] 
df = df_1[selected_columns].copy(deep=True)

df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 4180 entries, 0 to 5923
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   Price               4180 non-null   Int64   
 1   Number_of_bedrooms  4180 non-null   Int64   
 2   Living_area         4180 non-null   Int64   
 3   Number_of_facades   4180 non-null   Int64   
 4   State_of_building   4180 non-null   object  
 5   epc                 4180 non-null   object  
 6   landSurface         4180 non-null   Int64   
 7   Has_Assigned_City   4180 non-null   bool    
 8   Province            4180 non-null   category
dtypes: Int64(5), bool(1), category(1), object(2)
memory usage: 289.9+ KB


In [161]:
df.head(30)

Unnamed: 0,Price,Number_of_bedrooms,Living_area,Number_of_facades,State_of_building,epc,landSurface,Has_Assigned_City,Province
0,319000,3,125,4,To renovate,F,767,False,East Flanders
1,299999,3,167,2,Good,D,1050,False,East Flanders
2,275000,3,154,2,To renovate,E,120,False,Antwerp
3,295000,3,172,3,To renovate,F,309,False,Brabant_Wallon
5,715000,3,280,3,As new,C,374,False,Brabant_Wallon
7,198800,3,125,2,To renovate,F,250,False,East Flanders
8,299000,3,132,2,As new,D,145,True,Antwerp
9,469000,3,153,4,To renovate,D,412,False,Antwerp
11,284000,3,148,2,Good,C,119,False,Antwerp
12,339000,3,164,3,Good,A,108,False,West Flanders


**Checking for missing values**

In [162]:
df.isnull().sum()

Price                 0
Number_of_bedrooms    0
Living_area           0
Number_of_facades     0
State_of_building     0
epc                   0
landSurface           0
Has_Assigned_City     0
Province              0
dtype: int64

**Dealing with categorical Features**

In [163]:
categorical_df = df.select_dtypes(include=['category','object'])

categorical_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4180 entries, 0 to 5923
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   State_of_building  4180 non-null   object  
 1   epc                4180 non-null   object  
 2   Province           4180 non-null   category
dtypes: category(1), object(2)
memory usage: 102.1+ KB


**Encoding Province** - get_dummies

In [164]:
province_dummies= pd.get_dummies(categorical_df['Province'], drop_first=True)
province_dummies.head()

Unnamed: 0,Brabant_Wallon,Brussels,East Flanders,Flemish Brabant,Hainaut,Limburg,Liège,Luxembourg,Namur,West Flanders
0,False,False,True,False,False,False,False,False,False,False
1,False,False,True,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False
5,True,False,False,False,False,False,False,False,False,False


In [165]:
df = pd.concat([df,province_dummies], axis=1)
df= df.drop('Province', axis=1)

**Encoding EPC** - Ordinalencoder

In [166]:
categorical_df['epc'].value_counts()

epc
F    970
C    920
D    743
B    619
E    484
A    444
Name: count, dtype: int64

In [167]:
list_epc = categorical_df['epc'].values.tolist()
unique_epc = list(set(list_epc))
unique_epc.sort(reverse=True)

In [168]:
print(unique_epc)



['F', 'E', 'D', 'C', 'B', 'A']


In [169]:
epc_val = categorical_df[['epc']].values

In [170]:
encoder = OrdinalEncoder(categories=[unique_epc])

In [171]:
#encode and add to the main df
df['Encoded_epc'] = encoder.fit_transform(epc_val)
df= df.drop('epc', axis=1)



***Encoding State_of_building*** - Ordinalencoder

In [172]:
categorical_df['State_of_building'].value_counts()

State_of_building
Good              2044
To renovate        755
As new             723
To be done up      444
Just renovated     214
Name: count, dtype: int64

In [173]:
list_state = categorical_df['State_of_building'].values.tolist()
unique_state = list(set(list_state))
print(unique_state)

['To renovate', 'Just renovated', 'As new', 'Good', 'To be done up']


In [174]:
sort_unique_state = ['To renovate','To be done up','Good', 'Just renovated','As new']

In [175]:
state_val = categorical_df[['State_of_building']].values

In [176]:
encoder = OrdinalEncoder(categories=[sort_unique_state])

In [177]:
df['Encoded_state_of_building'] = encoder.fit_transform(state_val)
df= df.drop('State_of_building', axis=1)

**CENTRING AND SCALING**

In [178]:
# Save data to new csv file

output_csv = r'..\data\clean\model_training.csv'  # Fill your path to file
df.to_csv(output_csv, index=False)


# Save data to new pkl file


output_pkl = r'..\data\clean\model_training.pkl' # Fill your path to file
with open(output_pkl, 'wb') as f:
    pickle.dump(df, f)