**Picking the data (columns) for the models**

In [267]:
# Importing required packages and modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, r2_score
from scipy.stats import uniform





In [268]:
# Set options to show all columns
pd.set_option('display.max_columns', None)

# Load .pkl file and generate pandas dataframe
input_pkl = r'..\data\clean\after_step_3b_outliers_cat.pkl' # Fill your path to file
df_1 = pd.read_pickle(input_pkl)

df_1_shape = df_1.shape # Pass the shape of the dataframe to a variable for summary at end of the outlier detection and removal part
df_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4180 entries, 0 to 5923
Data columns (total 36 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   id                         4180 non-null   Int64   
 1   locality_name              4180 non-null   category
 2   Postal_code                4180 non-null   object  
 3   Price                      4180 non-null   Int64   
 4   Subtype                    4180 non-null   object  
 5   Number_of_bedrooms         4180 non-null   Int64   
 6   Living_area                4180 non-null   Int64   
 7   street                     4180 non-null   category
 8   number                     4165 non-null   category
 9   latitude                   4180 non-null   float64 
 10  longitude                  4180 non-null   float64 
 11  Open_fire                  4180 non-null   bool    
 12  Swimming_Pool              4180 non-null   bool    
 13  hasTerrace                 2813 non-nu

**This selection is made based on the correlation matrix in Team_6_Step_4**

In [269]:
selected_columns = ['Price','Number_of_bedrooms','Living_area','Number_of_facades','State_of_building','epc','landSurface','Has_Assigned_City','Province'] 
df = df_1[selected_columns].copy(deep=True)

df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 4180 entries, 0 to 5923
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   Price               4180 non-null   Int64   
 1   Number_of_bedrooms  4180 non-null   Int64   
 2   Living_area         4180 non-null   Int64   
 3   Number_of_facades   4180 non-null   Int64   
 4   State_of_building   4180 non-null   object  
 5   epc                 4180 non-null   object  
 6   landSurface         4180 non-null   Int64   
 7   Has_Assigned_City   4180 non-null   bool    
 8   Province            4180 non-null   category
dtypes: Int64(5), bool(1), category(1), object(2)
memory usage: 289.9+ KB


In [270]:
df.head(30)

Unnamed: 0,Price,Number_of_bedrooms,Living_area,Number_of_facades,State_of_building,epc,landSurface,Has_Assigned_City,Province
0,319000,3,125,4,To renovate,F,767,True,East Flanders
1,299999,3,167,2,Good,D,1050,True,East Flanders
2,275000,3,154,2,To renovate,E,120,True,Antwerp
3,295000,3,172,3,To renovate,F,309,True,Brabant_Wallon
5,715000,3,280,3,As new,C,374,True,Brabant_Wallon
7,198800,3,125,2,To renovate,F,250,True,East Flanders
8,299000,3,132,2,As new,D,145,False,Antwerp
9,469000,3,153,4,To renovate,D,412,True,Antwerp
11,284000,3,148,2,Good,C,119,True,Antwerp
12,339000,3,164,3,Good,A,108,True,West Flanders


**Checking for missing values**

In [271]:
df.isnull().sum()

Price                 0
Number_of_bedrooms    0
Living_area           0
Number_of_facades     0
State_of_building     0
epc                   0
landSurface           0
Has_Assigned_City     0
Province              0
dtype: int64

**Dealing with binary Features**

In [272]:
df['Has_Assigned_City'] = df['Has_Assigned_City'].astype('Int64')

**Dealing with categorical Features**

In [273]:
categorical_df = df.select_dtypes(include=['category','object'])

categorical_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4180 entries, 0 to 5923
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   State_of_building  4180 non-null   object  
 1   epc                4180 non-null   object  
 2   Province           4180 non-null   category
dtypes: category(1), object(2)
memory usage: 102.1+ KB


**Encoding Province** - get_dummies

In [274]:
province_dummies= pd.get_dummies(categorical_df['Province'], drop_first=True)
province_dummies.head()

Unnamed: 0,Brabant_Wallon,Brussels,East Flanders,Flemish Brabant,Hainaut,Limburg,Liège,Luxembourg,Namur,West Flanders
0,False,False,True,False,False,False,False,False,False,False
1,False,False,True,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False
5,True,False,False,False,False,False,False,False,False,False


In [275]:
df = pd.concat([df,province_dummies], axis=1)
df= df.drop('Province', axis=1)

**Encoding EPC** - Ordinalencoder

In [276]:
categorical_df['epc'].value_counts()

epc
F    970
C    920
D    743
B    619
E    484
A    444
Name: count, dtype: int64

In [277]:
list_epc = categorical_df['epc'].values.tolist()
unique_epc = list(set(list_epc))
unique_epc.sort(reverse=True)

In [278]:
print(unique_epc)



['F', 'E', 'D', 'C', 'B', 'A']


In [279]:
epc_val = categorical_df[['epc']].values

In [280]:
encoder = OrdinalEncoder(categories=[unique_epc])

In [281]:
#encode and add to the main df
df['Encoded_epc'] = encoder.fit_transform(epc_val)
df= df.drop('epc', axis=1)



**Encoding State_of_building** - Ordinalencoder

In [282]:
categorical_df['State_of_building'].value_counts()

State_of_building
Good              2044
To renovate        755
As new             723
To be done up      444
Just renovated     214
Name: count, dtype: int64

In [283]:
list_state = categorical_df['State_of_building'].values.tolist()
unique_state = list(set(list_state))
print(unique_state)

['As new', 'To be done up', 'Just renovated', 'To renovate', 'Good']


In [284]:
sort_unique_state = ['To renovate','To be done up','Good', 'Just renovated','As new']

In [285]:
state_val = categorical_df[['State_of_building']].values

In [286]:
encoder = OrdinalEncoder(categories=[sort_unique_state])

In [287]:
df['Encoded_state_of_building'] = encoder.fit_transform(state_val)
df= df.drop('State_of_building', axis=1)

In [288]:
df['Encoded_state_of_building'].value_counts()

Encoded_state_of_building
2.0    2044
0.0     755
4.0     723
1.0     444
3.0     214
Name: count, dtype: int64

In [289]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4180 entries, 0 to 5923
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Price                      4180 non-null   Int64  
 1   Number_of_bedrooms         4180 non-null   Int64  
 2   Living_area                4180 non-null   Int64  
 3   Number_of_facades          4180 non-null   Int64  
 4   landSurface                4180 non-null   Int64  
 5   Has_Assigned_City          4180 non-null   Int64  
 6   Brabant_Wallon             4180 non-null   bool   
 7   Brussels                   4180 non-null   bool   
 8   East Flanders              4180 non-null   bool   
 9   Flemish Brabant            4180 non-null   bool   
 10  Hainaut                    4180 non-null   bool   
 11  Limburg                    4180 non-null   bool   
 12  Liège                      4180 non-null   bool   
 13  Luxembourg                 4180 non-null   bool   
 1

In [290]:
df.isnull().sum()


Price                        0
Number_of_bedrooms           0
Living_area                  0
Number_of_facades            0
landSurface                  0
Has_Assigned_City            0
Brabant_Wallon               0
Brussels                     0
East Flanders                0
Flemish Brabant              0
Hainaut                      0
Limburg                      0
Liège                        0
Luxembourg                   0
Namur                        0
West Flanders                0
Encoded_epc                  0
Encoded_state_of_building    0
dtype: int64

In [291]:
df.var()

Price                        17070200689.827194
Number_of_bedrooms                     0.595333
Living_area                         2263.901476
Number_of_facades                      0.487711
landSurface                        53069.195301
Has_Assigned_City                      0.193725
Brabant_Wallon                         0.085976
Brussels                               0.021301
East Flanders                           0.21333
Flemish Brabant                        0.045785
Hainaut                                0.017163
Limburg                                0.017163
Liège                                  0.072329
Luxembourg                             0.053072
Namur                                  0.019467
West Flanders                          0.159463
Encoded_epc                             2.74806
Encoded_state_of_building              1.567198
dtype: Float64

**Splitting the dataset**

In [292]:
X = df.drop('Price', axis = 1)
y = df['Price'].values
print(type(X), type(y))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.arrays.integer.IntegerArray'>


In [293]:
print(X.shape)
print(y.shape)

(4180, 17)
(4180,)


**Splitting in train and test data**

In [294]:

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [295]:
#testing a first regression model (LinearRegression) without scaling

reg_all = LinearRegression()
reg_all.fit(X_train, y_train)
y_pred = reg_all.predict(X_test)

In [296]:
reg_all.score(X_test,y_test)

0.683274149593218

In [297]:
print(root_mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))


73001.16548609865
0.683274149593218


**CENTRING AND SCALING**

In [298]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(np.mean(X), np.std(X))
print(np.mean(X_train), np.std(X_train))

34.430410920349004 Number_of_bedrooms             0.771486
Living_area                   47.574782
Number_of_facades               0.69828
landSurface                  230.339965
Has_Assigned_City               0.44009
Brabant_Wallon                 0.293182
Brussels                       0.145933
East Flanders                  0.461822
Flemish Brabant                0.213948
Hainaut                        0.130993
Limburg                        0.130993
Liège                          0.268908
Luxembourg                     0.230346
Namur                          0.139507
West Flanders                  0.399281
Encoded_epc                    1.657529
Encoded_state_of_building      1.251728
dtype: Float64
3.7496978034061014e-18 1.0


  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


**REGRESSION MODELS**

**Linear Regression**

In [299]:
reg = LinearRegression()

*Cross-validation*

In [300]:
#cross-validation peformance
kf = KFold(n_splits=6, shuffle=True, random_state=42)
cv_results = cross_val_score(reg, X_train, y_train, cv=kf)
print(cv_results)
print(np.mean(cv_results), np.std(cv_results))
print(np.quantile(cv_results,[0.025,0.075]))

[0.64679897 0.60293149 0.63175918 0.64311691 0.62836449 0.63375199]
0.6311205026179939 0.014137874174133109
[0.60611061 0.61246886]


*Train the Model on the Training Set*

In [301]:
reg.fit(X_train, y_train)


In [302]:
reg.score(X_train,y_train)

0.6365859283049424

*Evaluate Model on the Test Set*

In [303]:
y_pred = reg.predict(X_test)
reg.score(X_test,y_test)

0.6832741495932182

In [304]:
print(root_mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

73001.1654860986
0.6832741495932182


In [305]:
feature_weights = pd.DataFrame({
    'Feature': X.columns,
    'Weight': reg.coef_
})


print(feature_weights)

                      Feature        Weight
0          Number_of_bedrooms   8162.598878
1                 Living_area  35967.400947
2           Number_of_facades  22312.086091
3                 landSurface  26257.587449
4           Has_Assigned_City -15957.855517
5              Brabant_Wallon   5837.860957
6                    Brussels  24535.483780
7               East Flanders -21953.142564
8             Flemish Brabant   -947.149980
9                     Hainaut -17211.165612
10                    Limburg -10276.532132
11                      Liège -27801.103780
12                 Luxembourg -21356.894184
13                      Namur -17891.287870
14              West Flanders -31079.064750
15                Encoded_epc  47164.980902
16  Encoded_state_of_building  18767.889616


**Ridge**

*Cross-validation*

In [306]:
ridge = Ridge(alpha=1.0)

In [307]:
#cross-validation peformance
kf = KFold(n_splits=6, shuffle=True, random_state=42)
cv_results = cross_val_score(ridge, X_train, y_train, cv=kf)
print(cv_results)
print(np.mean(cv_results), np.std(cv_results))
print(np.quantile(cv_results,[0.025,0.075]))

[0.64680784 0.60296008 0.63175851 0.64311237 0.62837503 0.63372313]
0.6311228268770487 0.014128133852073083
[0.60613695 0.61249069]


*Train the Model on the Training Set*

In [308]:
ridge.fit(X_train, y_train)


In [309]:
ridge.score(X_train,y_train)

0.6365858375963593

*Evaluate Model on the Test Set*

In [310]:
y_pred = ridge.predict(X_test)
ridge.score(X_test,y_test)

0.6832895043063877

In [311]:
print(root_mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

72999.39593438491
0.6832895043063877


*Hyper parameter tuning*

In [312]:
param_grid = {'alpha': np.linspace(0.0001,1,10), 'solver':['auto','svd','cholesky','lsqr','sparse_cg','sag','saga']}
rscv = RandomizedSearchCV(ridge, param_grid, cv = kf, n_iter=2)
rscv.fit(X_train, y_train)
print("Best Parameters:", rscv.best_params_)
print("Best Cross-Validation Score:", rscv.best_score_)

Best Parameters: {'solver': 'sag', 'alpha': np.float64(0.11120000000000001)}
Best Cross-Validation Score: 0.6311241199381186


In [313]:
best_model = rscv.best_estimator_


In [314]:
best_model.score(X_train,y_train)

0.6365859176664033

In [315]:
y_pred = best_model.predict(X_test)
best_model.score(X_test,y_test)

0.6832767613601525

In [316]:
print(root_mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

73000.86449642095
0.6832767613601525


In [317]:
feature_weights = pd.DataFrame({
    'Feature': X.columns,
    'Weight': best_model.coef_
})


print(feature_weights)

                      Feature        Weight
0          Number_of_bedrooms   8162.055999
1                 Living_area  35969.218537
2           Number_of_facades  22307.945794
3                 landSurface  26256.119773
4           Has_Assigned_City -15958.649864
5              Brabant_Wallon   5841.136930
6                    Brussels  24533.728461
7               East Flanders -21948.680897
8             Flemish Brabant   -945.167907
9                     Hainaut -17208.270622
10                    Limburg -10274.164712
11                      Liège -27794.836382
12                 Luxembourg -21351.301170
13                      Namur -17898.790793
14              West Flanders -31071.757409
15                Encoded_epc  47162.835723
16  Encoded_state_of_building  18765.013878


**Lasso**

In [318]:
lasso = Lasso(alpha=1)

*Cross-validation*

In [319]:
#cross-validation peformance
kf = KFold(n_splits=6, shuffle=True, random_state=42)
cv_results = cross_val_score(lasso, X_train, y_train, cv=kf)
print(cv_results)
print(np.mean(cv_results), np.std(cv_results))
print(np.quantile(cv_results,[0.025,0.075]))

[0.64680026 0.60293391 0.63175711 0.64311699 0.62836766 0.63374946]
0.6311208986311619 0.014137123327055513
[0.60611313 0.61247156]


*Train the Model on the Training Set*

In [320]:
lasso.fit(X_train, y_train)


In [321]:
lasso.score(X_train,y_train)

0.6365859263929415

*Evaluate Model on the Test Set*

In [322]:
y_pred = lasso.predict(X_test)
lasso.score(X_test,y_test)

0.6832752261226365

In [323]:
print(root_mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

73001.04142301869
0.6832752261226365


*Hyper parameter tuning*

In [324]:
param_grid = {'alpha': np.linspace(0.0001, 1, 10)}
rscv = RandomizedSearchCV(lasso, param_distributions=param_grid, cv=kf, n_iter=10, random_state=42)
rscv.fit(X_train, y_train)
print("Best Parameters:", rscv.best_params_)
print("Best Cross-Validation Score:", rscv.best_score_)

Best Parameters: {'alpha': np.float64(1.0)}
Best Cross-Validation Score: 0.6311208986311619


In [325]:
best_model = rscv.best_estimator_


In [326]:
best_model.score(X_train,y_train)

0.6365859263929415

In [327]:
y_pred = best_model.predict(X_test)
best_model.score(X_test,y_test)

0.6832752261226365

In [328]:
print(root_mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

73001.04142301869
0.6832752261226365


In [329]:
feature_weights = pd.DataFrame({
    'Feature': X.columns,
    'Weight': best_model.coef_
})


print(feature_weights)

                      Feature        Weight
0          Number_of_bedrooms   8162.025004
1                 Living_area  35967.514138
2           Number_of_facades  22310.872444
3                 landSurface  26256.446600
4           Has_Assigned_City -15956.904524
5              Brabant_Wallon   5840.441768
6                    Brussels  24536.151924
7               East Flanders -21946.938689
8             Flemish Brabant   -943.500891
9                     Hainaut -17208.725603
10                    Limburg -10273.892135
11                      Liège -27796.899855
12                 Luxembourg -21353.167802
13                      Namur -17888.640819
14              West Flanders -31073.617281
15                Encoded_epc  47164.636266
16  Encoded_state_of_building  18767.220563


**ELASTICNET**

In [330]:
elas = ElasticNet(alpha=1)

In [331]:
#cross-validation peformance
kf = KFold(n_splits=6, shuffle=True, random_state=42)
cv_results = cross_val_score(elas, X_train, y_train, cv=kf)
print(cv_results)
print(np.mean(cv_results), np.std(cv_results))
print(np.quantile(cv_results,[0.025,0.075]))

[0.59287397 0.57518823 0.57308988 0.58575028 0.57785025 0.569154  ]
0.5789844358280332 0.008018483874278602
[0.56964599 0.57062996]


In [332]:
elas.fit(X_train, y_train)

In [333]:
elas.score(X_train,y_train)

0.5830062161103005

In [334]:
y_pred = elas.predict(X_test)
elas.score(X_test,y_test)

0.6307410667657707

In [335]:
print(root_mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

78823.10593113455
0.6307410667657707


In [336]:
param_distributions = {'alpha': uniform(0.01, 10),'l1_ratio': uniform(0, 1)}
rscv = RandomizedSearchCV(lasso, param_distributions=param_grid, cv=kf, n_iter=10, random_state=42)
rscv.fit(X_train, y_train)
print("Best Parameters:", rscv.best_params_)
print("Best Cross-Validation Score:", rscv.best_score_)

Best Parameters: {'alpha': np.float64(1.0)}
Best Cross-Validation Score: 0.6311208986311619


In [337]:
best_model = rscv.best_estimator_

In [338]:
best_model.score(X_train,y_train)

0.6365859263929415

In [339]:
y_pred = best_model.predict(X_test)
best_model.score(X_test,y_test)

0.6832752261226365

In [340]:
print(root_mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

73001.04142301869
0.6832752261226365


In [341]:
feature_weights = pd.DataFrame({
    'Feature': X.columns,
    'Weight': best_model.coef_
})


print(feature_weights)

                      Feature        Weight
0          Number_of_bedrooms   8162.025004
1                 Living_area  35967.514138
2           Number_of_facades  22310.872444
3                 landSurface  26256.446600
4           Has_Assigned_City -15956.904524
5              Brabant_Wallon   5840.441768
6                    Brussels  24536.151924
7               East Flanders -21946.938689
8             Flemish Brabant   -943.500891
9                     Hainaut -17208.725603
10                    Limburg -10273.892135
11                      Liège -27796.899855
12                 Luxembourg -21353.167802
13                      Namur -17888.640819
14              West Flanders -31073.617281
15                Encoded_epc  47164.636266
16  Encoded_state_of_building  18767.220563


**RANDOM FORREST**

In [342]:
reg_rf = RandomForestRegressor(n_estimators=10, random_state=0, oob_score=True)



In [343]:
#cross-validation peformance
kf = KFold(n_splits=6, shuffle=True, random_state=42)
cv_results = cross_val_score(reg_rf, X_train, y_train, cv=kf)
print(cv_results)
print(np.mean(cv_results), np.std(cv_results))
print(np.quantile(cv_results,[0.025,0.075]))

  warn(
  warn(
  warn(
  warn(
  warn(


[0.59745507 0.51618367 0.53628539 0.48607291 0.59174621 0.54555663]
0.5455499807683019 0.039399587700113856
[0.48983676 0.49736445]


  warn(


In [344]:
reg_rf.fit(X_train, y_train)

  warn(


In [345]:
# Access the OOB Score
oob_score = reg_rf.oob_score_
print(f'Out-of-Bag Score: {oob_score}')

# Making predictions on the same data or new data
y_pred = reg_rf.predict(X_test)

# Evaluating the model
mse = root_mean_squared_error(y_test, y_pred)
print(f'Root Mean Squared Error: {mse}')

r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')


Out-of-Bag Score: 0.3798289702772686
Root Mean Squared Error: 78383.5786601926
R-squared: 0.6348476512176268


In [346]:
# Save data to new csv file

output_csv = r'..\data\clean\model_training.csv'  # Fill your path to file
df.to_csv(output_csv, index=False)


# Save data to new pkl file


output_pkl = r'..\data\clean\model_training.pkl' # Fill your path to file
with open(output_pkl, 'wb') as f:
    pickle.dump(df, f)