In [1]:
import models
import pandas as pd
from sklearn.pipeline import make_pipeline
from preprocess import TypeConverter, KBinsDiscretizerWithNames, OneHotEncoderOnlyCategorical, CreateInteractions
from splitter import StratifiedRegressionSplit
from sklearn.metrics import r2_score
import numpy as np

In [2]:
# Load the covariate description sheet and the data set itself.
variables = pd.read_excel('../data/covariates.xlsx')
data = pd.read_excel('../data/hp_ljubljana_new_with_rooms.xlsx')

# One dict per row of the covariates sheet; the keys used below are
# 'name', 'type' and 'group'.
variables = variables.to_dict(orient='records')

# Translate the textual type labels into Python types. Any label other than
# 'int' or 'float' falls back to str.
type_by_label = {'int': int, 'float': float}

target = None
features = []
for feature in variables:
    feature['type'] = type_by_label.get(feature['type'], str)
    if feature['group'] == 'target':
        target = feature
    else:
        features.append(feature)

# Fail early with a readable message instead of an obscure TypeError when the
# covariates sheet contains no row whose group is 'target'.
if target is None:
    raise ValueError("no variable with group == 'target' found in the covariates sheet")

X = data[[feature['name'] for feature in features]]
y = data[target['name']]

# PREPROCESS PIPELINE
# Steps, in order: type conversion driven by the covariate records, two-bin
# quantile discretisation with dense one-hot output, then one-hot encoding
# via the project's OneHotEncoderOnlyCategorical.
preprocc_pip = make_pipeline(
    TypeConverter(variables=features),
    KBinsDiscretizerWithNames(
        encode='onehot-dense',
        strategy='quantile',
        random_state=0,
        n_bins=2),
    OneHotEncoderOnlyCategorical())
preprocc_pip.fit(X)
# X is rebound to the pipeline output; the raw columns remain available in `data`.
X = preprocc_pip.transform(X)






In [8]:
# Fit the project's CreateInteractions transformer on the preprocessed frame
# and apply it to the same frame. The result is stored under a separate name
# (X_t) so that X itself is left untouched.
ci = CreateInteractions()
ci.fit(X)
X_t = ci.transform(X)

In [10]:
# Show the column labels of X_t as a plain Python list.
list(X_t.columns)

['dist_airport',
 'dist_highway_entr',
 'dist_main_roads',
 'dist_public_transport',
 'dist_recreation',
 'dist_regional_roads',
 'dist_river',
 'dist_schools',
 'construct_age',
 'facade_age',
 'inst_age',
 'roof_age',
 'windows_age',
 'dist_highway',
 'dist_railway',
 'elevation',
 'easting',
 'id_building',
 'id_cadas_com',
 'northing',
 'floor_above_ground',
 'floor_appartment',
 'floor_entrance',
 'floors_total',
 'living_area',
 'no_appart',
 'no_rooms',
 'total_area',
 'dist_airport_[11175.8671875, 18748.06640625)',
 'dist_airport_[18748.06640625, 25051.1484375)',
 'dist_highway_entr_[100.0, 1431.78210449219)',
 'dist_highway_entr_[1431.78210449219, 4341.65869140625)',
 'dist_main_roads_[0.0, 200.0)',
 'dist_main_roads_[200.0, 4110.9609375)',
 'dist_public_transport_[0.0, 200.0)',
 'dist_public_transport_[200.0, 4134.00537109375)',
 'dist_recreation_[100.0, 1044.03063964844)',
 'dist_recreation_[1044.03063964844, 5280.1513671875)',
 'dist_regional_roads_[0.0, 2061.552734375)',
 

In [3]:
# Split the columns of X by dtype: object-dtype columns are treated as
# categorical, every other dtype as numerical.
column_dtypes = X.dtypes.to_dict()
object_dtype = np.dtype('O')
categorical_features = {n: t for n, t in column_dtypes.items() if t == object_dtype}
numerical_features = {n: t for n, t in column_dtypes.items() if t != object_dtype}
cat_cols = list(categorical_features)
num_cols = list(numerical_features)

# Separate numerical columns into binary (every value within 1e-3 of 0 or 1)
# and non-binary ones.
bin_num_cols = []
nonbin_num_cols = []
for num_col in num_cols:
    values = X[num_col]
    # A value breaks the 0/1 pattern when it is farther than 1e-3 from both
    # 0 and 1. NaN compares False on both sides, so missing values do not
    # disqualify a column. The check is done on the whole column at once
    # rather than value by value.
    off_pattern = (np.abs(1 - values) > 1e-3) & (np.abs(values) > 1e-3)
    if off_pattern.any():
        nonbin_num_cols.append(num_col)
    else:
        bin_num_cols.append(num_col)

# dtype lookups restricted to each group, in the original column order.
binary_numerical_features = {n: numerical_features[n] for n in bin_num_cols}
nonbinary_numerical_features = {n: numerical_features[n] for n in nonbin_num_cols}

feature_names_in_ = np.array(X.columns, dtype=object)
n_features_in_ = len(feature_names_in_)
# NOTE(review): the trailing-underscore names and the original `# return self`
# remark suggest this cell prototypes the body of a transformer's fit() — confirm.

In [5]:
# Show the numerical columns that contain at least one value away from both 0 and 1.
nonbin_num_cols

['dist_airport',
 'dist_highway_entr',
 'dist_main_roads',
 'dist_public_transport',
 'dist_recreation',
 'dist_regional_roads',
 'dist_river',
 'dist_schools',
 'construct_age',
 'facade_age',
 'inst_age',
 'roof_age',
 'windows_age',
 'dist_highway',
 'dist_railway',
 'elevation',
 'easting',
 'id_building',
 'id_cadas_com',
 'northing',
 'floor_above_ground',
 'floor_appartment',
 'floor_entrance',
 'floors_total',
 'living_area',
 'no_appart',
 'no_rooms',
 'total_area']

In [8]:
# Collect a column-name -> dtype mapping: first the categorical columns, then
# the binary numerical columns, then float32 for every name returned by
# get_dicr_names_out().
# NOTE(review): this cell cannot run as written in a notebook.
#   * `get_dicr_names_out` is neither defined in a visible cell nor imported —
#     presumably a method of the discretizer class this was copied from; confirm.
#   * `super()` without arguments only works inside a method of a class; at
#     cell level it raises RuntimeError.
#   * The dict shown below this cell lists float64 dtypes while the code
#     assigns float32, so the displayed output looks stale — confirm.
type_dict = {}
type_dict.update(categorical_features)
type_dict.update(binary_numerical_features)
type_dict.update(
    {f: np.dtype('float32') for f in get_dicr_names_out()}
)
disc_tt = super().transform(X[num_cols])

{'dist_airport_[11175.8671875, 18748.06640625)': dtype('float64'),
 'dist_airport_[18748.06640625, 25051.1484375)': dtype('float64'),
 'dist_highway_entr_[100.0, 1431.78210449219)': dtype('float64'),
 'dist_highway_entr_[1431.78210449219, 4341.65869140625)': dtype('float64'),
 'dist_main_roads_[0.0, 200.0)': dtype('float64'),
 'dist_main_roads_[200.0, 4110.9609375)': dtype('float64'),
 'dist_public_transport_[0.0, 200.0)': dtype('float64'),
 'dist_public_transport_[200.0, 4134.00537109375)': dtype('float64'),
 'dist_recreation_[100.0, 1044.03063964844)': dtype('float64'),
 'dist_recreation_[1044.03063964844, 5280.1513671875)': dtype('float64'),
 'dist_regional_roads_[0.0, 2061.552734375)': dtype('float64'),
 'dist_regional_roads_[2061.552734375, 3676.95532226562)': dtype('float64'),
 'dist_river_[0.0, 583.09521484375)': dtype('float64'),
 'dist_river_[583.09521484375, 2308.67919921875)': dtype('float64'),
 'dist_schools_[0.0, 200.0)': dtype('float64'),
 'dist_schools_[200.0, 3600.0)': 

In [6]:
# Names of all interaction terms, each written as "<left> <right>".
# First: every non-binary numerical column paired with every binary column.
cols = [
    f"{numeric_name} {binary_name}"
    for numeric_name in nonbin_num_cols
    for binary_name in bin_num_cols
]
# Then: every unordered pair of distinct binary columns, keeping list order.
cols += [
    f"{first_name} {second_name}"
    for position, first_name in enumerate(bin_num_cols)
    for second_name in bin_num_cols[position + 1:]
]


In [7]:
# Show the generated interaction-term names.
cols

['dist_airport dist_airport_[11175.8671875, 18748.06640625)',
 'dist_airport dist_airport_[18748.06640625, 25051.1484375)',
 'dist_airport dist_highway_entr_[100.0, 1431.78210449219)',
 'dist_airport dist_highway_entr_[1431.78210449219, 4341.65869140625)',
 'dist_airport dist_main_roads_[0.0, 200.0)',
 'dist_airport dist_main_roads_[200.0, 4110.9609375)',
 'dist_airport dist_public_transport_[0.0, 200.0)',
 'dist_airport dist_public_transport_[200.0, 4134.00537109375)',
 'dist_airport dist_recreation_[100.0, 1044.03063964844)',
 'dist_airport dist_recreation_[1044.03063964844, 5280.1513671875)',
 'dist_airport dist_regional_roads_[0.0, 2061.552734375)',
 'dist_airport dist_regional_roads_[2061.552734375, 3676.95532226562)',
 'dist_airport dist_river_[0.0, 583.09521484375)',
 'dist_airport dist_river_[583.09521484375, 2308.67919921875)',
 'dist_airport dist_schools_[0.0, 200.0)',
 'dist_airport dist_schools_[200.0, 3600.0)',
 'dist_airport construct_age_[0.0, 13327.0)',
 'dist_airport c

In [34]:
# Build the interaction features as element-wise products of columns of X.
# The products are collected first and the frame is created once on X's own
# index. A zero-filled frame with a default RangeIndex would make each column
# assignment align on index labels, which yields NaN or misaligned rows
# whenever X is not indexed 0..n-1.
interaction_terms = {}

# Non-binary numerical column x binary column.
for num_col in nonbin_num_cols:
    for cat_col in bin_num_cols:
        interaction_terms[f"{num_col} {cat_col}"] = X[num_col] * X[cat_col]

# Binary column x binary column, each unordered pair once.
for i in range(len(bin_num_cols)):
    for j in range(i + 1, len(bin_num_cols)):
        interaction_terms[f"{bin_num_cols[i]} {bin_num_cols[j]}"] = X[bin_num_cols[i]] * X[bin_num_cols[j]]

# Column order follows insertion order, i.e. the loop order above.
X_t = pd.DataFrame(interaction_terms, index=X.index)
           

In [33]:
# NOTE(review): `df` is not assigned in any visible cell — presumably an
# earlier name for the interaction frame (X_t); confirm or rename.
df.columns


Index(['dist_airport dist_airport_[11175.8671875, 18748.06640625)',
       'dist_airport dist_airport_[18748.06640625, 25051.1484375)',
       'dist_airport dist_highway_entr_[100.0, 1431.78210449219)',
       'dist_airport dist_highway_entr_[1431.78210449219, 4341.65869140625)',
       'dist_airport dist_main_roads_[0.0, 200.0)',
       'dist_airport dist_main_roads_[200.0, 4110.9609375)',
       'dist_airport dist_public_transport_[0.0, 200.0)',
       'dist_airport dist_public_transport_[200.0, 4134.00537109375)',
       'dist_airport dist_recreation_[100.0, 1044.03063964844)',
       'dist_airport dist_recreation_[1044.03063964844, 5280.1513671875)',
       ...
       'postion_type_attics postion_type_basement',
       'postion_type_attics postion_type_floor',
       'postion_type_attics postion_type_ground_floor',
       'postion_type_attics postion_type_other',
       'postion_type_basement postion_type_floor',
       'postion_type_basement postion_type_ground_floor',
       'pos

In [16]:
# Scratch check: zero matrix with one row per row of X and one column per
# (non-binary column, binary column) pair. The result is displayed, not stored.
np.zeros((len(X), len(nonbin_num_cols) * len(bin_num_cols)))

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [19]:
# Number of unordered pairs of binary columns, n*(n-1)/2. True division is
# used, so the value is displayed as a float.
(len(bin_num_cols)*(len(bin_num_cols) - 1))/2

2850.0

In [27]:
# NOTE(review): `df` is not assigned in any visible cell — presumably an
# earlier name for the interaction frame (X_t); confirm or rename.
df

Unnamed: 0,"dist_airport dist_airport_[11175.8671875, 18748.06640625)","dist_airport dist_airport_[18748.06640625, 25051.1484375)","dist_airport dist_highway_entr_[100.0, 1431.78210449219)","dist_airport dist_highway_entr_[1431.78210449219, 4341.65869140625)","dist_airport dist_main_roads_[0.0, 200.0)","dist_airport dist_main_roads_[200.0, 4110.9609375)","dist_airport dist_public_transport_[0.0, 200.0)","dist_airport dist_public_transport_[200.0, 4134.00537109375)","dist_airport dist_recreation_[100.0, 1044.03063964844)","dist_airport dist_recreation_[1044.03063964844, 5280.1513671875)",...,postion_type_attics postion_type_basement,postion_type_attics postion_type_floor,postion_type_attics postion_type_ground_floor,postion_type_attics postion_type_other,postion_type_basement postion_type_floor,postion_type_basement postion_type_ground_floor,postion_type_basement postion_type_other,postion_type_floor postion_type_ground_floor,postion_type_floor postion_type_other,postion_type_ground_floor postion_type_other
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5684,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
