# Extract features from the `params` column using the project scripts in `src`

In [1]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from langdetect import detect # language detection
import numpy as np # linear algebra
import tqdm # progress bar
import re # regular expressions
import pickle as pkl # saving and loading pythonic data

## Import custom libraries

In [2]:
# Make the project's `src` package importable before importing from it.
# NOTE(review): this relies on the kernel's working directory being one level
# below the directory that contains `src` — confirm how the notebook is launched.
import sys, os # interpreter import path and working-directory lookup
sys.path.append(os.getcwd() + "/../") # append the parent of the current working directory to the import path
from src.data_preprocessing import DataPreprocessing # project class used below to load and preprocess the ads data
from src.params_parser import ParamsParser # project class that parses the raw `params` column

## Load the data

In [3]:
# First pass: build the preprocessing object with `get_params_from_params`
# switched off, so the raw `params` column is still present in `dp.X` and can
# be handed to the parser separately further down.
dp = DataPreprocessing(
    df_path="../data/real_estate_ads_2022_10.csv",
    train_indices_path="../data/train_indices.npy",
    test_indices_path="../data/test_indices.npy",
    get_params_from_params=False,
)

In [4]:
# List the feature columns of the first-pass frame; the raw `params` column is
# expected to be among them because it is parsed separately below.
dp.X.columns

Index(['market', 'district_lon', 'district_lat', 'params', 'no_rooms', 'm',
       'map_lon', 'map_lat', 'pca_tfidf_titles_0', 'pca_tfidf_titles_1',
       'pca_tfidf_titles_2', 'pca_tfidf_titles_3', 'pca_tfidf_titles_4',
       'pls2_tfidf_titles_0', 'pls2_tfidf_titles_1',
       'pca_tfidf_descriptions_0', 'pca_tfidf_descriptions_1',
       'pca_tfidf_descriptions_2', 'pca_tfidf_descriptions_3',
       'pca_tfidf_descriptions_4', 'pls2_tfidf_descriptions_0',
       'pls2_tfidf_descriptions_1', 'pca_bert_titles_0', 'pca_bert_titles_1',
       'pca_bert_titles_2', 'pca_bert_titles_3', 'pca_bert_titles_4',
       'pls2_bert_titles_0', 'pls2_bert_titles_1', 'title_len',
       'description_len', 'title_words', 'description_words',
       'title_uppercase', 'description_uppercase', 'unique_words_title',
       'unique_words_description', 'average_word_length_title',
       'average_word_length_description', 'total_number_presence_title',
       'total_number_presence_description', 'create

## Get the params feature

In [5]:
import importlib
import src.params_parser

# Debugging aid: reload the module so that edits to the ParamsParser class are
# picked up without restarting the kernel and re-running the whole notebook.
importlib.reload(src.params_parser)

# Wrap the raw `params` column in its own single-column frame and parse it.
raw_params_column = dp.X["params"]
params = pd.DataFrame(raw_params_column)
params_parser = src.params_parser.ParamsParser(params, verbose=True)

# Show the parsed result as the cell output.
params_parser.params

New features added from params :)
param_price cleaned
param_rent cleaned
Ugly param m dropped
Ugly param location dropped
Ugly param roofing dropped
Ugly param remote_services dropped
Ugly param recreational dropped
Ugly params dropped
Lists converted to floats
Params removed:  ['param_price', 'param_rent', 'param_market', 'param_price_per_m', 'param_rooms_num']
Repeated columns removed
Media types cable television corrected
One hot encoding done
Params column dropped
Types transformed


Unnamed: 0,param_free_from,param_floors_num,param_terrain_area,param_floor_no,param_building_floors_num,param_is_bungalow,param_build_year,param_security_types_alarm,param_security_types_anti_burglary_door,param_security_types_closed_area,...,param_building_material_cellular_concrete,param_building_material_concrete,param_building_material_concrete_plate,param_building_material_hydroton,param_building_material_other,param_building_material_reinforced_concrete,param_building_material_silikat,param_building_material_wood,param_construction_status_to_completion,param_construction_status_to_renovation
0,NaT,3.0,,,3.0,,1920.0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
1,NaT,4.0,,,4.0,,,0,0,0,...,False,False,False,False,False,False,False,False,False,False
2,NaT,4.0,,,4.0,,1980.0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
3,NaT,4.0,,,4.0,,1965.0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
4,NaT,4.0,,,4.0,,1936.0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73640,2019-07-31,4.0,,,4.0,,,0,0,0,...,False,False,False,False,False,False,False,False,False,False
73641,NaT,4.0,,,4.0,,,0,0,0,...,False,False,False,False,False,False,False,False,True,False
73642,NaT,4.0,,,4.0,,1960.0,0,0,0,...,False,False,False,False,False,False,False,False,True,False
73643,NaT,18.0,,,18.0,,2005.0,0,0,0,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# Print, one per line, every parsed column whose name contains "free".
matching_columns = (name for name in params_parser.params.columns if "free" in name)
for name in matching_columns:
    print(name)

param_free_from


## Use the data preprocessing pipeline

In [14]:
import importlib
import src.data_preprocessing

# Debugging aid: reload the module so that edits to the preprocessing code are
# picked up without restarting the kernel.
importlib.reload(src.data_preprocessing)

# Second pass: run the whole pipeline with `get_params_from_params` enabled and
# progress messages turned on. This rebinds `dp`, replacing the first-pass
# object created earlier in the notebook.
dp = src.data_preprocessing.DataPreprocessing(
    df_path="../data/real_estate_ads_2022_10.csv",
    train_indices_path="../data/train_indices.npy",
    test_indices_path="../data/test_indices.npy",
    get_params_from_params=True,
    verbose=True,
)

Data loaded
The total number of columns is 12
Params parsed. New columns added
The total number of columns is 91
TF-IDF embeddings added
The total number of columns is 105
BERT embeddings added
The total number of columns is 112
Textual features added
The total number of columns is 124
Text columns dropped
The total number of columns is 122
Time features transformed
The total number of columns is 132
Cyclic features transformed
The total number of columns is 136
Nonfrequent params removed
The total number of columns is 116
Missing params removed
The total number of columns is 110


In [15]:
# Display the feature frame produced by the full pipeline run above
# (pandas truncates the rendering to the first and last rows/columns).
dp.X

Unnamed: 0,market,district_lon,district_lat,no_rooms,m,map_lon,map_lat,param_floors_num,param_building_floors_num,param_security_types_alarm,...,updated_at_day,duration_of_update,created_at_first_hour_sin,created_at_first_hour_cos,created_at_first_dayofweek_sin,created_at_first_dayofweek_cos,updated_at_hour_sin,updated_at_hour_cos,updated_at_dayofweek_sin,updated_at_dayofweek_cos
0,secondary,16.90502,52.41180,2,51.00,16.896592,52.410150,3.0,3.0,0,...,15,528104.0,-9.422609e-01,-0.334880,8.660254e-01,0.5,-9.790841e-01,0.203456,0.000000e+00,1.0
1,secondary,16.87466,52.41572,1,26.00,16.904410,52.411919,4.0,4.0,0,...,27,2744173.0,-9.976688e-01,-0.068242,1.224647e-16,-1.0,1.361666e-01,-0.990686,0.000000e+00,1.0
2,secondary,16.90502,52.41180,3,60.10,16.899310,52.413357,4.0,4.0,0,...,28,104836578.0,-9.422609e-01,-0.334880,1.224647e-16,-1.0,2.697968e-01,0.962917,-2.449294e-16,1.0
3,secondary,16.87466,52.41572,2,47.00,16.883608,52.410662,4.0,4.0,0,...,15,3554393.0,-1.361666e-01,-0.990686,1.224647e-16,-1.0,-9.422609e-01,-0.334880,8.660254e-01,-0.5
4,secondary,16.87466,52.41572,2,44.00,16.890796,52.412308,4.0,4.0,0,...,13,2939508.0,-6.310879e-01,-0.775711,-8.660254e-01,-0.5,-8.169699e-01,-0.576680,1.224647e-16,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73640,secondary,16.94080,52.38356,3,47.00,16.955818,52.389898,4.0,4.0,0,...,14,7777414.0,-2.697968e-01,0.962917,8.660254e-01,0.5,-2.449294e-16,1.000000,0.000000e+00,1.0
73641,secondary,16.94080,52.38356,2,43.70,16.947615,52.392968,4.0,4.0,0,...,7,1825464.0,1.361666e-01,-0.990686,8.660254e-01,-0.5,-6.310879e-01,-0.775711,8.660254e-01,-0.5
73642,secondary,16.90590,52.36596,2,45.32,16.904862,52.369679,4.0,4.0,0,...,23,557407.0,-3.984011e-01,-0.917211,8.660254e-01,0.5,0.000000e+00,1.000000,8.660254e-01,0.5
73643,secondary,16.94080,52.38356,1,36.30,16.980725,52.389168,18.0,18.0,0,...,17,93299047.0,-7.308360e-01,0.682553,8.660254e-01,-0.5,-9.976688e-01,-0.068242,-8.660254e-01,-0.5
