# Data Integration

In [1]:
import pandas as pd
import json
import os
from bs4 import BeautifulSoup
try:
    import pandas_read_xml as pdx
    import tabula
except:
    !pip install pandas_read_xml xlrd tabula-py
    import pandas_read_xml as pdx
import tabula
from functools import reduce
import numpy as np
from math import radians, cos, sin, asin, sqrt, atan2, pi 
import shapefile
from shapely.geometry import Point # Point class
from shapely.geometry import shape # shape() is a function to convert geo objects through the interface
from multiprocessing import Pool
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, Normalizer

In [2]:
r = 6378
# The length of a degree in the equator in kms
deg_len = (2 * pi * r) / 360


def distance(p1, p2):
    # distance in kilometers = euclidean distance * length of a degree in equator in kms
    return round(np.linalg.norm(p2 - p1) * deg_len, 3)

def between_times(time):
    time = [int(i) for i in time.split(':')]
    time_ = 3600 * time[0] + 60 * time[1] + time[2]
    if time_ > 3600 * 7 and time_ < 3600 * 9:
        return True
    return False

In [3]:
hospitals = open('29893909/hospitals.html')
hospitals = hospitals.read()
hospitals = pd.read_html(hospitals)[0].drop(['Unnamed: 0'], axis=1)

h_id = list(hospitals.id)
lat = [round(i, 6) for i in list(hospitals.lat)]
lng = [round(i, 6) for i in list(hospitals.lng)]
h_coordinates = [np.array((la, ln)) for la, ln in zip(lat, lng)]

In [4]:
supermarkets = pd.read_excel("29893909/supermarkets.xlsx").drop(['Unnamed: 0'], axis=1)
sm_id = list(supermarkets.id)
lat = [round(i, 6) for i in list(supermarkets.lat)]
lng = [round(i, 6) for i in list(supermarkets.lng)]
sm_coordinates = [np.array((la, ln)) for la, ln in zip(lat, lng)]

In [5]:
real_state = open('29893909/real_state.xml').read()[2:-1]
soup = BeautifulSoup(real_state)
tags = [item.name for item in soup.root.childGenerator()]
data = {}
for i, child in enumerate(soup.body.root.children):
    data[tags[i]] = []
    for grandchild in child.children:
        data[tags[i]].append(grandchild.text)
real_state_xml = pd.DataFrame(data, columns=tags)
real_state_json = pd.read_json('29893909/real_state.json')
real_state = real_state_json.append(real_state_xml)

In [6]:
shoppingcenters = tabula.read_pdf('29893909/shopingcenters.pdf', pages='all')
shoppingcenters = reduce(lambda a, b: a.append(b), shoppingcenters)

sc_id = list(shoppingcenters.sc_id)
lat = [round(i, 6) for i in list(shoppingcenters.lat)]
lng = [round(i, 6) for i in list(shoppingcenters.lng)]
sc_coordinates = [np.array((la, ln)) for la, ln in zip(lat, lng)]

In [7]:
stops = pd.read_csv("GTFS - Melbourne Train Information/stops.txt")
ts_id = list(stops.stop_id)
lat = [round(i, 6) for i in list(stops.stop_lat)]
lng = [round(i, 6) for i in list(stops.stop_lon)]
ts_coordinates = [np.array((la, ln)) for la, ln in zip(lat, lng)]

In [8]:
stop_times = pd.read_csv("GTFS - Melbourne Train Information/stop_times.txt")
calendar = pd.read_csv("GTFS - Melbourne Train Information/calendar.txt")
trips = pd.read_csv("GTFS - Melbourne Train Information/trips.txt")
routes = pd.read_csv("GTFS - Melbourne Train Information/routes.txt")

In [9]:
def in_weekdays(service_id):
    temp = 0
    for i in ["monday", "tuesday", "wednesday", "thursday", "friday"]: 
        temp += calendar[calendar.service_id==service_id][i].values[0]
    if temp > 0:
        return True
    return False

In [10]:
def closest(location, id, coords):
    distances = [distance(location, coord) for coord in coords]
    return id[distances.index(min(distances))], min(distances)

def coord(df, index):
    return np.array((
        df.loc[index, 'lat'], 
        df.loc[index, 'lng']
    ))

In [11]:
# Appending empty columns to the dataframe and populating them with default values
lrs = len(real_state)
real_state['suburb'] = ["not available"]*lrs
real_state['Shopping_center_id'] = ["not available"]*lrs
real_state['Distance_to_sc'] = [0]*lrs
real_state['Train_station_id'] = ["not available"]*lrs
real_state['Distance_to_train_station'] = [0]*lrs
real_state['travel_min_to_CBD'] = [0]*lrs
real_state['Transfer_flag'] = [-1]*lrs
real_state['Hospital_id'] = ["not available"]*lrs
real_state['Distance_to_hospital'] = [0]*lrs
real_state['Supermarket_id'] = ["not available"]*lrs
real_state['Distance_to_supermarket'] = [0]*lrs

In [12]:
%%time
# handling cases where the hour in time > 24
# super inefficient: change if there is time
    
for index in stop_times[stop_times.arrival_time.str.match("24:\d{1,2}:\d{1,2}")].index:
    time = stop_times.loc[index, 'arrival_time'].split(":")
    time = ":".join(["00", time[1], time[2]])
    stop_times.loc[index, 'arrival_time'] = time


for index in stop_times[stop_times.departure_time.str.match("24:\d{1,2}:\d{1,2}")].index:
    time = stop_times.loc[index, 'departure_time'].split(":")
    time = ":".join(["00", time[1], time[2]])
    stop_times.loc[index, 'departure_time'] = time
    
for index in stop_times[stop_times.arrival_time.str.match("25:\d{1,2}:\d{1,2}")].index:
    time = stop_times.loc[index, 'arrival_time'].split(":")
    time = ":".join(["01", time[1], time[2]])
    stop_times.loc[index, 'arrival_time'] = time
    
for index in stop_times[stop_times.departure_time.str.match("25:\d{1,2}:\d{1,2}")].index:
    time = stop_times.loc[index, 'departure_time'].split(":")
    time = ":".join(["01", time[1], time[2]])
    stop_times.loc[index, 'departure_time'] = time
    
for index in stop_times[stop_times.arrival_time.str.match("26:\d{1,2}:\d{1,2}")].index:
    time = stop_times.loc[index, 'arrival_time'].split(":")
    time = ":".join(["02", time[1], time[2]])
    stop_times.loc[index, 'arrival_time'] = time
    
for index in stop_times[stop_times.departure_time.str.match("26:\d{1,2}:\d{1,2}")].index:
    time = stop_times.loc[index, 'departure_time'].split(":")
    time = ":".join(["02", time[1], time[2]])
    stop_times.loc[index, 'departure_time'] = time
    
stop_times['departure_time'] = pd.to_datetime(stop_times['departure_time'], format='%H:%M:%S').dt.time
stop_times['arrival_time'] = pd.to_datetime(stop_times['arrival_time'], format='%H:%M:%S').dt.time 

CPU times: user 1min 9s, sys: 330 ms, total: 1min 9s
Wall time: 1min 10s


In [13]:
start = pd.to_datetime("07:00:00", format="%H:%M:%S").time()
end = pd.to_datetime("09:00:00", format="%H:%M:%S").time()

stop_times_7_9 = stop_times[(stop_times['departure_time'] > start) & (stop_times['departure_time'] < end) & 
           (stop_times['arrival_time'] > start) &  (stop_times['arrival_time'] < end)]

In [14]:
trips_flinders_street = trips[trips.trip_headsign=='City (Flinders Street)']

In [15]:
# Transfer_flag
transfer_flag = {}
stop_times_trips = pd.merge(left=stop_times_7_9, right=trips_flinders_street, on='trip_id')
for stop in list(stops.stop_id):
    if not stop_times_trips[stop_times_trips.stop_id == stop].empty:
        transfer_flag[stop] = 1
    else:
        transfer_flag[stop] = 0

In [16]:
# Suburb
shp = shapefile.Reader('vic_suburb_boundary/VIC_LOCALITY_POLYGON_shp.dbf') #open the shapefile
all_shapes = shp.shapes() # get all the polygons
all_records = shp.records()     

def get_suburb(inp):
#     point, property_id = inp
    try:
        property_id, point = inp
        for i in range(len(all_shapes)):
            boundary = all_shapes[i] # get a boundary polygon
            if Point(point).within(shape(boundary)):
                return (property_id, all_records[i].as_dict()['VIC_LOCA_2'])
        return ("", "")
    except:
        pass

In [17]:
if 'prop2sub.json' not in os.listdir("."):
    lat = list(real_state.lat)
    lng = list(real_state.lng)
    coords = [(i, j) for i, j in zip(lng, lat)]
    property_ids = list(real_state.property_id)
    inp = [(pid, coord) for pid, coord in zip(property_ids, coords)]
    
    with Pool(4) as p:
        property2suburb = p.map(get_suburb, inp)
else:
    with open('prop2sub.json', 'r') as f:
        property2suburb = json.load(f)
    prop2sub = {}
    for i in property2suburb:
        try:
            prop2sub[i[0]] = i[1]
        except:
            pass

In [18]:
sto = pd.merge(left=stop_times_7_9, right=trips_flinders_street, on='trip_id')

def travel_min_to_CBD(stop_id):
    times = []
    ts = sto[sto.stop_id == stop_id].trip_id.unique()
    for t in ts:
        try:
            st_time = sto[(sto.trip_id == t) & (sto.stop_id == stop_id)].departure_time
            re_time = sto[(sto.trip_id == t) & (sto.stop_id == 19854)].departure_time
            re_time = pd.to_datetime(re_time, format='%H:%M:%S').values[0]
            st_time = pd.to_datetime(st_time, format='%H:%M:%S').values[0]
            times.append(pd.Timedelta(re_time - st_time).seconds / 60)
        except Exception as e:
            pass
    try:
        return (stop_id, round(sum(times) / len(times), 2))
    except:
        pass


In [19]:
%%time
stop_ids = list(stops.stop_id.unique())
with Pool(4) as p:
    stopid2cbd = p.map(travel_min_to_CBD, stop_ids)

CPU times: user 33 ms, sys: 71.9 ms, total: 105 ms
Wall time: 40 s


In [20]:
stid2cbd = {}
for i in stopid2cbd:
    try:
        stid2cbd[i[0]] = i[1]
    except:
        pass

In [39]:
%%time
for index in real_state.index:
    try:
        lat = round(real_state.loc[index, 'lat'].values[0], 4)
        lng = round(real_state.loc[index, 'lng'].values[0], 4)
        real_state.loc[index, 'lat'] = lat
        real_state.loc[index, 'lng'] = lng
    except:
        lat = round(real_state.loc[index, 'lat'], 4)
        lng = round(real_state.loc[index, 'lng'], 4)
        real_state.loc[index, 'lat'] = lat
        real_state.loc[index, 'lng'] = lng

CPU times: user 2.33 s, sys: 3.95 ms, total: 2.34 s
Wall time: 2.34 s


In [None]:
%%time
for index in real_state.index:
    try:
        try:
            lat = real_state.loc[index, 'lat'].values[0]
            lng = real_state.loc[index, 'lng'].values[0]
            location = np.array((lat, lng))
        except:
            lat = real_state.loc[index, 'lat']
            lng = real_state.loc[index, 'lng']
            location = np.array((lat, lng))
            print(type(lat))
            
        try:
            real_state.loc[index, 'suburb'] = prop2sub[real_state.loc[index, 'property_id'].values[0]]
        except Exception as e:
            print(e)
        
        id, dist = closest(location, sc_id, sc_coordinates)
        real_state.loc[index, 'Shopping_center_id'] = id
        real_state.loc[index, 'Distance_to_sc'] = dist
        
        id, dist = closest(location, ts_id, ts_coordinates)
        real_state.loc[index, 'Train_station_id'] = id
        real_state.loc[index, 'Distance_to_train_station'] = dist
        
        real_state.loc[index, 'Transfer_flag'] = transfer_flag[id]
        try:
            real_state.loc[index, 'travel_min_to_CBD'] = stid2cbd[id]
        except:
            real_state.loc[index, 'travel_min_to_CBD'] = 0
            
        
        id, dist = closest(location, h_id, h_coordinates)
        real_state.loc[index, 'Hospital_id'] = id
        real_state.loc[index, 'Distance_to_hospital'] = dist

        id, dist = closest(location, sm_id, sm_coordinates)
        real_state.loc[index, 'Supermarket_id'] = id
        real_state.loc[index, 'Distance_to_supermarket'] = dist
    except Exception as e:
        pass

<class 'float'>
1001 'int' object has no attribute 'values'
<class 'float'>
1002 'int' object has no attribute 'values'
<class 'float'>
1003 'int' object has no attribute 'values'
<class 'float'>
1004 'int' object has no attribute 'values'
<class 'float'>
1005 'int' object has no attribute 'values'
<class 'float'>
1006 'int' object has no attribute 'values'
<class 'float'>
1007 'int' object has no attribute 'values'
<class 'float'>
1008 'int' object has no attribute 'values'
<class 'float'>
1009 'int' object has no attribute 'values'


In [77]:
real_state

Unnamed: 0,property_id,lat,lng,addr_street,price,property_type,year,bedrooms,bathrooms,parking_space,...,Shopping_center_id,Distance_to_sc,Train_station_id,Distance_to_train_station,travel_min_to_CBD,Transfer_flag,Hospital_id,Distance_to_hospital,Supermarket_id,Distance_to_supermarket
0,41387,-37.7451,145.065,1/46 Hillside Road,7200000,house,2015,3,2,2,...,SC_017,1.257,19936,0.289,28.68,1,hospital_066,1.179,S_100,1.617
1,72107,-37.8387,145.263,19 CHURCH STREET,19550000,house,2008,3,2,2,...,SC_032,4.751,19870,0.693,50.27,1,hospital_002,1.964,S_139,0.666
2,51703,-37.7807,145.124,84 Williamsons Road,9630000,house,2013,3,1,2,...,SC_091,0.741,20042,4.298,30.26,1,hospital_194,1.285,S_188,0.701
3,37969,-37.7756,145.019,126 Arthur Street,7380000,house,2014,3,2,0,...,SC_001,2.625,19930,0.491,18.80,1,hospital_029,1.618,S_219,2.605
4,92396,-37.9746,145.061,68 Latrobe street,12558000,house,2015,3,1,2,...,SC_003,3.126,19865,0.921,41.28,1,hospital_133,0.965,S_011,0.787
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,24209,-37.903,145.045,24 Beccles Street,7684000,house,2011,3,1,1,...,SC_041,2.531,19941,0.611,34.12,1,hospital_126,2.876,S_107,1.870
997,80370,-37.8518,145.234,56 Putt Grove,9975000,house,2016,4,2,2,...,SC_032,4.392,19871,2.843,46.27,1,hospital_179,0.819,S_226,2.296
998,75144,-37.7188,145.088,4/26A Howe Street,3618000,house,2009,2,1,1,...,SC_017,4.656,19984,0.996,34.71,1,hospital_075,2.495,S_020,2.059
999,80699,-37.6505,145.049,10 McNab Court,3036000,house,2008,3,1,1,...,SC_062,4.099,45795,2.046,46.43,1,hospital_053,3.587,S_096,2.830


In [78]:
# There are still some datapoints that need to be processed
# It is because these coordinates are floats
remaining = real_state[real_state.Distance_to_sc == 0]
real_state[real_state.Distance_to_sc == 0]

Unnamed: 0,property_id,lat,lng,addr_street,price,property_type,year,bedrooms,bathrooms,parking_space,...,Shopping_center_id,Distance_to_sc,Train_station_id,Distance_to_train_station,travel_min_to_CBD,Transfer_flag,Hospital_id,Distance_to_hospital,Supermarket_id,Distance_to_supermarket
1001,2383,-37.7863,144.883,6 Inkerman Street,8635000,house,2015,3,2,2,...,not available,0.0,not available,0.0,0.0,-1,not available,0.0,not available,0.0
1002,44838,-37.7123,145.099,3 Kell Street,7500000,house,2016,5,1,0,...,not available,0.0,not available,0.0,0.0,-1,not available,0.0,not available,0.0
1003,30704,-37.7601,144.987,133 Woolton Avenue,8970000,house,2012,2,1,2,...,not available,0.0,not available,0.0,0.0,-1,not available,0.0,not available,0.0
1004,87063,-37.9119,145.004,72 William Street,14960000,house,2016,3,2,2,...,not available,0.0,not available,0.0,0.0,-1,not available,0.0,not available,0.0
1005,70034,-37.865,145.095,4 Closter Avenue,11968000,house,2011,5,2,2,...,not available,0.0,not available,0.0,0.0,-1,not available,0.0,not available,0.0
1006,45354,-37.6971,145.093,6 Yando Street,5634000,house,2010,5,2,2,...,not available,0.0,not available,0.0,0.0,-1,not available,0.0,not available,0.0
1007,59483,-37.8132,145.19,17 Gillies Street,4800000,house,2013,3,1,0,...,not available,0.0,not available,0.0,0.0,-1,not available,0.0,not available,0.0
1008,35297,-37.7286,144.994,6 Crispe Street,5814000,house,2009,2,1,1,...,not available,0.0,not available,0.0,0.0,-1,not available,0.0,not available,0.0
1009,40726,-37.6939,145.053,4a Dealing Court,6885000,house,2014,3,1,2,...,not available,0.0,not available,0.0,0.0,-1,not available,0.0,not available,0.0


In [72]:
for index in remaining.index:
    lat = real_state.loc[index, 'lat']
    lng = real_state.loc[index, 'lng']
    location = np.array((lat, lng))

# Data Reshaping

## Standardization

In [43]:
standard_scaler = StandardScaler()
real_state_standard = real_state.copy()
columns = ['price', 'Distance_to_sc', 'Distance_to_hospital', 'travel_min_to_CBD']
real_state_standard[columns] = standard_scaler.fit_transform(real_state_standard[columns])

In [48]:
real_state_standard[columns].describe()

Unnamed: 0,price,Distance_to_sc,Distance_to_hospital,travel_min_to_CBD
count,2011.0,2011.0,2011.0,2011.0
mean,0.0,-5.3882530000000007e-17,-3.621613e-17,2.791292e-16
std,1.000249,1.000249,1.000249,1.000249
min,-1.310741,-1.905776,-1.273889,-1.61504
25%,-0.677822,-0.7793589,-0.6855691,-0.5015665
50%,-0.289618,-0.1361306,-0.2948231,-0.04894053
75%,0.35147,0.6528051,0.3189268,0.3975412
max,5.792723,3.00801,3.860093,12.99461


In [49]:
real_state_standard[columns]

Unnamed: 0,price,Distance_to_sc,Distance_to_hospital,travel_min_to_CBD
0,-0.316256,-1.047911,-0.697306,-0.309730
1,1.876938,1.340053,-0.315363,0.672894
2,0.115279,-1.402113,-0.645467,-0.237819
3,-0.284290,-0.113609,-0.484572,-0.759397
4,0.635253,0.226944,-0.802451,0.263733
...,...,...,...,...
996,-0.230304,-0.177761,0.130156,-0.062139
997,0.176547,1.093681,-0.871406,0.490843
998,-0.952371,1.272488,-0.054703,-0.035287
999,-1.055727,0.893035,0.479333,0.498125


In [50]:
minmax_scaler = MinMaxScaler()
real_state_minmax = real_state.copy()
real_state_minmax[columns] = standard_scaler.fit_transform(real_state_minmax[columns])

In [53]:
real_state_minmax[columns].describe()

Unnamed: 0,price,Distance_to_sc,Distance_to_hospital,travel_min_to_CBD
count,2011.0,2011.0,2011.0,2011.0
mean,0.0,-5.3882530000000007e-17,-3.621613e-17,2.791292e-16
std,1.000249,1.000249,1.000249,1.000249
min,-1.310741,-1.905776,-1.273889,-1.61504
25%,-0.677822,-0.7793589,-0.6855691,-0.5015665
50%,-0.289618,-0.1361306,-0.2948231,-0.04894053
75%,0.35147,0.6528051,0.3189268,0.3975412
max,5.792723,3.00801,3.860093,12.99461


In [52]:
real_state_minmax[columns]

Unnamed: 0,price,Distance_to_sc,Distance_to_hospital,travel_min_to_CBD
0,-0.316256,-1.047911,-0.697306,-0.309730
1,1.876938,1.340053,-0.315363,0.672894
2,0.115279,-1.402113,-0.645467,-0.237819
3,-0.284290,-0.113609,-0.484572,-0.759397
4,0.635253,0.226944,-0.802451,0.263733
...,...,...,...,...
996,-0.230304,-0.177761,0.130156,-0.062139
997,0.176547,1.093681,-0.871406,0.490843
998,-0.952371,1.272488,-0.054703,-0.035287
999,-1.055727,0.893035,0.479333,0.498125


In [54]:
log_normalized = real_state.copy()
for index in log_normalized.index:
    for column in columns:
        val = log_normalized.loc[index, column]
        try:
            log_normalized.loc[index, column] = np.log(val)
        except:
            log_normalized.loc[index, column] = np.log(val.values[0])

  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)
  log_normalized.loc[index, column] = np.log(val)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **k

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **k

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **k

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **k

In [56]:
log_normalized[columns]

Unnamed: 0,price,Distance_to_sc,Distance_to_hospital,travel_min_to_CBD
0,2.75935,-1.475222,-1.803832,1.210809
1,2.82069,0.444306,-0.396092,1.365430
2,2.7776,,-1.383264,1.226661
3,2.76091,-0.035149,-0.736649,1.076318
4,2.79398,0.130532,,1.313825
...,...,...,...,...
996,2.71297,-0.073636,0.053219,1.261265
997,2.75619,0.392358,,1.344037
998,2.76318,0.430724,-0.090486,1.266110
999,2.74201,0.344462,0.244324,1.344937


In [30]:
# real_state_boxcox = real_state.copy()
# pt = PowerTransformer(method='box-cox')
# normalizer = Normalizer()

# real_state_boxcox[[columns]] = normalizer.fit_transform(real_state_boxcox[columns])
# # real_state_boxcox[[columns]] = pt.fit_transform(real_state_boxcox[columns])
        