# Data Integration

In [1]:
import pandas as pd
import json
import os
try:
    from bs4 import BeautifulSoup
    import pandas_read_xml as pdx
    import tabula
    import statsmodels.api as sm
    from sklearn import linear_model
    from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, Normalizer
    from shapely.geometry import Point # Point class
    from shapely.geometry import shape # shape() is a function to convert geo objects through the interface
    import numpy as np
    import shapefile
except:
    !pip install pandas_read_xml xlrd tabula-py statsmodels Shapely shapefile bs4 numpy
    import pandas_read_xml as pdx
    from bs4 import BeautifulSoup
    import pandas_read_xml as pdx
    import tabula
    import statsmodels.api as sm
    from sklearn import linear_model
    from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, Normalizer
    from shapely.geometry import Point # Point class
    from shapely.geometry import shape # shape() is a function to convert geo objects through the interface
    import numpy as np
    import shapefile
from functools import reduce
from math import radians, cos, sin, asin, sqrt, atan2, pi 
from multiprocessing import Pool

In [2]:
r = 6378
# The length of a degree in the equator in kms
deg_len = (2 * pi * r) / 360


def distance(p1, p2):
    # distance in kilometers = euclidean distance * length of a degree in equator in kms
#     return round(np.linalg.norm(p2 - p1) * deg_len, 3)
    return round(np.linalg.norm(p2 - p1) , 3)

def between_times(time):
    time = [int(i) for i in time.split(':')]
    time_ = 3600 * time[0] + 60 * time[1] + time[2]
    if time_ > 3600 * 7 and time_ < 3600 * 9:
        return True
    return False

In [3]:
hospitals = open('29893909/hospitals.html')
hospitals = hospitals.read()
hospitals = pd.read_html(hospitals)[0].drop(['Unnamed: 0'], axis=1)

h_id = list(hospitals.id)
lat = [round(i, 6) for i in list(hospitals.lat)]
lng = [round(i, 6) for i in list(hospitals.lng)]
h_coordinates = [np.array((la, ln)) for la, ln in zip(lat, lng)]

In [4]:
supermarkets = pd.read_excel("29893909/supermarkets.xlsx").drop(['Unnamed: 0'], axis=1)
sm_id = list(supermarkets.id)
lat = [round(i, 6) for i in list(supermarkets.lat)]
lng = [round(i, 6) for i in list(supermarkets.lng)]
sm_coordinates = [np.array((la, ln)) for la, ln in zip(lat, lng)]

In [5]:
real_state = open('29893909/real_state.xml').read()[2:-1]
soup = BeautifulSoup(real_state)
tags = [item.name for item in soup.root.childGenerator()]
data = {}
for i, child in enumerate(soup.body.root.children):
    data[tags[i]] = []
    for grandchild in child.children:
        data[tags[i]].append(grandchild.text)
real_state_xml = pd.DataFrame(data, columns=tags)
real_state_json = pd.read_json('29893909/real_state.json')
real_state = real_state_json.append(real_state_xml)

In [6]:
shoppingcenters = tabula.read_pdf('29893909/shopingcenters.pdf', pages='all')
shoppingcenters = reduce(lambda a, b: a.append(b), shoppingcenters)

sc_id = list(shoppingcenters.sc_id)
lat = [round(i, 6) for i in list(shoppingcenters.lat)]
lng = [round(i, 6) for i in list(shoppingcenters.lng)]
sc_coordinates = [np.array((la, ln)) for la, ln in zip(lat, lng)]

In [7]:
stops = pd.read_csv("GTFS - Melbourne Train Information/stops.txt")
ts_id = list(stops.stop_id)
lat = [round(i, 6) for i in list(stops.stop_lat)]
lng = [round(i, 6) for i in list(stops.stop_lon)]
ts_coordinates = [np.array((la, ln)) for la, ln in zip(lat, lng)]

In [137]:
stop_times = pd.read_csv("GTFS - Melbourne Train Information/stop_times.txt")
calendar = pd.read_csv("GTFS - Melbourne Train Information/calendar.txt")
trips = pd.read_csv("GTFS - Melbourne Train Information/trips.txt")
routes = pd.read_csv("GTFS - Melbourne Train Information/routes.txt")
shapes = pd.read_csv("GTFS - Melbourne Train Information/shapes.txt")

In [9]:
def closest(location, id, coords):
    distances = [distance(location, coord) for coord in coords]
    return id[distances.index(min(distances))], min(distances)

def coord(df, index):
    return np.array((
        df.loc[index, 'lat'], 
        df.loc[index, 'lng']
    ))

In [10]:
# Appending empty columns to the dataframe and populating them with default values
lrs = len(real_state)
real_state['suburb'] = ["not available"]*lrs
real_state['Shopping_center_id'] = ["not available"]*lrs
real_state['Distance_to_sc'] = [0]*lrs
real_state['Train_station_id'] = ["not available"]*lrs
real_state['Distance_to_train_station'] = [0]*lrs
real_state['travel_min_to_CBD'] = [0]*lrs
real_state['Transfer_flag'] = [-1]*lrs
real_state['Hospital_id'] = ["not available"]*lrs
real_state['Distance_to_hospital'] = [0]*lrs
real_state['Supermarket_id'] = ["not available"]*lrs
real_state['Distance_to_supermarket'] = [0]*lrs

In [11]:
%%time
# handling cases where the hour in time > 24
# super inefficient: change if there is time
    
for index in stop_times[stop_times.arrival_time.str.match("24:\d{1,2}:\d{1,2}")].index:
    time = stop_times.loc[index, 'arrival_time'].split(":")
    time = ":".join(["00", time[1], time[2]])
    stop_times.loc[index, 'arrival_time'] = time


for index in stop_times[stop_times.departure_time.str.match("24:\d{1,2}:\d{1,2}")].index:
    time = stop_times.loc[index, 'departure_time'].split(":")
    time = ":".join(["00", time[1], time[2]])
    stop_times.loc[index, 'departure_time'] = time
    
for index in stop_times[stop_times.arrival_time.str.match("25:\d{1,2}:\d{1,2}")].index:
    time = stop_times.loc[index, 'arrival_time'].split(":")
    time = ":".join(["01", time[1], time[2]])
    stop_times.loc[index, 'arrival_time'] = time
    
for index in stop_times[stop_times.departure_time.str.match("25:\d{1,2}:\d{1,2}")].index:
    time = stop_times.loc[index, 'departure_time'].split(":")
    time = ":".join(["01", time[1], time[2]])
    stop_times.loc[index, 'departure_time'] = time
    
for index in stop_times[stop_times.arrival_time.str.match("26:\d{1,2}:\d{1,2}")].index:
    time = stop_times.loc[index, 'arrival_time'].split(":")
    time = ":".join(["02", time[1], time[2]])
    stop_times.loc[index, 'arrival_time'] = time
    
for index in stop_times[stop_times.departure_time.str.match("26:\d{1,2}:\d{1,2}")].index:
    time = stop_times.loc[index, 'departure_time'].split(":")
    time = ":".join(["02", time[1], time[2]])
    stop_times.loc[index, 'departure_time'] = time
    
stop_times['departure_time'] = pd.to_datetime(stop_times['departure_time'], format='%H:%M:%S').dt.time
stop_times['arrival_time'] = pd.to_datetime(stop_times['arrival_time'], format='%H:%M:%S').dt.time 

CPU times: user 1min 9s, sys: 132 ms, total: 1min 10s
Wall time: 1min 10s


In [12]:
start = pd.to_datetime("07:00:00", format="%H:%M:%S").time()
end = pd.to_datetime("09:00:00", format="%H:%M:%S").time()

stop_times_7_9 = stop_times[(stop_times['departure_time'] > start) & (stop_times['departure_time'] < end) & 
           (stop_times['arrival_time'] > start) &  (stop_times['arrival_time'] < end)]

In [13]:
trips_flinders_street = trips[trips.trip_headsign=='City (Flinders Street)']

In [14]:
# Transfer_flag
transfer_flag = {}
stop_times_trips = pd.merge(left=stop_times_7_9, right=trips_flinders_street, on='trip_id')
for stop in list(stops.stop_id):
    if not stop_times_trips[stop_times_trips.stop_id == stop].empty:
        transfer_flag[stop] = 1
    else:
        transfer_flag[stop] = 0

In [15]:
# Suburb
shp = shapefile.Reader('vic_suburb_boundary/VIC_LOCALITY_POLYGON_shp.dbf') #open the shapefile
all_shapes = shp.shapes() # get all the polygons
all_records = shp.records()     

def get_suburb(inp):
#     point, property_id = inp
    try:
        property_id, point = inp
        for i in range(len(all_shapes)):
            boundary = all_shapes[i] # get a boundary polygon
            if Point(point).within(shape(boundary)):
                return (property_id, all_records[i].as_dict()['VIC_LOCA_2'])
        return ("", "")
    except:
        pass

In [16]:
# Locating the suburb a property is situated in 
if 'prop2sub.json' not in os.listdir("."):
    lat = list(real_state.lat)
    lng = list(real_state.lng)
    coords = [(i, j) for i, j in zip(lng, lat)]
    property_ids = list(real_state.property_id)
    inp = [(pid, coord) for pid, coord in zip(property_ids, coords)]
    
    with Pool(4) as p:
        property2suburb = p.map(get_suburb, inp)
else:
    with open('prop2sub.json', 'r') as f:
        property2suburb = json.load(f)
    prop2sub = {}
    for i in property2suburb:
        try:
            prop2sub[i[0]] = i[1]
        except:
            pass

In [146]:
routes

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_type
0,2-ALM-B-mjp-1,1,Alamein,Alamein - City (Flinders Street),2
1,2-ALM-C-mjp-1,1,Alamein,Alamein - City (Flinders Street),2
2,2-ALM-D-mjp-1,1,Alamein,Alamein - City (Flinders Street),2
3,2-ALM-E-mjp-1,1,Alamein,Alamein - City (Flinders Street),2
4,2-ALM-F-mjp-1,1,Alamein,Alamein - City (Flinders Street),2
...,...,...,...,...,...
76,2-WMN-B-mjp-1,1,Williamstown,Williamstown - City (Flinders Street),2
77,2-WMN-C-mjp-1,1,Williamstown,Williamstown - City (Flinders Street),2
78,2-WMN-D-mjp-1,1,Williamstown,Williamstown - City (Flinders Street),2
79,2-WMN-E-mjp-1,1,Williamstown,Williamstown - City (Flinders Street),2


In [25]:
def in_weekdays(service_id):
    temp = 0
    for i in ["monday", "tuesday", "wednesday", "thursday", "friday"]: 
        temp += calendar[calendar.service_id==service_id][i].values[0]
    if temp > 0:
        return True
    return False

In [145]:
trip = trips.loc[0]
shapes[shapes.shape_id == trip.shape_id]

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
7246,2-ALM-F-mjp-1.1.H,-37.86832,145.079656,1,0.0
7247,2-ALM-F-mjp-1.1.H,-37.864367,145.080395,2,443.549321
7248,2-ALM-F-mjp-1.1.H,-37.863775,145.080558,3,510.817441
7249,2-ALM-F-mjp-1.1.H,-37.861968,145.081344,4,723.017818
7250,2-ALM-F-mjp-1.1.H,-37.861968,145.081344,5,723.017818
7251,2-ALM-F-mjp-1.1.H,-37.858298,145.082928,6,1153.437381
7252,2-ALM-F-mjp-1.1.H,-37.857984,145.083004,7,1188.947943
7253,2-ALM-F-mjp-1.1.H,-37.857256,145.083103,8,1270.249849
7254,2-ALM-F-mjp-1.1.H,-37.856264,145.083072,9,1380.363427
7255,2-ALM-F-mjp-1.1.H,-37.855459,145.082876,10,1471.368921


In [140]:
%%time
# Checking for stops where multiple trains were taken to reach Flinders Street
sids = list(stops.stop_id.unique())
for sid in sids:
    tids = list(stop_times_7_9[stop_times_7_9.stop_id == sid].trip_id.unique())
    for tid in tids:
        l = list(trips_flinders_street[trips_flinders_street.trip_id == tid].route_id.unique())
        if len(l) > 1:
            print(sid)

CPU times: user 48.4 s, sys: 16.6 ms, total: 48.4 s
Wall time: 48.4 s


In [135]:
# Only direct trains were found from all stops to Flinders Street

def travel_min_to_CBD(stop_id):
    times = []
    ts = list(stop_times_7_9[stop_times_7_9.stop_id == stop_id].trip_id.unique())
    # Ignoring all trips that don't occur on weekdays
    for t in ts:
        sid = trips[trips.trip_id == t].service_id.values[0]
        if not in_weekdays(sid):
            ts.remove(t)
            
    for t in ts:
        try:
            st_time = stop_times_7_9[(stop_times_7_9.trip_id == t) & (stop_times_7_9.stop_id == stop_id)].departure_time
            re_time = stop_times_7_9[(stop_times_7_9.trip_id == t) & (stop_times_7_9.stop_id == 19854)].arrival_time
            re_time = pd.to_datetime(re_time, format='%H:%M:%S').values[0]
            st_time = pd.to_datetime(st_time, format='%H:%M:%S').values[0]
            times.append(pd.Timedelta(re_time - st_time).seconds / 60)
        except Exception as e:
            pass
    try:
        return (stop_id, round(sum(times) / len(times), 2))
    except:
        return (stop_id, None)

In [133]:
%%time
stop_ids = list(stops.stop_id.unique())
with Pool(4) as p:
    stopid2cbd = p.map(travel_min_to_CBD, stop_ids)

CPU times: user 66.6 ms, sys: 56.2 ms, total: 123 ms
Wall time: 2min 11s


In [61]:
stid2cbd = {}
for i in stopid2cbd:
    try:
        stid2cbd[i[0]] = i[1]
    except:
        pass

In [62]:
for i in list(stid2cbd.keys()):
    if stid2cbd[i] is None:
        print(stops[stops.stop_id == i].stop_name)

2    Stony Point Railway Station
Name: stop_name, dtype: object
3    Crib Point Railway Station
Name: stop_name, dtype: object
4    Morradoo Railway Station
Name: stop_name, dtype: object
5    Bittern Railway Station
Name: stop_name, dtype: object
6    Hastings Railway Station
Name: stop_name, dtype: object
7    Tyabb Railway Station
Name: stop_name, dtype: object
8    Somerville Railway Station
Name: stop_name, dtype: object
9    Baxter Railway Station
Name: stop_name, dtype: object
11    Leawarra Railway Station
Name: stop_name, dtype: object
195    Flemington Racecourse Railway Station
Name: stop_name, dtype: object
196    Showgrounds Railway Station
Name: stop_name, dtype: object


In [63]:
%%time
for index in real_state.index:
    try:
        lat = round(real_state.loc[index, 'lat'].values[0], 4)
        lng = round(real_state.loc[index, 'lng'].values[0], 4)
        real_state.loc[index, 'lat'] = lat
        real_state.loc[index, 'lng'] = lng
    except:
        lat = round(real_state.loc[index, 'lat'], 4)
        lng = round(real_state.loc[index, 'lng'], 4)
        real_state.loc[index, 'lat'] = lat
        real_state.loc[index, 'lng'] = lng

CPU times: user 2.34 s, sys: 1.64 ms, total: 2.34 s
Wall time: 2.34 s


In [64]:
%%time
for index in real_state.index:
    try:
        try:
            lat = real_state.loc[index, 'lat'].values[0]
            lng = real_state.loc[index, 'lng'].values[0]
            location = np.array((lat, lng))
        except:
            lat = real_state.loc[index, 'lat']
            lng = real_state.loc[index, 'lng']
            location = np.array((lat, lng))
            
        try:
            real_state.loc[index, 'suburb'] = prop2sub[real_state.loc[index, 'property_id'].values[0]]
        except Exception as e:
            real_state.loc[index, 'suburb'] = prop2sub[real_state.loc[index, 'property_id']]
        
        id, dist = closest(location, sc_id, sc_coordinates)
        real_state.loc[index, 'Shopping_center_id'] = id
        real_state.loc[index, 'Distance_to_sc'] = dist
        
        id, dist = closest(location, ts_id, ts_coordinates)
        real_state.loc[index, 'Train_station_id'] = id
        real_state.loc[index, 'Distance_to_train_station'] = dist
        
        real_state.loc[index, 'Transfer_flag'] = transfer_flag[id]
        try:
            real_state.loc[index, 'travel_min_to_CBD'] = stid2cbd[id]
        except Exception as e:
            print(e, index)
            print(real_state.loc[index])
            real_state.loc[index, 'travel_min_to_CBD'] = 0
            
        
        id, dist = closest(location, h_id, h_coordinates)
        real_state.loc[index, 'Hospital_id'] = id
        real_state.loc[index, 'Distance_to_hospital'] = dist

        id, dist = closest(location, sm_id, sm_coordinates)
        real_state.loc[index, 'Supermarket_id'] = id
        real_state.loc[index, 'Distance_to_supermarket'] = dist
    except Exception as e:
        pass

CPU times: user 23 s, sys: 0 ns, total: 23 s
Wall time: 23 s


In [65]:
real_state.columns

Index(['property_id', 'lat', 'lng', 'addr_street', 'price', 'property_type',
       'year', 'bedrooms', 'bathrooms', 'parking_space', 'suburb',
       'Shopping_center_id', 'Distance_to_sc', 'Train_station_id',
       'Distance_to_train_station', 'travel_min_to_CBD', 'Transfer_flag',
       'Hospital_id', 'Distance_to_hospital', 'Supermarket_id',
       'Distance_to_supermarket'],
      dtype='object')

In [66]:
cols = ['property_id', 'lat', 'lng', 'price', 'year', 'bedrooms', 'bathrooms', 'parking_space', 
      'Distance_to_sc', 'Train_station_id', 'Distance_to_train_station', 'travel_min_to_CBD', 
        'Transfer_flag', 'Distance_to_hospital', 'Distance_to_supermarket']

In [67]:
for col in cols:
    real_state[col] = pd.to_numeric(real_state[col])

In [68]:
columns = ['Distance_to_sc', 'Distance_to_hospital', 'travel_min_to_CBD', 'price']

In [69]:
real_state.price = pd.to_numeric(real_state.price)

In [70]:
real_state[columns].describe()

Unnamed: 0,Distance_to_sc,Distance_to_hospital,travel_min_to_CBD,price
count,2011.0,2011.0,2011.0,2011.0
mean,0.025211,0.02346,35.840632,8980855.0
std,0.013113,0.018342,23.930222,5632456.0
min,0.002,0.001,6.0,1600000.0
25%,0.015,0.011,24.88,5164000.0
50%,0.023,0.018,34.2,7350000.0
75%,0.034,0.029,44.24,10960000.0
max,0.065,0.094,300.39,41600000.0


# Data Reshaping

## Standardization

In [71]:
standard_scaler = StandardScaler()
real_state_standard = real_state.copy()
columns = ['price', 'Distance_to_sc', 'Distance_to_hospital', 'travel_min_to_CBD']
real_state_standard[columns] = standard_scaler.fit_transform(real_state_standard[columns])

In [72]:
real_state_standard[columns].describe()

Unnamed: 0,price,Distance_to_sc,Distance_to_hospital,travel_min_to_CBD
count,2011.0,2011.0,2011.0,2011.0
mean,0.0,-1.6783080000000002e-17,-8.921534000000001e-17,-3.020955e-16
std,1.000249,1.000249,1.000249,1.000249
min,-1.310741,-1.770505,-1.224872,-1.247295
25%,-0.677822,-0.7788964,-0.6795261,-0.4581386
50%,-0.289618,-0.1686755,-0.2977842,-0.06857603
75%,0.35147,0.6703781,0.302096,0.3510815
max,5.792723,3.034984,3.846842,11.05778


In [73]:
real_state_standard[columns]

Unnamed: 0,price,Distance_to_sc,Distance_to_hospital,travel_min_to_CBD
0,-0.316256,-1.084007,-0.679526,-0.324801
1,1.876938,1.356877,-0.297784,0.584317
2,0.115279,-1.389117,-0.624992,-0.234098
3,-0.284290,-0.092398,-0.461388,-0.741951
4,0.635253,0.212712,-0.788595,0.179289
...,...,...,...,...
996,-0.230304,-0.168676,0.138492,-0.044751
997,0.176547,1.051766,-0.897664,0.417123
998,-0.952371,1.280599,-0.079646,-0.069830
999,-1.055727,0.899211,0.465700,0.415451


In [74]:
minmax_scaler = MinMaxScaler()
real_state_minmax = real_state.copy()
real_state_minmax[columns] = standard_scaler.fit_transform(real_state_minmax[columns])

In [75]:
real_state_minmax[columns].describe()

Unnamed: 0,price,Distance_to_sc,Distance_to_hospital,travel_min_to_CBD
count,2011.0,2011.0,2011.0,2011.0
mean,0.0,-1.6783080000000002e-17,-8.921534000000001e-17,-3.020955e-16
std,1.000249,1.000249,1.000249,1.000249
min,-1.310741,-1.770505,-1.224872,-1.247295
25%,-0.677822,-0.7788964,-0.6795261,-0.4581386
50%,-0.289618,-0.1686755,-0.2977842,-0.06857603
75%,0.35147,0.6703781,0.302096,0.3510815
max,5.792723,3.034984,3.846842,11.05778


In [76]:
real_state_minmax[columns]

Unnamed: 0,price,Distance_to_sc,Distance_to_hospital,travel_min_to_CBD
0,-0.316256,-1.084007,-0.679526,-0.324801
1,1.876938,1.356877,-0.297784,0.584317
2,0.115279,-1.389117,-0.624992,-0.234098
3,-0.284290,-0.092398,-0.461388,-0.741951
4,0.635253,0.212712,-0.788595,0.179289
...,...,...,...,...
996,-0.230304,-0.168676,0.138492,-0.044751
997,0.176547,1.051766,-0.897664,0.417123
998,-0.952371,1.280599,-0.079646,-0.069830
999,-1.055727,0.899211,0.465700,0.415451


In [77]:
# log_normalized = real_state.copy()
# for index in log_normalized.index:
#     for column in columns:
#         val = log_normalized.loc[index, column]
#         try:
#             log_normalized.loc[index, column] = np.log(val)
#         except:
#             log_normalized.loc[index, column] = np.log(val.values[0])

In [78]:
# log_normalized[columns]

In [79]:
real_state_boxcox = real_state.copy()
pt = PowerTransformer(method='box-cox')
normalizer = Normalizer()

real_state_boxcox[columns] = normalizer.fit_transform(real_state_boxcox[columns])
# real_state_boxcox[[columns]] = pt.fit_transform(real_state_boxcox[columns])
columns

['price', 'Distance_to_sc', 'Distance_to_hospital', 'travel_min_to_CBD']

In [80]:
X = real_state[['Distance_to_sc', 'Distance_to_hospital', 'travel_min_to_CBD']]
y = real_state['price']

X_bc = real_state_boxcox[['Distance_to_sc', 'Distance_to_hospital', 'travel_min_to_CBD']]
y_bc = real_state_boxcox['price']

In [81]:
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
model_bc = sm.OLS(y_bc, X_bc).fit()
predictions = model_bc.predict(X_bc)

In [82]:
X.describe()

Unnamed: 0,Distance_to_sc,Distance_to_hospital,travel_min_to_CBD
count,2011.0,2011.0,2011.0
mean,0.025211,0.02346,35.840632
std,0.013113,0.018342,23.930222
min,0.002,0.001,6.0
25%,0.015,0.011,24.88
50%,0.023,0.018,34.2
75%,0.034,0.029,44.24
max,0.065,0.094,300.39


In [83]:
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared (uncentered):,0.583
Model:,OLS,Adj. R-squared (uncentered):,0.583
Method:,Least Squares,F-statistic:,937.3
Date:,"Mon, 23 Nov 2020",Prob (F-statistic):,0.0
Time:,12:48:25,Log-Likelihood:,-34504.0
No. Observations:,2011,AIC:,69010.0
Df Residuals:,2008,BIC:,69030.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Distance_to_sc,1.798e+08,9.18e+06,19.596,0.000,1.62e+08,1.98e+08
Distance_to_hospital,2.594e+07,8.06e+06,3.220,0.001,1.01e+07,4.17e+07
travel_min_to_CBD,6.604e+04,5644.451,11.699,0.000,5.5e+04,7.71e+04

0,1,2,3
Omnibus:,382.431,Durbin-Watson:,1.969
Prob(Omnibus):,0.0,Jarque-Bera (JB):,831.925
Skew:,1.084,Prob(JB):,2.2400000000000002e-181
Kurtosis:,5.286,Cond. No.,2960.0


In [84]:
# Linear Regression without transformation

In [85]:
lm = linear_model.LinearRegression()
lm_bc = linear_model.LinearRegression()
l_model = lm.fit(X, y)
l_bc = lm_bc.fit(X_bc, y_bc)

In [86]:
predictions = lm.predict(X)
predictions_bc = lm_bc.predict(X_bc)

In [87]:
lm_bc.score(X, y)

-2.543639730249931

In [88]:
X = real_state_minmax[['Distance_to_sc', 'Distance_to_hospital', 'travel_min_to_CBD']]
y = real_state_minmax['price']

In [89]:
lm = linear_model.LinearRegression()
l_model = lm.fit(X, y)
predictions = lm.predict(X)
lm.score(X, y)

0.02844432682545428

In [90]:
real_state.bedrooms.unique()

array([3, 4, 2, 5, 1])