# Make the hourly weather and traffic db
I plan to do the following here
- pull in all of the hourly weather from 2015 to 2020 from openweather
- pull in all of the traffic from chicago data portal
- merge the two with a date as primary key
- also include cols for year month day

I will need to go back and do the following:
- add a shared key between traffic and 

In [154]:
#!/usr/bin/env python

import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from modules.myfuncs import *
from sodapy import Socrata
import numpy as np
from scipy import stats



In [128]:
conn = create_connection('database/rlc.db')  # function from myfuncs file
c = conn.cursor()

sqlite3 version: 2.6.0
connected to database/rlc.db


In [129]:
sql_fetch_tables(c, conn)

[('cam_startend',), ('cam_locations',), ('intersection_locations',), ('daily_violations',), ('intersection_cams',), ('signal_crashes',), ('hourly_congestion',), ('hourly_weather',)]


In [130]:
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:

url = "data.cityofchicago.org"
client = Socrata(url, None)

# Example authenticated client (needed for non-public datasets):
# client = Socrata(data.cityofchicago.org,
#                  MyAppToken,
#                  userame="user@example.com",
#                  password="AFakePassword")



In [5]:
# First 2000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
# data website https://data.cityofchicago.org/resource/85ca-t3if.json

traffic_data = client.get("emtn-qqdi", 
                     where="TIME > \'2015-01-01T00:00:00.000\'",
                     limit=10000000,
                    )

# Convert to pandas DataFrame
traffic_data = pd.DataFrame.from_records(traffic_data)


In [9]:
traffic_data.head()

Unnamed: 0,time,region_id,bus_count,number_of_reads,speed,year,month,day_of_week,day,hour
0,2015-01-01 00:01:17,23,37,332,28.64,2015,1,5,1,0
1,2015-01-01 00:01:17,24,21,411,27.27,2015,1,5,1,0
2,2015-01-01 00:01:17,6,17,303,25.23,2015,1,5,1,0
3,2015-01-01 00:01:17,14,16,203,25.09,2015,1,5,1,0
4,2015-01-01 00:01:17,18,24,432,27.89,2015,1,5,1,0


In [12]:
traffic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4168199 entries, 0 to 4168198
Data columns (total 10 columns):
time               datetime64[ns]
region_id          object
bus_count          object
number_of_reads    object
speed              float64
year               int64
month              int64
day_of_week        int64
day                int64
hour               int64
dtypes: datetime64[ns](1), float64(1), int64(5), object(3)
memory usage: 318.0+ MB


In [8]:
# ex: df['Date']= pd.to_datetime(df['Date'])
traffic_data['time'] = pd.to_datetime(traffic_data['time'])

traffic_data['year'] = traffic_data['time'].apply(lambda x: x.year)
traffic_data['month'] = traffic_data['time'].apply(lambda x: x.month)
traffic_data['day_of_week'] = traffic_data['time'].dt.dayofweek

# need to convert this to the dataset convention 1 for Sunday, 7 for Saturday
traffic_data['day_of_week'] = traffic_data['day_of_week'].apply(lambda x: (x + 2) % 7)

traffic_data['day'] = traffic_data['time'].apply(lambda x: x.day)
traffic_data['hour'] = traffic_data['time'].apply(lambda x: x.hour)
traffic_data['speed'] = traffic_data['speed'].apply(float)

In [13]:
traffic_data.head()

Unnamed: 0,time,region_id,bus_count,number_of_reads,speed,year,month,day_of_week,day,hour
0,2015-01-01 00:01:17,23,37,332,28.64,2015,1,5,1,0
1,2015-01-01 00:01:17,24,21,411,27.27,2015,1,5,1,0
2,2015-01-01 00:01:17,6,17,303,25.23,2015,1,5,1,0
3,2015-01-01 00:01:17,14,16,203,25.09,2015,1,5,1,0
4,2015-01-01 00:01:17,18,24,432,27.89,2015,1,5,1,0


In [14]:
# now make it only have one entry per hour
agg_dict = {'speed':np.mean}
traffic_grouped = traffic_data.groupby(['year', 'month', 'day', 'hour', 'region_id', 'day_of_week']).agg(agg_dict).reset_index()


In [15]:
traffic_grouped.head(20)  # YES!!! That's what I want

Unnamed: 0,year,month,day,hour,region_id,day_of_week,speed
0,2015,1,1,0,1,5,27.455
1,2015,1,1,0,10,5,25.796667
2,2015,1,1,0,11,5,25.816667
3,2015,1,1,0,12,5,18.636667
4,2015,1,1,0,13,5,20.681667
5,2015,1,1,0,14,5,26.408333
6,2015,1,1,0,15,5,26.035
7,2015,1,1,0,16,5,39.313333
8,2015,1,1,0,17,5,21.648333
9,2015,1,1,0,18,5,26.931667


In [16]:
traffic_grouped[traffic_grouped['year']==2018]['month'].max()  # this data only goes up to May 2018
# we have an overlap in the dbs from March through April of 2018.  
# WIll result in double entries, and we will have to grouby mean speed again to fix it.
# It makes sense to just


5

In [17]:
#  results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
# data website https://data.cityofchicago.org/resource/85ca-t3if.json

# 15 MINUTES

new_traffic_data = client.get("kf7e-cur8", #2018 to present
                     where="TIME < \'2021-01-01T00:00:00.000\'",
                     limit=10000000,
                    )

# Convert to pandas DataFrame
new_traffic_data = pd.DataFrame.from_records(new_traffic_data)

In [18]:
new_traffic_data.head()

Unnamed: 0,time,region_id,speed,region,bus_count,num_reads,hour,day_of_week,month,description,record_id,west,east,south,north,nw_location,se_location
0,2018-03-31T14:40:28.000,2,27.95,Far North West,25,493,14,7,3,North of Montrose. East River to Cicero,02-201803311940,-87.84621,-87.747456,41.960669,42.0190998,"{'type': 'Point', 'coordinates': [-87.84621, 4...","{'type': 'Point', 'coordinates': [-87.747456, ..."
1,2018-03-31T14:40:28.000,3,19.09,North Park-Albany-Linconl Sq,48,834,14,7,3,Montrose to Devon. Cicero to Ravenswood,03-201803311940,-87.747456,-87.67459,41.960669,41.997946,"{'type': 'Point', 'coordinates': [-87.747456, ...","{'type': 'Point', 'coordinates': [-87.67459, 4..."
2,2018-03-31T14:40:28.000,4,18.41,Edge Water-Uptown,36,585,14,7,3,Montrose to Devon. Ravenswood to Lake Shore,04-201803311940,-87.67459,-87.646438,41.960669,41.997946,"{'type': 'Point', 'coordinates': [-87.67459, 4...","{'type': 'Point', 'coordinates': [-87.646438, ..."
3,2018-03-31T14:40:28.000,6,20.45,Irving Park-Avondale-North Ctr,48,811,14,7,3,Diversey to Montrose. Cicero to Ravenswood,06-201803311940,-87.747456,-87.67459,41.931841,41.960669,"{'type': 'Point', 'coordinates': [-87.747456, ...","{'type': 'Point', 'coordinates': [-87.67459, 4..."
4,2018-03-31T14:40:28.000,7,18.41,Hermosa-Logan Square,47,817,14,7,3,North Ave to Diversy. Cicero to Ravenswood,07-201803311940,-87.747456,-87.67459,41.909269,41.931841,"{'type': 'Point', 'coordinates': [-87.747456, ...","{'type': 'Point', 'coordinates': [-87.67459, 4..."


In [19]:
try:
    new_traffic_data.drop(columns=['region', 'bus_count', 'num_reads', 'record_id', 'nw_location', 'se_location'], inplace=True)
except:
    pass
new_traffic_data.head()

Unnamed: 0,time,region_id,speed,hour,day_of_week,month,description,west,east,south,north
0,2018-03-31T14:40:28.000,2,27.95,14,7,3,North of Montrose. East River to Cicero,-87.84621,-87.747456,41.960669,42.0190998
1,2018-03-31T14:40:28.000,3,19.09,14,7,3,Montrose to Devon. Cicero to Ravenswood,-87.747456,-87.67459,41.960669,41.997946
2,2018-03-31T14:40:28.000,4,18.41,14,7,3,Montrose to Devon. Ravenswood to Lake Shore,-87.67459,-87.646438,41.960669,41.997946
3,2018-03-31T14:40:28.000,6,20.45,14,7,3,Diversey to Montrose. Cicero to Ravenswood,-87.747456,-87.67459,41.931841,41.960669
4,2018-03-31T14:40:28.000,7,18.41,14,7,3,North Ave to Diversy. Cicero to Ravenswood,-87.747456,-87.67459,41.909269,41.931841


In [20]:
# ex: df['Date']= pd.to_datetime(df['Date'])
new_traffic_data['time'] = pd.to_datetime(new_traffic_data['time'])
new_traffic_data['year'] = new_traffic_data['time'].apply(lambda x: int(x.year))
new_traffic_data['day'] = new_traffic_data['time'].apply(lambda x: int(x.day))
new_traffic_data['hour'] = new_traffic_data['hour'].apply(int)
new_traffic_data['month'] = new_traffic_data['month'].apply(int)
new_traffic_data['speed'] = new_traffic_data['speed'].apply(float)
new_traffic_data['day_of_week'] = new_traffic_data['day_of_week'].apply(int)

In [42]:
#new_traffic_data['month'] = new_traffic_data['month'].apply(int)
#new_traffic_data['day_of_week'] = new_traffic_data['day_of_week'].apply(int)

In [43]:
# now make it only have one entry per hour
# This code takes about 10min to run
new_traffic_grouped = new_traffic_data.groupby(['year', 'month', 'day', 'hour', 'region_id', 'day_of_week']).agg({'speed':np.mean}).reset_index()


In [44]:
new_traffic_grouped.head()
new_traffic_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693883 entries, 0 to 693882
Data columns (total 7 columns):
year           693883 non-null int64
month          693883 non-null int64
day            693883 non-null int64
hour           693883 non-null int64
region_id      693883 non-null object
day_of_week    693883 non-null int64
speed          693883 non-null float64
dtypes: float64(1), int64(5), object(1)
memory usage: 37.1+ MB


In [45]:
new_traffic_grouped[new_traffic_grouped['year']==2018]['month'].min()  # this data only goes back to March 2018
# looks like we have a gap of three months.  Should fill it in with dummy data.

3

# now merge the two datasets (2015-2018 and 2018-2020)

In [46]:
all_traffic_grouped = pd.concat([traffic_grouped, new_traffic_grouped], ignore_index=False)

In [47]:
all_traffic_grouped.head()

Unnamed: 0,year,month,day,hour,region_id,day_of_week,speed
0,2015,1,1,0,1,5,27.455
1,2015,1,1,0,10,5,25.796667
2,2015,1,1,0,11,5,25.816667
3,2015,1,1,0,12,5,18.636667
4,2015,1,1,0,13,5,20.681667


In [48]:
print(len(all_traffic_grouped))
print(len(traffic_grouped) + len(new_traffic_grouped))  # concatenation successful
print(all_traffic_grouped['day_of_week'].unique())

1368684
1368684
[5 6 0 1 2 3 4 7]


In [49]:
traffic_grouped.info()
new_traffic_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 674801 entries, 0 to 674800
Data columns (total 7 columns):
year           674801 non-null int64
month          674801 non-null int64
day            674801 non-null int64
hour           674801 non-null int64
region_id      674801 non-null object
day_of_week    674801 non-null int64
speed          674801 non-null float64
dtypes: float64(1), int64(5), object(1)
memory usage: 36.0+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693883 entries, 0 to 693882
Data columns (total 7 columns):
year           693883 non-null int64
month          693883 non-null int64
day            693883 non-null int64
hour           693883 non-null int64
region_id      693883 non-null object
day_of_week    693883 non-null int64
speed          693883 non-null float64
dtypes: float64(1), int64(5), object(1)
memory usage: 37.1+ MB


In [50]:
all_traffic_grouped['day_of_week'].isna().sum()

0

In [51]:
print(all_traffic_grouped[(all_traffic_grouped['year']==2018) & \
                    (all_traffic_grouped['month']==4) & \
                    (all_traffic_grouped['day']==5) & \
                    (all_traffic_grouped['hour']==5) & \
                    (all_traffic_grouped['region_id']=='1')
                   ])

# I have some doubles.  My problem is the WRONG DAY OF WEEK!!!!
# Thursday is the correct one here.


print(traffic_grouped[(traffic_grouped['year']==2018) & \
                (traffic_grouped['month']==4) & \
                (traffic_grouped['day']==5) & \
                (traffic_grouped['hour']==5) & \
                (traffic_grouped['region_id']=='1')
               ])


new_traffic_grouped[(new_traffic_grouped['year']==2018) & \
                    (new_traffic_grouped['month']==4) & \
                    (new_traffic_grouped['day']==5) & \
                    (new_traffic_grouped['hour']==5) & \
                    (new_traffic_grouped['region_id']=='1')
                   ]

# for new data, sunday = 1, so thursday = 5 which is the correct number.
# From datetime docs:
# dayofweek method Return the dayof the week. 
# It is assumed the week starts on Monday, which is denoted by 0 and ends on Sunday which is denoted by 6. 
# So let's go back and make it all the same.  I prefer to use the 0 Sunday (traffic_grouped).  
# Will Go back above and fix new_traffic by subtracting

        year  month  day  hour region_id  day_of_week      speed
656270  2018      4    5     5         1            5  24.091667
18473   2018      4    5     5         1            5  24.091667
        year  month  day  hour region_id  day_of_week      speed
656270  2018      4    5     5         1            5  24.091667


Unnamed: 0,year,month,day,hour,region_id,day_of_week,speed
18473,2018,4,5,5,1,5,24.091667


In [52]:
all_traffic_grouped = all_traffic_grouped \
                    .groupby(['year', 'month', 'day', 'hour', 'region_id', 'day_of_week']) \
                    .agg({'speed':np.mean}) \
                    .reset_index()

In [64]:
print(24 * 29 * 365) # estimated total entries (24 hours, 29 regions, 365 days, 5 years)
print(len(all_traffic_grouped)) # difference is 
all_traffic_grouped.head() 

# just want to see how many I have each year
all_traffic_grouped.groupby('year').count()

# looks like 2015 doesn't have all the data I need.  When does the data start or is it just sparse?
print(all_traffic_grouped[all_traffic_grouped['year']==2015]['month'].min())

# so it does start at beginning of year, just missing some.  We may have to fill in some data?

254040
1337103
1


In [65]:
print(new_traffic_grouped['year'].min(), new_traffic_grouped['year'].max())  # this data only goes back to March 2018
print(traffic_grouped['year'].min(), traffic_grouped['year'].max())
print(all_traffic_grouped['year'].min(), all_traffic_grouped['year'].max())


      

2018 2020
2015 2018
2015 2020


## Put the table in the db


In [76]:
def create_table(c, mytable, cols):
    '''
    Creates a new table 
    #Sets a constraint on db (UNIQUE) so that if you have camera_id and violation_date the same...
    '''
    
    flat_cols = []
    for col in cols:
        for item in col:
            flat_cols.append(item) 
    
    my_sql = 'CREATE TABLE {} ' + '(' + '{} ' * len(flat_cols) + ');'
    my_sql = my_sql.format(mytable, *flat_cols)  # insert datatypes for cols

    print(my_sql) # just to see what I'm doing

    
    try:     
        c.execute(my_sql) 
    except Exception as e:
        print('\nCREATE TABLE', mytable, 'FAILED!!',  e)


        
cols = [
        ['year', 'int'], 
        ['month', 'int'],
        ['day', 'int'],
        ['hour', 'int'],
        ['region_id', 'text'],
        ['day_of_week', 'int'],
        ['speed', 'float'],
        ]


create_table(c, 'hourly_congestion', cols)
print()

#c.execute('DROP TABLE hourly_congestion')  # in case you need to delete the table and overwrite
conn.commit()

CREATE TABLE hourly_congestion (year int month int day int hour int region_id text day_of_week int speed float );

CREATE TABLE hourly_congestion FAILED!! table hourly_congestion already exists



In [77]:
sql_fetch_tables(c, conn)  # helper function in myfuncs
delete_all_entries(c, conn, 'hourly_congestion')
all_traffic_grouped.to_sql('hourly_congestion', conn, if_exists='replace', index = False)


[('cam_startend',), ('cam_locations',), ('intersection_locations',), ('daily_violations',), ('intersection_cams',), ('signal_crashes',), ('hourly_congestion',)]


In [123]:
query = c.execute("SELECT region_id FROM hourly_congestion;").fetchall()
print(query[:5])
print(len(query))

[('1',), ('10',), ('11',), ('12',), ('13',)]
1337103


<sqlite3.Cursor at 0x7fc322a968f0>

In [79]:
all_traffic_grouped.columns

Index(['year', 'month', 'day', 'hour', 'region_id', 'day_of_week', 'speed'], dtype='object')

## Now to weather
Weather is from open weather.  Five years of Chicago hourly weather data.
Weather data is also 

Columns
- rain_1h/rain_3h (mm rain in last hour/3hour)
- snow_1h/snow_3h (mm (as liquid) snow in last hour/3hour)
- temp (max temp over that hour)
- year, month, day (integers)


In [80]:
wx_df = pd.read_csv('data/chi_wx.csv')

In [81]:
wx_df.head()  # temps are in kelvin

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,feels_like,temp_min,temp_max,...,wind_deg,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1420070400,2015-01-01 00:00:00 +0000 UTC,-21600,Chicago IL. USA,41.878114,-87.629798,265.96,258.16,264.85,267.708,...,230,,,,,1,800,Clear,sky is clear,01n
1,1420074000,2015-01-01 01:00:00 +0000 UTC,-21600,Chicago IL. USA,41.878114,-87.629798,266.13,256.52,265.35,267.926,...,230,,,,,20,801,Clouds,few clouds,02n
2,1420077600,2015-01-01 02:00:00 +0000 UTC,-21600,Chicago IL. USA,41.878114,-87.629798,266.17,257.7,265.35,268.098,...,230,,,,,20,801,Clouds,few clouds,02n
3,1420081200,2015-01-01 03:00:00 +0000 UTC,-21600,Chicago IL. USA,41.878114,-87.629798,266.39,257.56,265.35,268.157,...,240,,,,,1,800,Clear,sky is clear,01n
4,1420084800,2015-01-01 04:00:00 +0000 UTC,-21600,Chicago IL. USA,41.878114,-87.629798,266.47,256.5,265.35,268.121,...,240,,,,,1,800,Clear,sky is clear,01n


In [82]:
wx_df.columns

Index(['dt', 'dt_iso', 'timezone', 'city_name', 'lat', 'lon', 'temp',
       'feels_like', 'temp_min', 'temp_max', 'pressure', 'sea_level',
       'grnd_level', 'humidity', 'wind_speed', 'wind_deg', 'rain_1h',
       'rain_3h', 'snow_1h', 'snow_3h', 'clouds_all', 'weather_id',
       'weather_main', 'weather_description', 'weather_icon'],
      dtype='object')

In [85]:
wx_df['time'] = pd.to_datetime(wx_df['dt_iso'].apply(lambda x: x[:-4]))
wx_df.head()
wx_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55219 entries, 0 to 55218
Data columns (total 26 columns):
dt                     55219 non-null int64
dt_iso                 55219 non-null object
timezone               55219 non-null int64
city_name              55219 non-null object
lat                    55219 non-null float64
lon                    55219 non-null float64
temp                   55219 non-null float64
feels_like             55219 non-null float64
temp_min               55219 non-null float64
temp_max               55219 non-null float64
pressure               55219 non-null int64
sea_level              0 non-null float64
grnd_level             0 non-null float64
humidity               55219 non-null int64
wind_speed             55219 non-null float64
wind_deg               55219 non-null int64
rain_1h                6587 non-null float64
rain_3h                816 non-null float64
snow_1h                1538 non-null float64
snow_3h                91 non-null float6

In [104]:
wx_df.rain_3h.describe()
wx_df['rain_3h'] = wx_df['rain_3h'].fillna(0)
wx_df['rain_1h'] = wx_df['rain_1h'].fillna(0)
wx_df['snow_3h'] = wx_df['snow_3h'].fillna(0)
wx_df['snow_1h'] = wx_df['snow_1h'].fillna(0)
wx_df['temp'] = wx_df['temp_max']
wx_df['year'] = wx_df['time'].apply(lambda x: x.year)
wx_df['month'] = wx_df['time'].apply(lambda x: x.month)
wx_df['day'] = wx_df['time'].apply(lambda x: x.day)
wx_df['year'] = wx_df['time'].apply(lambda x: x.year)

In [117]:
try:
    wx_df = wx_df.drop(columns=['dt', 
                        'dt_iso', 
                        'timezone', 
                        'city_name', 
                        'lat', 
                        'lon', 
                        'feels_like', 
                        'temp_min', 
                        'temp_max',
                        'pressure',
                        'sea_level',
                        'grnd_level',
                        'humidity',
                        'wind_speed',
                        'wind_deg',
                        'clouds_all',
                        'weather_description',
                        'weather_icon',
                        'weather_id',
                        'weather_main',
                       ], axis=1)
except:
    print('Failed')

Failed


In [120]:
wx_df.describe()

Unnamed: 0,temp,rain_1h,rain_3h,snow_1h,snow_3h,year,month,day
count,55219.0,55219.0,55219.0,55219.0,55219.0,55219.0,55219.0,55219.0
mean,285.518514,0.160237,0.047967,0.013533,0.001809,2017.485087,6.395969,15.782557
std,11.064564,0.862417,0.708266,0.123137,0.066962,1.695629,3.444176,8.817962
min,245.37,0.0,0.0,0.0,0.0,2015.0,1.0,1.0
25%,276.48,0.0,0.0,0.0,0.0,2016.0,3.0,8.0
50%,285.193,0.0,0.0,0.0,0.0,2017.0,6.0,16.0
75%,295.15,0.0,0.0,0.0,0.0,2019.0,9.0,23.0
max,311.48,30.5,35.0,8.4,6.0,2020.0,12.0,31.0


In [121]:
cols = [
        ['temp', 'real'], 
        ['rain_1h', 'real'],
        ['rain_3h', 'real'],
        ['snow_1h', 'real'],
        ['snow_3h', 'real'],
        ['year', 'int'],
        ['month', 'int'],
        ['day', 'int'],

        ]


create_table(c, 'hourly_weather', cols)
print()

#c.execute('DROP TABLE hourly_congestion')  # in case you need to delete the table and overwrite
conn.commit()

CREATE TABLE hourly_weather (temp real rain_1h real rain_3h real snow_1h real snow_3h real year int month int day int );



In [144]:
sql_fetch_tables(c, conn)  # helper function in myfuncs
delete_all_entries(c, conn, 'hourly_weather')
wx_df.to_sql('hourly_weather', conn, if_exists='replace', index = False)


[('cam_startend',), ('cam_locations',), ('intersection_locations',), ('daily_violations',), ('intersection_cams',), ('signal_crashes',), ('hourly_congestion',), ('hourly_weather',), ('congestion_regions',)]


In [145]:
query = c.execute("SELECT temp FROM hourly_weather;").fetchall()
print(query[:5])
print(len(query))

[(267.70799999999997,), (267.926,), (268.098,), (268.157,), (268.121,)]
55219


## Add a table for locations of 

In [161]:
#https://data.cityofchicago.org/resource/t2qc-9pjd.json
    
    # First 2000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
# data website https://data.cityofchicago.org/resource/85ca-t3if.json

region_data = client.get("t2qc-9pjd", # regional congestion current data
                     limit=1000
                    )

# Convert to pandas DataFrame
region_data = pd.DataFrame.from_records(region_data)  # should only return most recent for each region

In [162]:




region_data.rename(columns={"_region_id": "region_id", 
                            "_west": "west",
                            "_east": 'east',
                            '_south':'south',
                            '_north':'north',
                            '_description':'description',
                           }, inplace=True)


for direction in ['west', 'east', 'south', 'north']:
    region_data[direction] = region_data[direction].apply(np.float)
    
region_data.drop(columns=['current_speed', '_last_updt'], inplace=True)
    
region_data.head(50)
region_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 7 columns):
region         29 non-null object
region_id      29 non-null object
west           29 non-null float64
east           29 non-null float64
south          29 non-null float64
north          29 non-null float64
description    29 non-null object
dtypes: float64(4), object(3)
memory usage: 1.7+ KB


In [141]:
# cols = [
#         ['region', 'text'], 
#         ['region_id', 'text'],
#         ['west', 'real'],
#         ['east', 'real'],
#         ['south', 'real'],
#         ['north', 'real'],
#         ['description', 'text'],
#         ]


# create_table(c, 'congestion_regions', cols)
# print()

# conn.commit()

CREATE TABLE congestion_regions (region text region_id text west real east real south real north real description text );

CREATE TABLE congestion_regions FAILED!! table congestion_regions already exists



In [163]:
sql_fetch_tables(c, conn)  # helper function in myfuncs
delete_all_entries(c, conn, 'congestion_regions')
region_data.to_sql('congestion_regions', conn, if_exists='replace', index = False)

[('cam_startend',), ('cam_locations',), ('intersection_locations',), ('daily_violations',), ('intersection_cams',), ('signal_crashes',), ('hourly_congestion',), ('hourly_weather',), ('congestion_regions',)]


In [166]:
query = c.execute("SELECT * FROM congestion_regions;").fetchall()
print(query[:1])
print(len(query))

[('Lawndale N/S', '14', -87.747456, -87.685372, 41.822792, 41.866129, 'Pershing to Roosevel. Cicero to Western')]
29
