# Imports


In [226]:
import pandas as pd
from sodapy import Socrata


from datetime import datetime
from modules.myfuncs import *

# imported inside modules.myfuncs
#import sqlite3
#from sqlite3 import Error 

# Create my database
 

In [31]:
conn = create_connection('database/rlc.db')  # function from myfuncs file
c = conn.cursor()
#conn.close()

sqlite3 version: 2.6.0
connected to database/rlc.db


## create sodapy client

In [32]:
## referenced example from https://github.com/xmunoz/sodapy/issues/52

client = Socrata("data.cityofchicago.org", None)  # starts up my client object
data = client.get("spqx-js37", #speed cams are at 'hhkd-xvj4' 
                     where='violation_date BETWEEN \'2015-01-01T00:00:00.000\' AND \'2020-12-20T00:00:00.000\'',
                     limit=1000,
                    )





# Make db
## Make table (all violations, index primary key)
Clunky, but I found easiest to go to df, then to db

In [33]:
# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(data)
results_df.head()

Unnamed: 0,intersection,camera_id,address,violation_date,violations,x_coordinate,y_coordinate,latitude,longitude,location
0,31ST ST AND MARTIN LUTHER KING DRIVE,2121,3100 S DR MARTIN L KING,2015-01-01T00:00:00.000,19,,,,,
1,ASHLAND AND DIVERSEY,1623,1600 W DIVERSEY PARKWA,2015-01-01T00:00:00.000,2,,,,,
2,IRVING PARK AND LARAMIE,1533,5200 W IRVING PARK ROA,2015-01-01T00:00:00.000,2,,,,,
3,ELSTON AND IRVING PARK,1503,3700 W IRVING PARK ROA,2015-01-01T00:00:00.000,2,,,,,
4,IRVING PARK AND KILPATRICK,2764,4700 W IRVING PARK ROA,2015-01-01T00:00:00.000,3,,,,,


In [34]:
# pare down the df to make suitable as db.  Remember only  allowable types TEXT.NUMERIC.INTEGER.REAL.BLOB.
valid_cols = results_df.columns[:9]
results_df = results_df[valid_cols]
results_df = results_df[results_df['camera_id'].notna()]


# coerce my data types
results_df['camera_id'] = results_df['camera_id'].apply(str)
results_df['violations'] = results_df['violations'].apply(str)

results_df['x_coordinate'] = results_df['x_coordinate'].apply(float)
results_df['y_coordinate'] = results_df['y_coordinate'].apply(float)
results_df['latitude'] = results_df['latitude'].apply(float)
results_df['longitude'] = results_df['longitude'].apply(float)
#results_df['location'] = results_df['location'].apply(str)
violation_dates =  pd.to_datetime(results_df['violation_date'])

results_df['month'] = violation_dates.apply(lambda x: str(x.month))
results_df['weekday'] = violation_dates.apply(lambda x: str(datetime.weekday(x)))
results_df['year'] = violation_dates.apply(lambda x: str(x.year))



results_df.head()
results_df.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 999 entries, 0 to 999
Data columns (total 12 columns):
intersection      999 non-null object
camera_id         999 non-null object
address           999 non-null object
violation_date    999 non-null object
violations        999 non-null object
x_coordinate      941 non-null float64
y_coordinate      941 non-null float64
latitude          941 non-null float64
longitude         941 non-null float64
month             999 non-null object
weekday           999 non-null object
year              999 non-null object
dtypes: float64(4), object(8)
memory usage: 101.5+ KB


## write data from query to sqlite db

In [222]:

def create_table(c, mytable, cols, d_type, key=None):
    '''
    Creates a new table 
    Sets a constraint on db (UNIQUE) so that if you have camera_id and violation_date the same...
    '''
    # make my columns into sql string compatible
    cols=list(cols)
    #if key in cols: cols.remove(key)
        
        
    cols_str = "("
    for i in range(len(cols)):
        cols_str += cols[i] + ' ' + d_type[i] + ', '
    else:
        cols_str = cols_str[:-2] + ')'
    

    if key:
        #CONSTRAINT constraint_name PRIMARY KEY (pk_col1, pk_col2, ... pk_col_n)
        my_sql = '''CREATE TABLE {} {};'''.format(mytable, cols_str)  # insert datatypes for cols
        my_sql.replace(key, key + 'PRIMARY KEY')
    else:
        my_sql = "CREATE TABLE {} {};".format(mytable, cols_str)  # insert datatypes for cols

    print(my_sql) # just to see what I'm doing

    
    try:     
        c.execute(my_sql) 
    except Exception as e:
        print('fail', e)
        
        
def set_unique(mytable, cols):
    sql = '''CREATE UNIQUE INDEX idx ON {}'''.format(mytable)
    add_this = ''
    for x in cols:
        add_this += x + ", " 
        
    add_this = '(' + add_this[:-2] + ');'
    print(add_this)
    sql += add_this
    print("THIS")
    print(sql)
    
    c.execute(sql)
    #c.execute('''CREATE UNIQUE INDEX idx ON {}(camera_id, violation_date);'''.format(mytable))

def insert_violation(c, col_names, new_data):
    """
    Create a new task
    :param conn:
    :param new_data:
    :return:
    """
    vals = ('('+'?,'*len(new_data))[:-1]+')'  # (?,?,?...)
    #print(vals)
    cols = str(tuple([x for x in col_names])).replace(' ', '').replace("'", "").replace(',', ', ')
    
    sql = '''INSERT OR IGNORE INTO red_light_violations{}VALUES{};'''.format(cols, vals)
    # example from sql site 
    # sql = ''' INSERT INTO tasks(name,priority,status_id,project_id,begin_date,end_date)VALUES(?,?,?,?,?,?) '''
    
    #print(tuple(new_data))
    c.execute(sql, new_data)
    

    return c.lastrowid


d_types = ['text', # intersection
         'int', # camera_id
         'text', # address
         'numeric', # violation_date
         'int', # violations
         'real', # x_coord
         'real', # y_coord
         'real', # latitude
         'real', # longitude
         'int', # month
         'int', # week
         'int', # year
          ]

##### only do this once ####
create_table(c, 'red_light_violations', results_df.columns, d_types)
set_unique('red_light_violations', ('camera_id', 'violation_date'))

conn.commit()

for i in range(len(results_df)):
    insert_violation(c, results_df.columns, list(results_df.iloc[i, :]))

conn.commit()

#c.execute('CREATE TABLE CARS (Brand text, Price number)')   # this is example from sqlite website
#c.execute(create_text)
#conn.commit()

CREATE TABLE red_light_violations (intersection text, camera_id int, address text, violation_date numeric, violations int, x_coordinate real, y_coordinate real, latitude real, longitude real, month int, weekday int, year int);
fail table red_light_violations already exists
(camera_id, violation_date);
THIS
CREATE UNIQUE INDEX idx ON red_light_violations(camera_id, violation_date);


IntegrityError: UNIQUE constraint failed: red_light_violations.camera_id, red_light_violations.violation_date

In [183]:
len(results_df.columns)
results_df.iloc[1, :]

intersection         ASHLAND AND DIVERSEY
camera_id                            1623
address            1600 W DIVERSEY PARKWA
violation_date    2015-01-01T00:00:00.000
violations                              2
x_coordinate                          NaN
y_coordinate                          NaN
latitude                              NaN
longitude                             NaN
month                                   1
weekday                                 3
year                                 2015
Name: 1, dtype: object

In [184]:
def sql_fetch_tables(c):
    c.execute('SELECT name from sqlite_master where type= "table"')
    print(c.fetchall())

sql_fetch_tables(c)  # helper function in myfuncs

[('red_light_violations',)]


## Now let's see if our write to db worked as expected

In [185]:
# definitely have some INTEGER issue.  Had to change to text.  That's no good.
c.execute("SELECT intersection, camera_id FROM red_light_violations;").fetchall()[:10]



[('31ST ST AND MARTIN LUTHER KING DRIVE', 2121),
 ('ASHLAND AND DIVERSEY', 1623),
 ('IRVING PARK AND LARAMIE', 1533),
 ('ELSTON AND IRVING PARK', 1503),
 ('IRVING PARK AND KILPATRICK', 2764),
 ('IRVING PARK AND CALIFORNIA', 1234),
 ('HALSTED AND 103RD', 2592),
 ('JEFFERY AND 79TH', 2662),
 ('IRVING PARK AND KILPATRICK', 2763),
 ('VAN BUREN AND WESTERN', 2054)]

In [186]:
#conn.close()

# Make table for all cameras (camera_id as primary key)
First let's isolate the info we need for this db.  
Each entry will be a different camera

In [187]:
# I already did this work to pull out the relavant data


In [188]:
cam_df = results_df.copy()
cam_df = cam_df.drop(['violation_date', 'violations', 'month', 'weekday', 'year'], axis=1)

cam_df = cam_df.groupby(['camera_id']).max().reset_index()


In [189]:
len(cam_df.camera_id.unique())

341

In [204]:
def insert_cam(c, col_names, new_data):
    """
    Create a new task
    :param conn:
    :param new_data:
    :return:
    """
    vals = ('('+'?,'*len(new_data))[:-1]+')'  # (?,?,?...)
    #print(vals)
    cols = str(tuple([x for x in col_names])).replace(' ', '').replace("'", "").replace(',', ', ')
    
    sql = '''INSERT OR IGNORE INTO red_light_cameras{}VALUES{};'''.format(cols, vals)
    # example from sql site 
    # sql = ''' INSERT INTO tasks(name,priority,status_id,project_id,begin_date,end_date)VALUES(?,?,?,?,?,?) '''
    
    #print(tuple(new_data))
    c.execute(sql, new_data)
    

    return c.lastrowid


d_types = [
         'int', # camera_id
         'text', # intersection
         'text', # address
         'real', # x_coord
         'real', # y_coord
         'real', # latitude
         'real', # longitude
          ]


##### only do this once ####
create_table(c, 'red_light_cameras', cam_df.columns, d_types, key='camera_id')
conn.commit()

for i in range(len(cam_df)):
    insert_cam(c, cam_df.columns, list(cam_df.iloc[i, :]))

conn.commit()

#c.execute('CREATE TABLE CARS (Brand text, Price number)')   # this is example from sqlite website
#c.execute(create_text)
#conn.commit()

CREATE TABLE red_light_cameras (camera_id int, intersection text, address text, x_coordinate real, y_coordinate real, latitude real, longitude real);
fail table red_light_cameras already exists


In [205]:
cam_df.info()

c.execute("PRAGMA table_info('red_light_cameras')")
print(c.fetchall())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 341 entries, 0 to 340
Data columns (total 7 columns):
camera_id       341 non-null object
intersection    341 non-null object
address         341 non-null object
x_coordinate    323 non-null float64
y_coordinate    323 non-null float64
latitude        323 non-null float64
longitude       323 non-null float64
dtypes: float64(4), object(3)
memory usage: 18.8+ KB
[(0, 'camera_id', 'int', 0, None, 0), (1, 'intersection', 'text', 0, None, 0), (2, 'address', 'text', 0, None, 0), (3, 'x_coordinate', 'real', 0, None, 0), (4, 'y_coordinate', 'real', 0, None, 0), (5, 'latitude', 'real', 0, None, 0), (6, 'longitude', 'real', 0, None, 0)]


In [206]:
# definitely have some INTEGER issue.  Had to change to text.  That's no good.
cam_df[cam_df['intersection']=='WESTERN AND CERMAK']

c.execute('''SELECT camera_id 
             FROM red_light_cameras
             WHERE intersection=='WESTERN AND CERMAK'
             ;''').fetchall()[:10]

[(1002,), (1003,), (1002,), (1003,), (1002,), (1003,)]

In [None]:

print('''\nCREATE UNIQUE INDEX idx ON {}(camera_id, violation_date);'''.format('red_light_cameras', )) #just to see it!
