In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
import sqlite3
import itertools

In [4]:
df = pd.read_csv('data/Crash_Reporting_-_Drivers_Data.csv', low_memory=False)

cols_to_drop = ['Report Number', 'Local Case Number','Latitude','Longitude',
                'Off-Road Description', 'Municipality','Related Non-Motorist',
                'Non-Motorist Substance Abuse', 'Circumstance']

df.drop(cols_to_drop, axis=1, inplace=True)

In [5]:
col_datatypes = {}

for i in df.columns:
    col_datatypes[i] = type(i)

In [6]:
d = {'Agency Name': str,
 'ACRS Report Type': str,
 'Crash Date/Time': 'datetime64[ns]',
 'Route Type': str,
 'Road Name': 'ignore',
 'Cross-Street Type': str,
 'Cross-Street Name': 'ignore',
 'Collision Type': str,
 'Weather': str,
 'Surface Condition': str,
 'Light': str,
 'Traffic Control': str,
 'Driver Substance Abuse': str,
 'Person ID': 'ignore',
 'Driver At Fault': str,
 'Injury Severity': str,
 'Driver Distracted By': str,
 'Drivers License State': str,
 'Vehicle ID': 'ignore',
 'Vehicle Damage Extent': str,
 'Vehicle First Impact Location': str,
 'Vehicle Second Impact Location': str,
 'Vehicle Body Type': str,
 'Vehicle Movement': str,
 'Vehicle Continuing Dir': str,
 'Vehicle Going Dir': str,
 'Speed Limit': float,
 'Driverless Vehicle': 'ignore',
 'Parked Vehicle': 'ignore',
 'Vehicle Year': float,
 'Vehicle Make': str,
 'Vehicle Model': str,
 'Equipment Problems': 'ignore',
 'Location': str}

In [7]:
for key, value in d.items():
    if value != 'ignore':
        df[key] = df[key].astype(value)
    else:
        df.drop(key, axis=1, inplace=True)

df['Speed_Limit'] = df['Speed Limit']
df['Crash Date'] = df['Crash Date/Time'].dt.date
df['Crash Time'] = df['Crash Date/Time'].dt.time
df.drop('Crash Date/Time', axis=1, inplace=True)

In [37]:
df["Injury Severity"].value_counts()

NO APPARENT INJURY          137563
POSSIBLE INJURY              17045
SUSPECTED MINOR INJURY       11654
SUSPECTED SERIOUS INJURY      1386
FATAL INJURY                   151
Name: Injury Severity, dtype: int64

In [40]:
df["Driver At Fault"].replace({"Yes": 1, "No": 0, "Unknown": 0}, inplace=True)

In [45]:
df.columns = [col.replace(" ", "_").lower() for col in df.columns]

### SQL Queries

In [14]:
# Connect to an in-memory SQLite database
conn = sqlite3.connect(':memory:')

# Insert the Pandas DataFrame into the SQLite database
df.to_sql('crash_data', conn, index=False)

167799

In [13]:
query = "SELECT Speed_Limit from crash_data limit 10"
result = pd.read_sql_query(query, conn)
print(result)

   Speed_Limit
0         15.0
1          0.0
2         35.0
3         45.0
4         25.0
5         35.0
6         30.0
7         40.0
8         35.0
9         40.0


### 1. What is the sum, average, count of a given column

In [46]:
# what is the average speed limit
col = "speed_limit"
query = f"SELECT avg({col}) as average_speed from crash_data"
pd.read_sql_query(query,conn)

Unnamed: 0,average_speed
0,32.568728


In [None]:
num_functions = ['avg', 'min', 'max', 'sum']

for func in num_functions:
    for col in df.select_dtypes(include=['float','int']).columns:
        print(f"what is the {func} {col}")

In [44]:
df["Crash_Time"]

0         15:00:00
1         17:00:00
2         10:45:00
3         23:40:00
4         17:40:00
            ...   
167794    10:01:00
167795    14:22:00
167796    07:37:00
167797    23:23:00
167798    09:02:00
Name: Crash_Time, Length: 167799, dtype: object

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167799 entries, 0 to 167798
Data columns (total 31 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Agency Name                     167799 non-null  object 
 1   ACRS Report Type                167799 non-null  object 
 2   Route Type                      167799 non-null  object 
 3   Road Name                       167799 non-null  object 
 4   Cross-Street Type               167799 non-null  object 
 5   Cross-Street Name               167799 non-null  object 
 6   Collision Type                  167799 non-null  object 
 7   Weather                         167799 non-null  object 
 8   Surface Condition               167799 non-null  object 
 9   Light                           167799 non-null  object 
 10  Traffic Control                 167799 non-null  object 
 11  Driver Substance Abuse          167799 non-null  object 
 12  Driver At Fault 

In [69]:
cat_cols = ["vehicle_make", "injury_severity", "vehicle_continuing_dir"]
num_cols = ["speed_limit", "vehicle_year"]
date_cols = ["crash_date"]
bin_cols = ["driver_at_fault"]

"""
categorical: by category level
numerical: by average, max, min and total
date: by year, by month
binary: when present or not (x 2) for every binary

#2 dim
num with num        : what is avg vehicle year and avg speed limit

num with binary     : what is avg speed limit when driver was (at fault / not at fault)
num with cat        : what is avg speed limit by injury severity
num with date       : what is avg speed limit by month/year
cat with cat        : what is injury severity by vehicle make
cat with binary     : what is injury severity when driver was at fault
cat with date       : what is vehicle make by month
binary with date    : how many times driver was at fault by year

"""

for f in num_functions:
    for num in num_cols:
        print(f"what is {f} ")

'\ncategorical: by category level\nnumerical: by average, max, min and total\ndate: by year, by month\nbinary: when present or not (x 2) for every binary\n\n#2 dim\nnum with num        : what is avg vehicle year and avg speed limit\nnum with binary     : what is avg speed limit when driver was at fault\nnum with cat        : what is avg speed limit by injury severity\nnum with date       : what is avg speed limit by month/year\ncat with cat        : what is injury severity by vehicle make\ncat with binary     : what is injury severity when driver was at fault\ncat with date       : what is vehicle make by month\nbinary with date    : how many times driver was at fault by year\n\n'

In [73]:
list(itertools.product(num_cols, bin_cols))

[('speed_limit', 'driver_at_fault'), ('vehicle_year', 'driver_at_fault')]

In [70]:
combinations = list(itertools.combinations(num_cols, 2))
num_functions = ['min', 'max', 'avg', 'total']
all_perm = list(itertools.permutations(num_functions, 2))
all_perm.extend([(f, f) for f in num_functions])

In [71]:
from copy import copy
combinations = list(itertools.combinations(num_cols, 2))

num_functions = ['min', 'max', 'avg', 'total']
all_perm = list(itertools.permutations(num_functions, 2))
full_perm = copy(all_perm)
full_perm.extend([(f, f) for f in num_functions])

# for just one numerical feature
if len(num_cols) == 1:
    for f1, f2 in all_perm:
        for c in num_cols:
            print(f"what is {f1} {c} and {f2} {c}")
            
else:
    for f1, f2 in full_perm:
        for c1, c2 in combinations:
            print(f"what is {f1} {c1} and {f2} {c1}")
            print(f"what is {f1} {c2} and {f2} {c2}")
            print(f"what is {f1} {c1} and {f2} {c2}")
            print(f"what is {f1} {c2} and {f2} {c1}")

what is min speed_limit and max speed_limit
what is min vehicle_year and max vehicle_year
what is min speed_limit and max vehicle_year
what is min vehicle_year and max speed_limit
what is min speed_limit and avg speed_limit
what is min vehicle_year and avg vehicle_year
what is min speed_limit and avg vehicle_year
what is min vehicle_year and avg speed_limit
what is min speed_limit and total speed_limit
what is min vehicle_year and total vehicle_year
what is min speed_limit and total vehicle_year
what is min vehicle_year and total speed_limit
what is max speed_limit and min speed_limit
what is max vehicle_year and min vehicle_year
what is max speed_limit and min vehicle_year
what is max vehicle_year and min speed_limit
what is max speed_limit and avg speed_limit
what is max vehicle_year and avg vehicle_year
what is max speed_limit and avg vehicle_year
what is max vehicle_year and avg speed_limit
what is max speed_limit and total speed_limit
what is max vehicle_year and total vehicle_yea

In [65]:
len(all_perm), len(combinations)

(16, 0)

In [None]:
what is avg and min speed
what is avg and min year
what is avg year and min speed
what is avg speed and min year
what is avg and min speed
what is avg and min cost
what is avg cost and min speed
what is avg speed and min cost
what is avg and min year
what is avg and min cost
what is avg cost and min year
what is avg year and min cost
what is avg and max speed
what is avg and max year
what is avg year and max speed
what is avg speed and max year
what is avg and max speed
what is avg and max cost
what is avg cost and max speed
what is avg speed and max cost
what is avg and max year
what is avg and max cost
what is avg cost and max year
what is avg year and max cost
what is avg and sum speed
what is avg and sum year
what is avg year and sum speed
what is avg speed and sum year
what is avg and sum speed
what is avg and sum cost
what is avg cost and sum speed
what is avg speed and sum cost
what is avg and sum year
what is avg and sum cost
what is avg cost and sum year
what is avg year and sum cost
what is min and avg speed
what is min and avg year
what is min year and avg speed
what is min speed and avg year
what is min and avg speed
what is min and avg cost
what is min cost and avg speed
what is min speed and avg cost
what is min and avg year
what is min and avg cost
what is min cost and avg year
what is min year and avg cost
what is min and max speed
what is min and max year
what is min year and max speed
what is min speed and max year
what is min and max speed
what is min and max cost
what is min cost and max speed
what is min speed and max cost
what is min and max year
what is min and max cost
what is min cost and max year
what is min year and max cost
what is min and sum speed
what is min and sum year
what is min year and sum speed
what is min speed and sum year
what is min and sum speed
what is min and sum cost
what is min cost and sum speed
what is min speed and sum cost
what is min and sum year
what is min and sum cost
what is min cost and sum year
what is min year and sum cost
what is max and avg speed
what is max and avg year
what is max year and avg speed
what is max speed and avg year
what is max and avg speed
what is max and avg cost
what is max cost and avg speed
what is max speed and avg cost
what is max and avg year
what is max and avg cost
what is max cost and avg year
what is max year and avg cost
what is max and min speed
what is max and min year
what is max year and min speed
what is max speed and min year
what is max and min speed
what is max and min cost
what is max cost and min speed
what is max speed and min cost
what is max and min year
what is max and min cost
what is max cost and min year
what is max year and min cost
what is max and sum speed
what is max and sum year
what is max year and sum speed
what is max speed and sum year
what is max and sum speed
what is max and sum cost
what is max cost and sum speed
what is max speed and sum cost
what is max and sum year
what is max and sum cost
what is max cost and sum year
what is max year and sum cost
what is sum and avg speed
what is sum and avg year
what is sum year and avg speed
what is sum speed and avg year
what is sum and avg speed
what is sum and avg cost
what is sum cost and avg speed
what is sum speed and avg cost
what is sum and avg year
what is sum and avg cost
what is sum cost and avg year
what is sum year and avg cost
what is sum and min speed
what is sum and min year
what is sum year and min speed
what is sum speed and min year
what is sum and min speed
what is sum and min cost
what is sum cost and min speed
what is sum speed and min cost
what is sum and min year
what is sum and min cost
what is sum cost and min year
what is sum year and min cost
what is sum and max speed
what is sum and max year
what is sum year and max speed
what is sum speed and max year
what is sum and max speed
what is sum and max cost
what is sum cost and max speed
what is sum speed and max cost
what is sum and max year
what is sum and max cost
what is sum cost and max year
what is sum year and max cost
what is avg and avg speed
what is avg and avg year
what is avg year and avg speed
what is avg speed and avg year
what is avg and avg speed
what is avg and avg cost
what is avg cost and avg speed
what is avg speed and avg cost
what is avg and avg year
what is avg and avg cost
what is avg cost and avg year
what is avg year and avg cost
what is min and min speed
what is min and min year
what is min year and min speed
what is min speed and min year
what is min and min speed
what is min and min cost
what is min cost and min speed
what is min speed and min cost
what is min and min year
what is min and min cost
what is min cost and min year
what is min year and min cost
what is max and max speed
what is max and max year
what is max year and max speed
what is max speed and max year
what is max and max speed
what is max and max cost
what is max cost and max speed
what is max speed and max cost
what is max and max year
what is max and max cost
what is max cost and max year
what is max year and max cost
what is sum and sum speed
what is sum and sum year
what is sum year and sum speed
what is sum speed and sum year
what is sum and sum speed
what is sum and sum cost
what is sum cost and sum speed
what is sum speed and sum cost
what is sum and sum year
what is sum and sum cost
what is sum cost and sum year
what is sum year and sum cost
​