In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
import sqlite3
import itertools

In [2]:
# Columns that are not needed for question generation and are removed
# immediately after loading.
cols_to_drop = [
    'Report Number', 'Local Case Number', 'Latitude', 'Longitude',
    'Off-Road Description', 'Municipality', 'Related Non-Motorist',
    'Non-Motorist Substance Abuse', 'Circumstance',
]

# Read the driver-level crash export and discard the unused columns in one step.
df = pd.read_csv(
    'data/Crash_Reporting_-_Drivers_Data.csv', low_memory=False
).drop(columns=cols_to_drop)

# Target dtype for every remaining column. The sentinel string 'ignore' marks
# a column that the following cell drops instead of casting.
d = {
    'Agency Name': str,
    'ACRS Report Type': str,
    'Crash Date/Time': 'datetime64[ns]',
    'Route Type': str,
    'Road Name': 'ignore',
    'Cross-Street Type': str,
    'Cross-Street Name': 'ignore',
    'Collision Type': str,
    'Weather': str,
    'Surface Condition': str,
    'Light': str,
    'Traffic Control': str,
    'Driver Substance Abuse': str,
    'Person ID': 'ignore',
    'Driver At Fault': str,
    'Injury Severity': str,
    'Driver Distracted By': str,
    'Drivers License State': str,
    'Vehicle ID': 'ignore',
    'Vehicle Damage Extent': str,
    'Vehicle First Impact Location': str,
    'Vehicle Second Impact Location': str,
    'Vehicle Body Type': str,
    'Vehicle Movement': str,
    'Vehicle Continuing Dir': str,
    'Vehicle Going Dir': str,
    'Speed Limit': float,
    'Driverless Vehicle': 'ignore',
    'Parked Vehicle': 'ignore',
    'Vehicle Year': float,
    'Vehicle Make': str,
    'Vehicle Model': str,
    'Equipment Problems': 'ignore',
    'Location': str,
}

In [3]:
# Apply the dtype map `d`: columns marked 'ignore' are dropped, every other
# column is cast to its target dtype in a single pass.
ignored_cols = [name for name, target in d.items() if target == 'ignore']
cast_map = {name: target for name, target in d.items() if target != 'ignore'}
df = df.drop(columns=ignored_cols).astype(cast_map)

# Split the timestamp into separate date and time-of-day columns.
crash_ts = df['Crash Date/Time']
df['Crash Date'] = crash_ts.dt.date
df['Crash Time'] = crash_ts.dt.time
df = df.drop(columns='Crash Date/Time')

# Encode fault as 0/1; "Unknown" is counted as 0 (not at fault).
# The result is assigned back instead of calling replace(inplace=True) on the
# column selection: that chained in-place form does not update `df` under
# pandas Copy-on-Write. infer_objects() turns the replaced object column into
# an integer column when every value was mapped, so it is still picked up as
# numeric further down.
df["Driver At Fault"] = (
    df["Driver At Fault"]
    .replace({"Yes": 1, "No": 0, "Unknown": 0})
    .infer_objects()
)

# Treat a model year of 0.0 as missing.
# NOTE(review): the aggregate output printed further down still shows a
# minimum of 1.0 and a maximum of 9999.0 for vehicle_year, so other
# placeholder years survive this step - confirm whether they should be masked.
df.loc[df["Vehicle Year"] == 0.0, "Vehicle Year"] = np.nan

# snake_case column names so they can be used directly in the SQL below.
df.columns = [col.replace(" ", "_").lower() for col in df.columns]

### SQL Queries

In [4]:
# Copy the cleaned frame into a fresh in-memory SQLite database (table
# `crash_data`) so the generated questions can be answered with SQL.
conn = sqlite3.connect(':memory:')
df.to_sql('crash_data', conn, index=False)

num_functions = ['avg', 'min', 'max', 'sum']

# One-dimensional numerical questions: every aggregate applied to every
# float/int column, one query per (function, column) pair.
numeric_cols = df.select_dtypes(include=['float', 'int']).columns
for func, col in itertools.product(num_functions, numeric_cols):
    query = f"SELECT {func}({col}) as {col} from crash_data"
    answer = pd.read_sql_query(query, conn).iat[0, 0]
    print(f"what is the {func} {col}: {answer}")

what is the avg driver_at_fault: 0.5371414533713912
what is the avg speed_limit: 32.55188105376445
what is the avg vehicle_year: 2013.0279753778352
what is the min driver_at_fault: 0
what is the min speed_limit: 0.0
what is the min vehicle_year: 1.0
what is the max driver_at_fault: 1
what is the max speed_limit: 75.0
what is the max vehicle_year: 9999.0
what is the sum driver_at_fault: 91834
what is the sum speed_limit: 5565330.0
what is the sum vehicle_year: 336183724.0


In [7]:
# Columns used for question generation, grouped by how they are queried.
cat_cols = [
    "vehicle_make",
    "injury_severity",
    "vehicle_continuing_dir",
]
num_cols = ["speed_limit", "vehicle_year"]
date_cols = ["crash_date"]
bin_cols = ["driver_at_fault"]

# Preview the binary column(s).
df.loc[:, bin_cols].head()

Unnamed: 0,driver_at_fault
0,1
1,0
2,1
3,1
4,1


In [None]:
"""
categorical: by category level
numerical: by average, max, min and total
date: by year, by month
binary: when present or not (x 2) for every binary

#2 dim
num with num        : what is avg vehicle year and avg speed limit
num with binary     : what is avg speed limit when driver was (at fault / not at fault)
cat with binary     : what is injury severity when driver was at fault
cat with cat        : what is injury severity by vehicle make
num with cat        : what is avg speed limit by injury severity
num with date       : what is avg speed limit by month/year
cat with date       : what is vehicle make by month
binary with date    : how many times driver was at fault by year

#3 give some thought about 3 dimensions (limit)
#4 check with some other clean datasets

"""

In [9]:
# Every (numerical column, binary column) pairing, numerical column varying slowest.
[(num_col, bin_col) for num_col in num_cols for bin_col in bin_cols]

[('speed_limit', 'driver_at_fault'), ('vehicle_year', 'driver_at_fault')]

In [None]:
# Example of a single generated question answered through SQL:
# "what is the average speed limit".
col = "speed_limit"
query = f"SELECT avg({col}) as average_speed from crash_data"
average_speed = pd.read_sql_query(query, conn)
average_speed.iat[0, 0]

In [None]:
# Aggregate names and unordered column pairs used for two-aggregate questions.
num_functions = ['min', 'max', 'avg', 'total']
combinations = list(itertools.combinations(num_cols, 2))

# Ordered pairs of two different functions, followed by each function paired
# with itself.
all_perm = [
    *itertools.permutations(num_functions, 2),
    *((f, f) for f in num_functions),
]

In [None]:
from copy import copy

num_functions = ['min', 'max', 'avg', 'total']
combinations = list(itertools.combinations(num_cols, 2))

# all_perm: ordered pairs of two different aggregate functions; this is what
# the query loop iterates.
# full_perm: the same pairs plus each function paired with itself; it is built
# here but the query loop does not iterate it.
all_perm = list(itertools.permutations(num_functions, 2))
full_perm = copy(all_perm)
full_perm += [(f, f) for f in num_functions]

if len(num_cols) == 1:
    # for just one numerical feature: both aggregates target that column
    column_pairs = [(c, c) for c in num_cols]
else:
    # numerical with numerical: for each pair of columns, ask about each
    # column on its own and then about the two mixed orderings
    column_pairs = [
        pair
        for c1, c2 in combinations
        for pair in ((c1, c1), (c2, c2), (c1, c2), (c2, c1))
    ]

# Running total of generated questions; later sections keep adding to it.
count = 0
for (f1, f2), (left, right) in itertools.product(all_perm, column_pairs):
    query = f"SELECT {f1}({left}) as {f1}_{left}, {f2}({right}) as {f2}_{right} from crash_data"
    print(f"what is {f1} {left} and {f2} {right}: {pd.read_sql_query(query, conn)}")
    count += 1

# numerical with binary: each aggregate/column pair is asked once for the
# flag and once for its negation
for func, col, cond in itertools.product(num_functions, num_cols, bin_cols):
    for negation in ("", "not "):
        print(f"what is {func} {col} when {negation}{cond}")
    count += 2

# categorical with binary: most/least frequent level plus top-2 and top-3
# levels for each flag
for col, cond in itertools.product(cat_cols, bin_cols):
    for extreme in ("most", "least"):
        print(f"what {col} has {extreme} counts when {cond}")
    for top_n in (2, 3):
        print(f"what are the top {top_n} {col} when {cond}")
    count += 4

# categorical with categorical: every ordered pair of two different columns
for i, j in itertools.product(cat_cols, repeat=2):
    if i == j:
        continue
    print(f"what is the most {j} by {i}")
    print(f"what is the least {j} by {i}")
    count += 2

# numerical with categorical: one question per aggregate, numerical column
# and grouping column
for func, num_col, cat_col in itertools.product(num_functions, num_cols, cat_cols):
    print(f"what is {func} {num_col} by {cat_col}")
    count += 1

dates = ['month', 'year']
# numerical with date: each aggregate is asked by month, by year, and by
# month and year together
date_groupings = [dates[0], dates[1], f"{dates[0]} and {dates[1]}"]
for func, num_col, date_col in itertools.product(num_functions, num_cols, date_cols):
    for grouping in date_groupings:
        print(f"what is {func} {num_col} by {date_col} {grouping}")
    count += 3

# categorical with date
# For every categorical/date pairing, ask for the most and the least frequent
# level at each granularity (month, year, month and year): 6 questions.
# The previous version printed "most ... by month" and "least ... by year"
# twice each and never produced "most ... by year" or "least ... by month".
date_groupings = [dates[0], dates[1], f"{dates[0]} and {dates[1]}"]
for cat_col in cat_cols:
    for date_col in date_cols:
        for extreme in ("most", "least"):
            for grouping in date_groupings:
                print(f"what is {extreme} {cat_col} by {date_col} {grouping}")
                count += 1
        # Top-2 and top-3 levels by month and by year. The date column is
        # named in the question so that several date columns do not yield
        # identical questions.
        for top_n in (2, 3):
            for grouping in dates:
                print(f"what are the top {top_n} {cat_col} by {date_col} {grouping}")
                count += 1

# date with binary
# For each binary flag, ask which month / year / month and year has the most
# and the least occurrences. The questions name the loop variable `col`; the
# previous version interpolated `j`, a stale variable left over from the
# categorical loop, so they named a categorical column instead of the flag.
for col in bin_cols:
    for period in (dates[0], dates[1], f"{dates[0]} and {dates[1]}"):
        print(f"which {period} has most {col}")
        print(f"which {period} has least {col}")
    count += 6

In [None]:
# Display the running total of questions accumulated in `count` by the
# generation cell above.
count

In [None]:
# Year-month bucket for each crash as a sortable string, e.g. "2021-03".
# The previous assignment indexed df[""] (no such column) and raised KeyError.
# NOTE(review): deriving the value from crash_date is inferred from the column
# name "month_year" - confirm the intended source column and format.
df['month_year'] = pd.to_datetime(df['crash_date']).dt.strftime('%Y-%m')

In [None]:
# Display the aggregate function names currently bound to `num_functions`.
num_functions

In [None]:
# Categorical-with-date questions on their own (no counting): most and least
# frequent level by month, by year, and by month and year. The previous
# version printed "most ... by month" and "least ... by year" twice each and
# never produced "most ... by year" or "least ... by month".
date_groupings = [dates[0], dates[1], f"{dates[0]} and {dates[1]}"]
for cat_col in cat_cols:
    for date_col in date_cols:
        for extreme in ("most", "least"):
            for grouping in date_groupings:
                print(f"what is {extreme} {cat_col} by {date_col} {grouping}")
        # Top-2 and top-3 levels by month and by year; the date column is
        # named so that several date columns do not yield identical questions.
        for top_n in (2, 3):
            for grouping in dates:
                print(f"what are the top {top_n} {cat_col} by {date_col} {grouping}")

In [None]:
# Karan (make 3 cells with dynamic for loop covering all the questions):
# cat with binary     : what is injury severity when driver was at fault
# num with cat        : what is avg speed limit by injury severity
#
# Shantanu (make 4 cells with dynamic for loop covering all the questions):
# num with date       : what is avg speed limit by month/year
# cat with cat        : what is injury severity by vehicle make
# cat with date       : what is vehicle make by month
# binary with date    : how many times driver was at fault by year

In [None]:
# Display the categorical and binary column lists together (shown as a tuple).
cat_cols, bin_cols

In [None]:
# Count how often each vehicle_continuing_dir value occurs.
df["vehicle_continuing_dir"].value_counts()

In [None]:
# What vehicle_make has the most counts when the driver is at fault
# What was the most common injury_severity when the driver is at fault
# What was the most common vehicle_continuing_dir when the driver is at fault

# What vehicle_make has the most counts when the driver is not at fault
# What was the most common injury_severity when the driver is not at fault
# What was the most common vehicle_continuing_dir when the driver is not at fault

# What are the top two vehicle_make when the driver is at fault
# What are the top two injury_severity when the driver is at fault
# What are the top two vehicle_make when the driver is not at fault
# What are the top two injury_severity when the driver is not at fault

In [None]:
"""
1. generated question based on pandas dataframe (done)
2. generated sql queries based on #1
3. canonical question based on #1 (chatGPT) - 3x of #1
4. LLM Model to convert canonical question to best generated question

input = paraphrased question
output = best generated question (w/ cosine similarity and w/ LLM) (from #4)
result = scores between cosine and LLM (metric)
"""

In [None]:
# Display the aggregate function names currently bound to `num_functions`.
num_functions

In [None]:
what is avg and min speed
what is avg and min year
what is avg year and min speed
what is avg speed and min year
what is avg and min speed
what is avg and min cost
what is avg cost and min speed
what is avg speed and min cost
what is avg and min year
what is avg and min cost
what is avg cost and min year
what is avg year and min cost
what is avg and max speed
what is avg and max year
what is avg year and max speed
what is avg speed and max year
what is avg and max speed
what is avg and max cost
what is avg cost and max speed
what is avg speed and max cost
what is avg and max year
what is avg and max cost
what is avg cost and max year
what is avg year and max cost
what is avg and sum speed
what is avg and sum year
what is avg year and sum speed
what is avg speed and sum year
what is avg and sum speed
what is avg and sum cost
what is avg cost and sum speed
what is avg speed and sum cost
what is avg and sum year
what is avg and sum cost
what is avg cost and sum year
what is avg year and sum cost
what is min and avg speed
what is min and avg year
what is min year and avg speed
what is min speed and avg year
what is min and avg speed
what is min and avg cost
what is min cost and avg speed
what is min speed and avg cost
what is min and avg year
what is min and avg cost
what is min cost and avg year
what is min year and avg cost
what is min and max speed
what is min and max year
what is min year and max speed
what is min speed and max year
what is min and max speed
what is min and max cost
what is min cost and max speed
what is min speed and max cost
what is min and max year
what is min and max cost
what is min cost and max year
what is min year and max cost
what is min and sum speed
what is min and sum year
what is min year and sum speed
what is min speed and sum year
what is min and sum speed
what is min and sum cost
what is min cost and sum speed
what is min speed and sum cost
what is min and sum year
what is min and sum cost
what is min cost and sum year
what is min year and sum cost
what is max and avg speed
what is max and avg year
what is max year and avg speed
what is max speed and avg year
what is max and avg speed
what is max and avg cost
what is max cost and avg speed
what is max speed and avg cost
what is max and avg year
what is max and avg cost
what is max cost and avg year
what is max year and avg cost
what is max and min speed
what is max and min year
what is max year and min speed
what is max speed and min year
what is max and min speed
what is max and min cost
what is max cost and min speed
what is max speed and min cost
what is max and min year
what is max and min cost
what is max cost and min year
what is max year and min cost
what is max and sum speed
what is max and sum year
what is max year and sum speed
what is max speed and sum year
what is max and sum speed
what is max and sum cost
what is max cost and sum speed
what is max speed and sum cost
what is max and sum year
what is max and sum cost
what is max cost and sum year
what is max year and sum cost
what is sum and avg speed
what is sum and avg year
what is sum year and avg speed
what is sum speed and avg year
what is sum and avg speed
what is sum and avg cost
what is sum cost and avg speed
what is sum speed and avg cost
what is sum and avg year
what is sum and avg cost
what is sum cost and avg year
what is sum year and avg cost
what is sum and min speed
what is sum and min year
what is sum year and min speed
what is sum speed and min year
what is sum and min speed
what is sum and min cost
what is sum cost and min speed
what is sum speed and min cost
what is sum and min year
what is sum and min cost
what is sum cost and min year
what is sum year and min cost
what is sum and max speed
what is sum and max year
what is sum year and max speed
what is sum speed and max year
what is sum and max speed
what is sum and max cost
what is sum cost and max speed
what is sum speed and max cost
what is sum and max year
what is sum and max cost
what is sum cost and max year
what is sum year and max cost
what is avg and avg speed
what is avg and avg year
what is avg year and avg speed
what is avg speed and avg year
what is avg and avg speed
what is avg and avg cost
what is avg cost and avg speed
what is avg speed and avg cost
what is avg and avg year
what is avg and avg cost
what is avg cost and avg year
what is avg year and avg cost
what is min and min speed
what is min and min year
what is min year and min speed
what is min speed and min year
what is min and min speed
what is min and min cost
what is min cost and min speed
what is min speed and min cost
what is min and min year
what is min and min cost
what is min cost and min year
what is min year and min cost
what is max and max speed
what is max and max year
what is max year and max speed
what is max speed and max year
what is max and max speed
what is max and max cost
what is max cost and max speed
what is max speed and max cost
what is max and max year
what is max and max cost
what is max cost and max year
what is max year and max cost
what is sum and sum speed
what is sum and sum year
what is sum year and sum speed
what is sum speed and sum year
what is sum and sum speed
what is sum and sum cost
what is sum cost and sum speed
what is sum speed and sum cost
what is sum and sum year
what is sum and sum cost
what is sum cost and sum year
what is sum year and sum cost
​