In [1]:
import pandas as pd
import numpy as np
import scipy.stats as ss
from datetime import datetime

In [30]:
# Load the motor-vehicle collision records. low_memory=False asks pandas to
# parse the file in one pass instead of chunk-by-chunk, which avoids
# mixed-dtype inference on columns.
data = pd.read_csv(
    '../data/motor_vehicle_collisions_crashes.csv',
    low_memory=False,
)

In [3]:
def cramers_v(x, y):
    """Return the bias-corrected Cramér's V between two categorical variables.

    A contingency table of ``x`` against ``y`` is built with ``pd.crosstab``
    and its chi-squared statistic is taken from ``ss.chi2_contingency``. Both
    phi-squared and the table dimensions are then adjusted by terms in
    ``1 / (n - 1)`` before the final ratio is formed.

    Parameters
    ----------
    x, y : array-like or pandas.Series
        Paired categorical observations, passed straight to ``pd.crosstab``.

    Returns
    -------
    float
        ``nan`` when fewer than two paired observations are tabulated (the
        statistic is undefined there). ``0.0`` when the corrected
        normalising term is zero, i.e. one variable has a single observed
        category or every observation falls in its own category. Otherwise
        the square root of the corrected phi-squared over the corrected
        ``min(k - 1, r - 1)``.
    """
    confusion_matrix = pd.crosstab(x, y)
    n = confusion_matrix.sum().sum()

    # With n < 2 the (n - 1) denominators below are zero or negative, and an
    # empty table makes chi2_contingency raise; report "undefined" instead.
    if n < 2:
        return float('nan')

    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    phi2 = chi2 / n
    r, k = confusion_matrix.shape

    # Bias correction: subtract the expected value of phi2 under independence
    # (clamped at zero) and shrink the effective table dimensions.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    # The normaliser is zero for degenerate tables (r == 1, k == 1, r == n or
    # k == n). phi2corr is zero there as well, so the unguarded division
    # would be 0/0 and yield NaN; report "no association" instead.
    denom = min((kcorr - 1), (rcorr - 1))
    if denom <= 0:
        return 0.0

    return float(np.sqrt(phi2corr / denom))

In [9]:
# Association between the contributing factors recorded for vehicle 1 and
# vehicle 2.
cramers_v(
    x=data['CONTRIBUTING FACTOR VEHICLE 1'],
    y=data['CONTRIBUTING FACTOR VEHICLE 2'],
)

0.22193915933681677

In [26]:
# Association between the crash time and the crash location columns.
cramers_v(
    x=data['CRASH TIME'],
    y=data['LOCATION'],
)

0.0

In [8]:
# Association between vehicle 1's contributing factor and its vehicle type
# code.
cramers_v(
    x=data['CONTRIBUTING FACTOR VEHICLE 1'],
    y=data['VEHICLE TYPE CODE 1'],
)

0.07817752407851158

In [38]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

In [47]:
# Build one transaction per crash from the first 1000 rows: the crash time and
# the street it occurred on. Missing values are dropped from each transaction
# instead of being cast with astype(str), which would turn NaN into the
# literal string 'nan' and let it be encoded and mined as a real item.
# No reset_index() is needed: the column it adds is discarded by the column
# selection anyway.
basket = [
    [str(value) for value in row if pd.notna(value)]
    for row in data[:1000][['CRASH TIME', 'ON STREET NAME']].values.tolist()
]

# One-hot encode the transactions: one boolean column per distinct item.
te = TransactionEncoder()
te_ary = te.fit(basket).transform(basket)
df = pd.DataFrame(te_ary, columns=te.columns_)


In [53]:
# Mine itemsets that occur in at least 1% of the encoded transactions;
# use_colnames=True reports them by item name rather than column index.
frequent_itemsets = apriori(
    df,
    use_colnames=True,
    min_support=0.01,
)
frequent_itemsets
# Rule generation is currently disabled:
# rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

Unnamed: 0,support,itemsets
0,0.025,(0:00)
1,0.01,(11:00)
2,0.012,(12:00)
3,0.011,(14:30)
4,0.01,(16:00)
5,0.011,(17:00)
6,0.013,(18:00)
7,0.014,(20:00)
8,0.016,(BELT PARKWAY)
9,0.012,(FDR DRIVE)


[[['5:32',
   '21:35',
   '16:15',
   '16:00',
   '8:25',
   '17:11',
   '17:30',
   '23:30',
   '17:00',
   '21:15',
   '21:06',
   '20:00',
   '11:15',
   '22:50',
   '15:49',
   '14:50',
   '11:00',
   '22:20',
   '22:58',
   '14:45',
   '13:00',
   '11:00',
   '13:30',
   '14:40',
   '21:43',
   '17:40',
   '0:50',
   '10:30',
   '16:35',
   '17:20',
   '21:20',
   '17:20',
   '14:30',
   '23:20',
   '18:15',
   '13:00',
   '20:14',
   '15:16',
   '14:30',
   '16:30',
   '14:30',
   '19:30',
   '13:15',
   '21:08',
   '20:34',
   '12:16',
   '18:15',
   '19:30',
   '21:04',
   '1:40',
   '18:00',
   '8:30',
   '12:05',
   '11:00',
   '17:00',
   '12:00',
   '20:27',
   '18:25',
   '16:08',
   '11:55',
   '1:30',
   '3:25',
   '8:05',
   '20:13',
   '16:30',
   '20:15',
   '16:37',
   '6:05',
   '8:40',
   '20:15',
   '23:00',
   '20:55',
   '17:25',
   '6:45',
   '12:00',
   '15:53',
   '17:00',
   '19:10',
   '13:55',
   '0:00',
   '16:00',
   '22:25',
   '10:09',
   '0:01',
   '2