# Importing Libraries

In [None]:
!pip install swifter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import swifter

# Collecting Data

In [None]:
# Read the raw accident records from disk and preview the first rows.
csv_path = 'US_Accidents_Dec21_updated.csv'
original_data = pd.read_csv(csv_path)
original_data.head()

# Dropping unnecessary/empty columns

In [None]:
# Work on an independent deep copy so `original_data` stays untouched, and
# remove the two columns that are not used in the analysis.
data = original_data.copy(deep=True).drop(columns=['Country', 'Turning_Loop'])

# Checking imbalance in dataset for Severity

In [None]:
# Global plot defaults: figure size and base font size.
plt.rcParams.update({"figure.figsize": [10, 6], "font.size": 15})

# Fraction of records at each severity level (normalize=True makes the
# counts sum to 1).
severity_norm = data['Severity'].value_counts(normalize=True)

plt.bar(severity_norm.index, severity_norm.values)
plt.title("Types of severity")
plt.xlabel("Severity")
plt.ylabel("Normalized frequency")
plt.show()

# Distribution of Wind Directions

In [None]:
# Set the figure size BEFORE plotting: rcParams only apply to figures created
# afterwards, so setting it after the plot call leaves this chart at the
# previous size.
plt.rcParams["figure.figsize"] = [14, 6]

# Bar chart of how often each wind-direction label occurs.
data['Wind_Direction'].value_counts().plot.bar();
plt.xlabel("Wind Directions");
plt.ylabel("Frequency");
plt.title("Types of directions");

# Distribution of Weather Conditions

In [None]:
# Bar chart of the five most frequent weather conditions.
fig, ax = plt.subplots(figsize=(16, 7))

condition_counts = data['Weather_Condition'].value_counts()
top_conditions = condition_counts.sort_values(ascending=False).head(5)
# Series.plot draws on the current axes, which is the `ax` created above.
top_conditions.plot.bar(width=0.5, edgecolor='k', align='center', linewidth=2)

plt.title('Top Weather Conditions for accidents', fontsize=25)
plt.xlabel('Weather_Condition', fontsize=20)
plt.ylabel('Number of Accidents', fontsize=20)
ax.tick_params(labelsize=20)
plt.grid()
plt.ioff()

# State-wise distribution

In [None]:
# Accidents per resident for each state.

# Lookup table: state code -> value of the `pop_2014` column in states.xlsx.
states = pd.read_excel('states.xlsx').drop(columns=['state'])
dictionary_pop = states.set_index('code')['pop_2014'].to_dict()

# Number of accident records per state; the single unnamed column of the
# resulting frame carries the label 0.
df_freq = pd.DataFrame(original_data.groupby('State').size())
dictionary_freq = {s: df_freq.loc[s][0] for s in df_freq.index}

# Divide every state's count by its population (raises KeyError for a state
# code that is missing from the population table).
dict_final = {k: count / dictionary_pop[k] for k, count in dictionary_freq.items()}

x_values, y_values = list(dict_final), list(dict_final.values())
plt.figure(figsize=(17, 8))
plt.bar(x_values, y_values)

# Roll-up operations on City, County, State

In [None]:
def roll_up(df, col):
    """Aggregate `df` to the geographic level named by `col`.

    The slicing below assumes the frame starts with the hierarchy columns in
    the order City, County, State, Country, followed by the measure columns
    (TODO confirm against the caller).  Every column left of `col` is
    discarded, the rows are grouped by `col`, and the numeric columns to its
    right are averaged.

    Args:
        df: DataFrame laid out as described above.
        col: One of 'City', 'County', 'State' or 'Country'.

    Returns:
        DataFrame with one row per distinct value of `col`, holding `col`
        itself and the mean of each numeric column to its right.

    Raises:
        ValueError: If `col` is not one of the supported levels.
    """
    levels = {'City': 1, 'County': 2, 'State': 3, 'Country': 4}
    if col not in levels:
        # An unknown name must not fall through to a negative level, which
        # would silently slice the last two columns of the frame.
        raise ValueError(
            "col must be one of %s, got %r" % (sorted(levels), col))

    df = df.iloc[:, levels[col] - 1:]
    group_col = df.columns[0]
    agg_cols = list(df.columns[1:])

    # numeric_only=True: the coarser hierarchy labels that sit to the right
    # of `col` are strings and cannot be averaged.
    grouped = df.groupby([group_col])[agg_cols]
    return grouped.mean(numeric_only=True).reset_index()

# Roll the frame up to county level, then report the row count and the table.
rolled_up_df = roll_up(df, 'County')
print(len(rolled_up_df), rolled_up_df, sep='\n')


# Kernel Density Estimate (KDE) Plot of Humidity by Severity

In [None]:
# seaborn is otherwise only imported in a later cell; import it here so this
# cell also works when the notebook is run top to bottom.
import seaborn as sns

# Kernel density estimate of humidity, one curve per severity level.
sns.displot(data, x="Humidity(%)", hue="Severity",kind='kde', palette="Set1", height=7);

# Calculating Duration of Accidents

In [None]:
from datetime import datetime

def duration(row):
    """Return the time elapsed between a row's 'Start_Time' and 'End_Time'.

    Both values are parsed as '%Y-%m-%d %H:%M:%S' strings; anything from the
    first '.' onwards (a fractional-second suffix) is dropped before parsing.

    Returns:
        datetime.timedelta equal to end minus start.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    raw_stamps = [row['Start_Time'], row['End_Time']]

    trimmed = []
    for stamp in raw_stamps:
        if '.' in stamp:
            # Keep only the text before the first dot.
            stamp = stamp.split('.', 1)[0]
        trimmed.append(stamp)

    start, end = [datetime.strptime(stamp, timestamp_format) for stamp in trimmed]
    return end - start


# Apply duration() to every row (via swifter's apply) and preview the result.
time_columns = data[['Start_Time', 'End_Time']]
data['Duration'] = time_columns.swifter.apply(duration, axis=1)
data['Duration'].head()

## Extracting hours from duration

In [None]:
def convert_hours(td):
    """Express the timedelta `td` as a (possibly fractional) number of hours."""
    seconds_per_hour = 3600
    elapsed_seconds = td.total_seconds()
    return elapsed_seconds / seconds_per_hour

# Duration of each accident expressed in hours.
duration_in_hours = data['Duration'].swifter.apply(convert_hours)
data['Duration_hours'] = duration_in_hours
data['Duration_hours']

## Plotting Durations and their frequencies

In [None]:
# Histogram of accident durations in one-hour bins from 0 to 10 hours.
plt.figure(figsize=(15, 10))
# `data` is the frame that holds 'Duration_hours'; the name `exp4` used here
# before is not defined anywhere in the notebook.
values, bins, bars = plt.hist(data['Duration_hours'], bins=[0,1,2,3,4,5,6,7,8,9,10],edgecolor='white')
plt.xlabel("Hours")
plt.ylabel("Frequency")
# plt.title must be CALLED.  Assigning to it (`plt.title = ...`) replaces the
# pyplot function with a string and breaks every later plt.title(...) call.
plt.title('Hours/Frequency')
# Print the count above each bar.
plt.bar_label(bars, fontsize=20, color='navy')
plt.margins(x=0.01, y=0.1)
plt.show()

Notes about sorted_hours:

Total records: 2845342

1. Records<1402: 2844158 >1402: 1183 (99.96%)
2. Records<70: 2841284 >70: 2874 (99.86%)
3. Records<7: 2697063 >7: 143264 (94.79%)
4. Records<1: 618632 >1: 1591783 (21.74%)

# Monthly distribution of accidents

In [None]:
def month(d):
    """Return characters 5-6 of `d`.

    For a timestamp string shaped like 'YYYY-MM-DD ...' this is the two-digit
    month.
    """
    month_start, month_end = 5, 7
    return d[month_start:month_end]
# Number of accidents per calendar month.
data['Month'] = data['Start_Time'].swifter.apply(month)
d = data['Month'].value_counts().to_dict()
months, frequencies = list(d), list(d.values())

# Order the counts by month key so they line up with the labels below
# (this relies on all twelve months being present in the data).
values = [d[k] for k in sorted(d)]

month_labels = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
plt.bar(month_labels, values)

# Yearly increase in accidents

In [None]:
def year(d):
    """Return the first four characters of `d`.

    For a timestamp string shaped like 'YYYY-MM-DD ...' this is the year.
    """
    year_length = 4
    return d[:year_length]

data['Year'] = data['Start_Time'].swifter.apply(year)

# Accident count for every (year, month) pair, sorted by year then month.
exp = data[['Year', 'Month']].sort_values(['Year', 'Month'], ascending=[True, True])
grouped = exp.groupby(['Year', 'Month']).size()
time_index = list(grouped.index)
l = list(grouped[:])

plt.plot(l)

# Mark the January point of each year from 2016 to 2021 and label it with
# the year.
january_keys = [(str(y), '01') for y in range(2016, 2022)]
for i, count in enumerate(l):
    if time_index[i] in january_keys:
        plt.text(i, count, time_index[i][0])
        plt.scatter(i, count)

plt.ylabel('Frequency')

# Hourly Distribution of accidents

In [None]:
# Take an explicit copy: an 'Hour' column is assigned into this frame further
# down, and writing into a bare column selection of `data` triggers pandas'
# SettingWithCopyWarning.
hourly_df = data[['Start_Time']].copy()

def extract_hours(s):
    """Return characters 11-12 of `s` as an int.

    For a timestamp string shaped like 'YYYY-MM-DD HH:MM:SS' this is the hour
    of day.
    """
    hour_text = s[11:13]
    return int(hour_text)

# Number of accidents per hour of day.
hour_of_day = hourly_df['Start_Time'].swifter.apply(extract_hours)
hourly_df['Hour'] = hour_of_day
grouped_hrs = hourly_df.groupby(['Hour']).size()

plt.plot(grouped_hrs)
plt.grid()
plt.xlabel('Hour')
plt.ylabel('Frequency')
# One tick per group: 0 .. number of distinct hours - 1.
plt.xticks(list(range(len(grouped_hrs))))

# Calculating latitude and longitude of accident by averaging start and end latitude and longitude

In [None]:
def calc_latlng(row):
    """Return the (latitude, longitude) midpoint of a row's start and end points.

    Each coordinate is the arithmetic mean of the matching 'Start_*' and
    'End_*' value of `row`.
    """
    def midpoint(first, second):
        return (first + second) / 2

    return (midpoint(row['Start_Lat'], row['End_Lat']),
            midpoint(row['Start_Lng'], row['End_Lng']))

# One (latitude, longitude) midpoint pair per accident row.
combo = data[['Start_Lat','End_Lat','Start_Lng','End_Lng']].swifter.apply(calc_latlng,axis=1)

# Split the pairs into two parallel lists.
latitudes = [lat for lat, _ in combo]
longitudes = [lng for _, lng in combo]

# Show only a short preview: printing both complete lists emits one value per
# record (millions of numbers) and can stall the notebook.
print(latitudes[:5])
print(longitudes[:5])

data['Latitude'] = latitudes
data['Longitude'] = longitudes

data[['Latitude','Longitude']]

# Global accidents

In [None]:
!pip install datashader
import datashader as ds, pandas as pd, colorcet as cc
import holoviews as hv
from holoviews.element.tiles import EsriImagery,EsriUSATopo,EsriTerrain,CartoMidnight,StamenWatercolor,StamenTonerBackground
from holoviews.operation.datashader import datashade
from holoviews.element import tiles as hvts

# Draw every accident location on top of a map-tile background with
# holoviews + datashader.
#longitudes,latitudes = ds.utils.lnglat_to_meters(data['Longitude'],data['Latitude'])
# Select bokeh as the holoviews plotting backend.
hv.extension('bokeh')

# Semi-transparent Esri imagery tiles used as the base layer.
map_tiles  = EsriImagery().opts(alpha=0.5, width=900, height=600, bgcolor='black')
# lnglat_to_meters converts the longitude/latitude columns to metre-based
# coordinates (presumably Web Mercator, to match the tiles — TODO confirm).
points = hv.Points(ds.utils.lnglat_to_meters(data['Longitude'], data['Latitude']))
# NOTE(review): the labels layer built on the next line is neither assigned
# nor overlaid, so it appears to have no effect on the final plot.
hvts.StamenLabels().options(level='annotation', alpha=1)
print(points)

# Rasterise the points into a 900x600 image coloured with the `bmw` colormap.
us_accidents = datashade(points, x_sampling=1, y_sampling=1, cmap=cc.bmw, width=900, height=600)

# Overlay the accident raster on the tiles; as the last expression of the
# cell, this is what the notebook displays.
map_tiles * us_accidents

# Precipitation effects

In [None]:
# Histogram of precipitation for the records whose value is not exactly zero.
# NOTE(review): `!= 0.0` is also True for NaN, so rows with missing
# precipitation remain in `prec` — confirm that this is intended.
not_zero = data['Precipitation(in)'] != 0.0
prec = data[not_zero]

precipitation_bins = [0,0.01,0.02,0.03,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]
plt.hist(prec['Precipitation(in)'], bins=precipitation_bins)
plt.xlabel('Precipitation(inches)')
plt.ylabel('Frequency')

# Taking Numerical columns for predicting Severity

In [None]:
# Feature columns used to predict Severity.
feature_columns = ['Distance(mi)',
       'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
       'Visibility(mi)', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Sunrise_Sunset', 'Civil_Twilight',
       'Nautical_Twilight', 'Astronomical_Twilight']

# .copy() makes this an independent frame, so the column re-encoding and
# imputation applied to it afterwards do not trigger pandas'
# SettingWithCopyWarning.
numerical_data = data[feature_columns].copy()

def daynight(s):
    """Encode a day/night label as an integer.

    Returns 0 for 'Night', 1 for 'Day' and -1 for any other value.
    """
    label_codes = (('Night', 0), ('Day', 1))
    for label, code in label_codes:
        if s == label:
            return code
    # Anything that is neither label (e.g. a missing value) maps to -1.
    return -1

# train_test_split is used below but is otherwise only imported in a much
# later cell; import it here so this cell runs when executed in order.
from sklearn.model_selection import train_test_split

# Encode the four day/night indicator columns as integers with daynight().
for col in ['Sunrise_Sunset', 'Civil_Twilight',
            'Nautical_Twilight', 'Astronomical_Twilight']:
    numerical_data[col] = numerical_data[col].swifter.apply(daynight)

# Impute missing values with the column mean.  All twelve columns are treated
# as numeric here, so one frame-wide fillna replaces the per-column loop.
numerical_data = numerical_data.fillna(numerical_data.mean())

target = data['Severity']

# Hold out 25% of the rows for testing; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    numerical_data, target, random_state=42, test_size=0.25)

# Case 1: Predicting Severity using all numerical columns

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Grid-search a random forest over the number of trees, scored by macro F1
# with 5-fold cross-validation.
# max_features lists only 'sqrt': for classifiers 'auto' was an alias of
# 'sqrt' (so it merely duplicated every fit) and scikit-learn 1.3 removed it.
rf_base = RandomForestClassifier()
grid = {'n_estimators': [10, 50, 100],
        'max_features': ['sqrt']}
clf_rf_base = GridSearchCV(rf_base, grid, cv=5, n_jobs=8, scoring='f1_macro')

clf_rf_base.fit(x_train, y_train)
y_pred_rf_base = clf_rf_base.predict(x_test)

print(classification_report(y_test, y_pred_rf_base))

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression with default hyper-parameters.
# fit() returns the fitted estimator, so it can be bound in one step.
log_base = LogisticRegression().fit(x_train, y_train)
y_pred_log_base = log_base.predict(x_test)

log_report = classification_report(y_test, y_pred_log_base)
print(log_report)

## Support Vector Classifier

In [None]:
from sklearn.svm import SVC

# Baseline support vector classifier with gamma='auto'.
# NOTE(review): kernel SVC fit time grows at least quadratically with the
# number of rows; confirm this cell is practical on the full training set.
svm_base = SVC(gamma='auto').fit(x_train, y_train)
y_pred_svm_base = svm_base.predict(x_test)

svm_report = classification_report(y_test, y_pred_svm_base)
print(svm_report)

## K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 3 closest training rows.
knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
y_pred_knn_3 = knn.predict(x_test)

knn_report = classification_report(y_test, y_pred_knn_3)
print(knn_report)

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
nb_base = GaussianNB().fit(x_train, y_train)
y_pred_nb_base = nb_base.predict(x_test)

nb_report = classification_report(y_test, y_pred_nb_base)
print(nb_report)

## Multi-layer Perceptron

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with two hidden layers (15 and 5 units), trained
# with the L-BFGS solver; random_state fixes the weight initialisation.
mlp_settings = dict(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15, 5), random_state=1)
mlp_15_5 = MLPClassifier(**mlp_settings).fit(x_train, y_train)
y_pred_mlp_15_5 = mlp_15_5.predict(x_test)

mlp_report = classification_report(y_test, y_pred_mlp_15_5)
print(mlp_report)

# Checking correlated columns

In [None]:
import seaborn as sns

# Absolute pairwise correlations between the feature columns.
hm = numerical_data.corr().abs()

# Keep only the cells above 0.8 (all others become NaN) and draw them.
strong_correlations = hm[hm > 0.8]
sns.heatmap(strong_correlations)

# Removing correlated columns

In [None]:
# Remove the columns that are correlated by more than 80% with another one.
numerical_data = numerical_data.drop(columns=['Wind_Chill(F)','Civil_Twilight','Nautical_Twilight','Astronomical_Twilight'])

# Re-create the train/test split from the reduced feature set.  Without this,
# x_train/x_test still hold the split made from all columns, and the models
# fitted next would see exactly the same data as before the drop.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    numerical_data, target, random_state=42, test_size=0.25)

# Case 2: Predicting Severity using un-correlated columns

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Grid-search a random forest over the number of trees, scored by macro F1
# with 5-fold cross-validation.
# max_features lists only 'sqrt': for classifiers 'auto' was an alias of
# 'sqrt' (so it merely duplicated every fit) and scikit-learn 1.3 removed it.
# NOTE(review): this cell uses whatever x_train/x_test currently hold — make
# sure the split was rebuilt after the correlated columns were dropped.
rf_base = RandomForestClassifier()
grid = {'n_estimators': [10, 50, 100],
        'max_features': ['sqrt']}
clf_rf_base = GridSearchCV(rf_base, grid, cv=5, n_jobs=8, scoring='f1_macro')

clf_rf_base.fit(x_train, y_train)
y_pred_rf_base = clf_rf_base.predict(x_test)

print(classification_report(y_test, y_pred_rf_base))

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyper-parameters on the current split.
# fit() returns the fitted estimator, so it can be bound in one step.
log_base = LogisticRegression().fit(x_train, y_train)
y_pred_log_base = log_base.predict(x_test)

log_report = classification_report(y_test, y_pred_log_base)
print(log_report)

## Support Vector Classifier

In [None]:
from sklearn.svm import SVC

# Support vector classifier with gamma='auto' on the current split.
# NOTE(review): kernel SVC fit time grows at least quadratically with the
# number of rows; confirm this cell is practical on the full training set.
svm_base = SVC(gamma='auto').fit(x_train, y_train)
y_pred_svm_base = svm_base.predict(x_test)

svm_report = classification_report(y_test, y_pred_svm_base)
print(svm_report)

## K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 3 closest training rows.
knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
y_pred_knn_3 = knn.predict(x_test)

knn_report = classification_report(y_test, y_pred_knn_3)
print(knn_report)

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings on the current split.
nb_base = GaussianNB().fit(x_train, y_train)
y_pred_nb_base = nb_base.predict(x_test)

nb_report = classification_report(y_test, y_pred_nb_base)
print(nb_report)

## Multi-layer Perceptron

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with two hidden layers (15 and 5 units), trained
# with the L-BFGS solver; random_state fixes the weight initialisation.
mlp_settings = dict(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15, 5), random_state=1)
mlp_15_5 = MLPClassifier(**mlp_settings).fit(x_train, y_train)
y_pred_mlp_15_5 = mlp_15_5.predict(x_test)

mlp_report = classification_report(y_test, y_pred_mlp_15_5)
print(mlp_report)

# Case 3: Predicting Severity using un-correlated columns as well as Side and Timezone

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Grid-search a random forest over the number of trees, scored by macro F1
# with 5-fold cross-validation.
# max_features lists only 'sqrt': for classifiers 'auto' was an alias of
# 'sqrt' (so it merely duplicated every fit) and scikit-learn 1.3 removed it.
# NOTE(review): the section title mentions Side and Timezone, but this cell
# uses whatever x_train/x_test currently hold — confirm those columns were
# added to the features and the split was rebuilt before running it.
rf_base = RandomForestClassifier()
grid = {'n_estimators': [10, 50, 100],
        'max_features': ['sqrt']}
clf_rf_base = GridSearchCV(rf_base, grid, cv=5, n_jobs=8, scoring='f1_macro')

clf_rf_base.fit(x_train, y_train)
y_pred_rf_base = clf_rf_base.predict(x_test)

print(classification_report(y_test, y_pred_rf_base))

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyper-parameters on the current split.
# fit() returns the fitted estimator, so it can be bound in one step.
log_base = LogisticRegression().fit(x_train, y_train)
y_pred_log_base = log_base.predict(x_test)

log_report = classification_report(y_test, y_pred_log_base)
print(log_report)

## Support Vector Classifier

In [None]:
from sklearn.svm import SVC

# Support vector classifier with gamma='auto' on the current split.
# NOTE(review): kernel SVC fit time grows at least quadratically with the
# number of rows; confirm this cell is practical on the full training set.
svm_base = SVC(gamma='auto').fit(x_train, y_train)
y_pred_svm_base = svm_base.predict(x_test)

svm_report = classification_report(y_test, y_pred_svm_base)
print(svm_report)

## K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 3 closest training rows.
knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
y_pred_knn_3 = knn.predict(x_test)

knn_report = classification_report(y_test, y_pred_knn_3)
print(knn_report)

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings on the current split.
nb_base = GaussianNB().fit(x_train, y_train)
y_pred_nb_base = nb_base.predict(x_test)

nb_report = classification_report(y_test, y_pred_nb_base)
print(nb_report)

## Multi-layer Perceptron

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with two hidden layers (15 and 5 units), trained
# with the L-BFGS solver; random_state fixes the weight initialisation.
mlp_settings = dict(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15, 5), random_state=1)
mlp_15_5 = MLPClassifier(**mlp_settings).fit(x_train, y_train)
y_pred_mlp_15_5 = mlp_15_5.predict(x_test)

mlp_report = classification_report(y_test, y_pred_mlp_15_5)
print(mlp_report)

# Conducting statistical tests

## Friedman's chi2 test

In [None]:
from scipy.stats import friedmanchisquare
import numpy as np

# Hand-entered accuracy scores of four models, three values per model
# (presumably one per feature-set case — verify against the reports).
rf_acc = [0.76, 0.90, 0.91]
log_acc = [0.73, 0.89, 0.89]
mlp_acc = [0.65, 0.86, 0.80]
nb_acc = [0.70, 0.72, 0.89]

# 4x3 array with one row per model, in the order RF, LR, NB, MLP.
accuracy_scores = np.array([rf_acc, log_acc, nb_acc, mlp_acc])

# Friedman's test, passing each row as one sample.
f_value, p_value = friedmanchisquare(*accuracy_scores)

print("Friedman's test statistic:", f_value)
print("P-value:", p_value)

## Kruskal-Wallis test

In [None]:
from scipy.stats import kruskal

# Kruskal-Wallis H-test, passing each row of accuracy_scores as one sample.
kruskal_result = kruskal(*accuracy_scores)
kw_stat, p_value = kruskal_result
print("P-value:", p_value)

## ANOVA test

In [None]:
from scipy.stats import f_oneway

# One-way ANOVA, passing each row of accuracy_scores as one group.
anova_result = f_oneway(*accuracy_scores)
f_stat, p_value = anova_result
print("P-value:", p_value)

## Bonferroni test

### Taking accuracies from different cases

In [None]:
from statsmodels.stats.multitest import multipletests

# Per-model accuracies, each ordered as (case 1, case 2, case 3).
rf_acc = [0.76, 0.90, 0.91]
log_acc = [0.73, 0.89, 0.89]
mlp_acc = [0.65, 0.86, 0.80]
nb_acc = [0.70, 0.72, 0.89]

# Regroup the same numbers by case; within each list the order is
# random forest, logistic regression, MLP, naive Bayes.
#   case1 - all numerical columns
#   case2 - un-correlated columns only
#   case3 - un-correlated columns plus two more categorical columns
case1, case2, case3 = (list(case) for case in zip(rf_acc, log_acc, mlp_acc, nb_acc))

### Case 1:

In [None]:
# Bonferroni adjustment of (1 - accuracy) for the four case-1 models.
# NOTE(review): an error rate is not a p-value, so the "corrected p-values"
# below may not support any inferential claim — confirm the intended test.
alpha = 0.05
p_values = list(map(lambda model_score: 1 - model_score, case1))
bonferroni_result = multipletests(p_values, alpha=alpha, method='bonferroni')
reject, corrected_p_values = bonferroni_result[0], bonferroni_result[1]
corrected_p_values

### Case 2:

In [None]:
# Bonferroni adjustment of (1 - accuracy) for the four case-2 models.
# NOTE(review): an error rate is not a p-value, so the "corrected p-values"
# below may not support any inferential claim — confirm the intended test.
alpha = 0.05
p_values = list(map(lambda model_score: 1 - model_score, case2))
bonferroni_result = multipletests(p_values, alpha=alpha, method='bonferroni')
reject, corrected_p_values = bonferroni_result[0], bonferroni_result[1]
corrected_p_values

### Case 3:

In [None]:
# Bonferroni adjustment of (1 - accuracy) for the four case-3 models.
# NOTE(review): an error rate is not a p-value, so the "corrected p-values"
# below may not support any inferential claim — confirm the intended test.
alpha = 0.05
p_values = list(map(lambda model_score: 1 - model_score, case3))
bonferroni_result = multipletests(p_values, alpha=alpha, method='bonferroni')
reject, corrected_p_values = bonferroni_result[0], bonferroni_result[1]
corrected_p_values

# Using Natural Language Processing to predict severity

In [None]:
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import re,nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Independent copy: the Mod* columns are added to this frame, and writing
# into a bare column selection of `data` triggers SettingWithCopyWarning.
rel = data[['Description','Severity']].copy()

# Text cleaning, one step per column:
#   Mod1 lower-cased text (a missing description becomes '' so .lower()
#        cannot fail on NaN)
#   Mod2 everything except ASCII letters and whitespace removed
#   Mod3 list of whitespace-separated tokens
#   Mod4 tokens with English stop words removed
#   Mod5 remaining tokens re-joined into one string
rel['Mod1'] = rel['Description'].fillna('').apply(lambda x: x.lower())
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
rel['Mod2'] = rel['Mod1'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
rel['Mod3'] = rel['Mod2'].apply(lambda x: x.split())
stop_words = set(stopwords.words('english'))
rel['Mod4'] = rel['Mod3'].apply(lambda x: [word for word in x if word not in stop_words])
rel['Mod5'] = rel['Mod4'].apply(lambda x: ' '.join(x))

# TF-IDF representation limited to 100 features.
tfidf_vectorizer = TfidfVectorizer(max_features=100)

# Fit and transform the text data to a TF-IDF representation
tfidf_matrix = tfidf_vectorizer.fit_transform(rel['Mod5'])

# 80/20 train/test split of the TF-IDF rows.
X_train,X_test,y_train,y_test = train_test_split(tfidf_matrix,rel['Severity'], test_size=0.2, random_state=22)
# toarray() yields plain ndarrays; todense() returns np.matrix, which current
# scikit-learn estimators reject.
X_train = X_train.toarray()
X_test = X_test.toarray()

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Create a logistic regression model
logreg_model = LogisticRegression()

# Train the model on the TF-IDF training rows and their labels
logreg_model.fit(X_train, y_train)

# Predict the held-out rows (X_test is already in TF-IDF form)
predictions = logreg_model.predict(X_test)

# Print the predictions
print(predictions)

# scikit-learn metrics take (y_true, y_pred).  With the arguments reversed,
# classification_report swaps precision and recall for every class.
print(accuracy_score(y_test, predictions))

print(classification_report(y_test, predictions))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create a Random Forest model
rf_model = RandomForestClassifier()

# Train the model on the TF-IDF training rows and their labels
rf_model.fit(X_train, y_train)

# Predict the held-out rows (X_test is already in TF-IDF form)
rf_predictions = rf_model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred).  With the arguments reversed,
# classification_report swaps precision and recall for every class.
print(accuracy_score(y_test, rf_predictions))

print(classification_report(y_test, rf_predictions))

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Create a Gaussian naive Bayes model
nb_model = GaussianNB()

# Train the model on the TF-IDF training rows and their labels
nb_model.fit(X_train, y_train)

# Predict the held-out rows (X_test is already in TF-IDF form)
nb_predictions = nb_model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred).  With the arguments reversed,
# classification_report swaps precision and recall for every class.
print(accuracy_score(y_test, nb_predictions))

print(classification_report(y_test, nb_predictions))

# K-Means Clustering on Location

In [None]:
# KMeans is not imported anywhere else in the notebook.
from sklearn.cluster import KMeans

clust = data.copy(deep=True)
clust = clust.dropna(subset=['Latitude', 'Longitude'])

# Cluster on the 'Latitude' and 'Longitude' columns
X = clust[['Latitude', 'Longitude']]

# Elbow method: within-cluster sum of squares (inertia) for k = 1..10
cluster_counts = range(1, 11)
wcss = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# Plot the curve so the elbow can actually be read off; without this the
# ten fits above produce no visible result.
plt.plot(list(cluster_counts), wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Fit the k-means model to the data with the chosen number of clusters
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
y_kmeans = kmeans.fit_predict(X)

# Visualize the clusters: longitude on the x-axis, latitude on the y-axis
cluster_colors = ['red', 'blue', 'green', 'cyan']
for cluster_id, color in enumerate(cluster_colors):
    members = X[y_kmeans == cluster_id]
    plt.scatter(members['Longitude'], members['Latitude'], c=color,
                label='Cluster %d' % (cluster_id + 1), s=1)
# cluster_centers_ follows the column order of X: column 0 is latitude and
# column 1 is longitude.
plt.scatter(kmeans.cluster_centers_[:, 1],kmeans.cluster_centers_[:, 0], c = 'yellow', label = 'Centroids',s=5)
plt.title('Clusters of Accidents')
# The axis labels match the plotted data: x is longitude, y is latitude.
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend()
plt.show()

# Association Rule Mining

In [None]:
!pip install mlxtend
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Read the checkpoint file and cast Severity to str, so that get_dummies
# one-hot encodes it instead of passing it through as a number.
df = pd.read_csv('Checkpoint1.csv').astype({'Severity': str})

# Road-feature flags to mine, plus the severity level.
road_features = ['Traffic_Signal', 'Crossing', 'Junction', 'Stop', 'Amenity',
                 'Bump', 'Give_Way', 'No_Exit', 'Railway', 'Station']
features = road_features + ['Severity']

# Indicator columns for the selected features.
df_features = pd.get_dummies(df[features])

# Itemsets present in at least 1% of the rows (Apriori).
frequent_itemsets = apriori(df_features, min_support=0.01, use_colnames=True)

# Rules built from those itemsets, keeping the ones with lift >= 1.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

In [None]:
# Display the frequent itemsets returned by apriori.
frequent_itemsets

In [None]:
# Display the association rules generated from the frequent itemsets.
rules