In [1]:
from configparser import ConfigParser
import psycopg2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import recall_score, precision_score
#Can be very helpful to notice any imbalance in classes
from collections import Counter 

In this part, we try to connect to PSQL

In [2]:
#Sourced from https://www.postgresqltutorial.com/postgresql-python/connect/
def config(filename='psql_sample.ini', section='postgresql'):
    parser = ConfigParser()
    # read config file
    parser.read(filename) 
 
    # get section, default to postgresql
    db = {}
    if parser.has_section(section):
        params = parser.items(section)
        for param in params:
            db[param[0]] = param[1]
    else:
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))
 
    return db

In [3]:
#Get the configuration file as a python dictionary
cfg = config()

In [4]:
#Establish the connection and create a cursor to the database
try:
    print("Here's an attempt to connect to the database")
    conn = psycopg2.connect(**cfg)
    cursor = conn.cursor()
    print("Look's like it was a success")
    
except (Exception, psycopg2.DatabaseError) as error:
    print(error)

Here's an attempt to connect to the database
Look's like it was a success


In [5]:
try:
    #Lets get our data 
    cursor.execute("SELECT environment, road_surface,\
                   traffic_control, visibility, impact_type,\
                   total_rain, total_snow, is_intersection, is_fatal from fact_table as fact inner join accident_dimension\
                   as accident on fact.accident_key = accident.accident_key inner join weather_dimension as weather\
                   on fact.weather_key = weather.weather_key"
                  )

    #Get the complete result set. It will be a list of tuples where each tuple is a row from the result set
    result_list = cursor.fetchall()
        
except (Exception, psycopg2.DatabaseError) as error:
    print(error)

In [6]:
#Ensure to run this cell at the end of all your experiments to close all connections
cursor.close()
conn.close()

Let's prepare the DataFrame.

In [7]:
#Now, 
result_df = pd.DataFrame(result_list, columns=["Environment","Road_Surface", "Traffic_Control", 
                                               "Visibility", "impact_type","total_rain","total_snow","Is_Intersection","Is_fatal"])

Let's see how the data looks like

In [8]:
result_df.head()

Unnamed: 0,Environment,Road_Surface,Traffic_Control,Visibility,impact_type,total_rain,total_snow,Is_Intersection,Is_fatal
0,Clear,Wet,Traffic signal,Daylight,Rear end,30.2,0.0,False,False
1,Clear,Dry,Traffic signal,Daylight,Turning movement,0.0,0.0,True,False
2,Clear,Dry,Traffic signal,Daylight,Rear end,0.0,0.0,False,False
3,Snow,Loose snow,Traffic signal,Daylight,Rear end,2.2,21.6,False,False
4,Rain,Wet,Traffic signal,Daylight,Rear end,26.4,0.0,False,False


Let's count the number of instances for each class

In [9]:
Counter(result_df['Is_Intersection'])

Counter({False: 11168, True: 3137})

In [10]:
Counter(result_df['Is_fatal'])

Counter({False: 14277, True: 28})

Both labels are highly imbalanced! We need to make sure to consider this when we test our models.

## let's first work on Is_Intersection, and see how well we can predict whether an accident happened in an intersection or not.

In [11]:
#Get the labels
y = result_df['Is_Intersection']
#Drop the column for the next step
result_df.drop(columns=['Is_Intersection'], inplace=True)
result_df.drop(columns=['Is_fatal'], inplace=True)

In [12]:
result_df.head()# this is our features

Unnamed: 0,Environment,Road_Surface,Traffic_Control,Visibility,impact_type,total_rain,total_snow
0,Clear,Wet,Traffic signal,Daylight,Rear end,30.2,0.0
1,Clear,Dry,Traffic signal,Daylight,Turning movement,0.0,0.0
2,Clear,Dry,Traffic signal,Daylight,Rear end,0.0,0.0
3,Snow,Loose snow,Traffic signal,Daylight,Rear end,2.2,21.6
4,Rain,Wet,Traffic signal,Daylight,Rear end,26.4,0.0


In [13]:
# Our features have categorical data. We need to convert them to one-hot encoders to get the best results.
new_result_df = pd.get_dummies(result_df, prefix=["Environment", "Road_Surface", "Traffic_Control", "Visibility","impact_type"])
new_result_df["total_rain"] = result_df["total_rain"]
new_result_df["total_snow"] = result_df["total_snow"]
#Get the X values
X = new_result_df.values
new_result_df.head()

Unnamed: 0,total_rain,total_snow,Environment_Clear,Environment_Drifting Snow,"Environment_Fog, mist, smoke, dust",Environment_Freezing Rain,Environment_Other,Environment_Rain,Environment_Snow,Environment_Strong wind,...,Visibility_Other,Visibility_Unknown,impact_type_Angle,impact_type_Approaching,impact_type_Other,impact_type_Rear end,impact_type_SMV other,impact_type_SMV unattended vehicle,impact_type_Sideswipe,impact_type_Turning movement
0,30.2,0.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0.0,0.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0.0,0.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,2.2,21.6,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,26.4,0.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


We are done with pre-processing of our data. Next, we need to devide our data to training and testing sets.

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [15]:
#Let's see the no. of records per class in training and test set
print("Training set {} ".format(Counter(y_train)))
print("Test set {} ".format(Counter(y_test)))

Training set Counter({False: 8934, True: 2510}) 
Test set Counter({False: 2234, True: 627}) 


Now we train our DecisionTreeClassifier with our training data. There are more classifiers that you can use. Please visit https://scikit-learn.org/stable/index.html.

In [16]:
#This step is to train our classifier based on the training data.
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

In [17]:
# Now that our classifier is trained, it is time to test it with the testing data. Because our data is imbalanced, we should loom at precision and recall, rather than accuracy.
y_pred = dt.predict(X_test)
recall = recall_score(y_pred, y_test) * 100
precision = precision_score(y_pred, y_test) * 100
print("Recall of Decision Tree {:.2f} %".format(recall))
print("precision of Decision Tree {:.2f} %".format(precision))

Recall of Decision Tree 74.38 %
precision of Decision Tree 72.25 %


## Now we are done with training and testing the classifier for intersection. The next task is to see whether the accident resulted in fatal or not. We will use the same previous steps.

In [18]:
result_df = pd.DataFrame(result_list, columns=["Environment","Road_Surface", "Traffic_Control", 
                                               "Visibility", "impact_type","total_rain","total_snow","Is_Intersection","Is_fatal"])

In [19]:
y = result_df['Is_fatal']
#Drop the column for the next step
result_df.drop(columns=['Is_fatal'], inplace=True)

In [20]:
result_df.head()

Unnamed: 0,Environment,Road_Surface,Traffic_Control,Visibility,impact_type,total_rain,total_snow,Is_Intersection
0,Clear,Wet,Traffic signal,Daylight,Rear end,30.2,0.0,False
1,Clear,Dry,Traffic signal,Daylight,Turning movement,0.0,0.0,True
2,Clear,Dry,Traffic signal,Daylight,Rear end,0.0,0.0,False
3,Snow,Loose snow,Traffic signal,Daylight,Rear end,2.2,21.6,False
4,Rain,Wet,Traffic signal,Daylight,Rear end,26.4,0.0,False


This time, we are keeping is_intersection as a feature, because it could provide useful information about an accident.

In [21]:
new_result_df = pd.get_dummies(result_df, prefix=["Environment", "Road_Surface", "Traffic_Control", "Visibility","impact_type"])
new_result_df["total_rain"] = result_df["total_rain"]
new_result_df["total_snow"] = result_df["total_snow"]
new_result_df["Is_Intersection"] = result_df["Is_Intersection"]
#Get the X values
X = new_result_df.values
new_result_df.head()

Unnamed: 0,total_rain,total_snow,Is_Intersection,Environment_Clear,Environment_Drifting Snow,"Environment_Fog, mist, smoke, dust",Environment_Freezing Rain,Environment_Other,Environment_Rain,Environment_Snow,...,Visibility_Other,Visibility_Unknown,impact_type_Angle,impact_type_Approaching,impact_type_Other,impact_type_Rear end,impact_type_SMV other,impact_type_SMV unattended vehicle,impact_type_Sideswipe,impact_type_Turning movement
0,30.2,0.0,False,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0.0,0.0,True,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0.0,0.0,False,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,2.2,21.6,False,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,26.4,0.0,False,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


Let's split the data into training and testing sets. 

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [23]:
#Let's see the no. of records per class in training and test set
print("Training set {} ".format(Counter(y_train)))
print("Test set {} ".format(Counter(y_test)))

Training set Counter({False: 11422, True: 22}) 
Test set Counter({False: 2855, True: 6}) 


In [24]:
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

In [25]:
y_pred = dt.predict(X_test)
recall = recall_score(y_pred, y_test) * 100
precision = precision_score(y_pred, y_test) * 100
print("Recall of Decision Tree {:.2f} %".format(recall))
print("precision of Decision Tree {:.2f} %".format(precision))

Recall of Decision Tree 0.00 %
precision of Decision Tree 0.00 %


In [26]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)

0.9968542467668647

### We can see that the accuracy is very high, but the precision and recall scores are 0. This means thath our classifier is only predicting false! For dealing with class-imbalanced, we can use sampling techniques. You can see more of these sampling methods in the link below:
https://imbalanced-learn.org/stable/auto_examples/index.html


### Enjoy your time exploring your data :)