Railroad-Highway Intersection Characteristics Effect on Number of Injuries during Incident

Data Cleaning

In [17]:
#Data Cleaning 
import pandas as pd
import numpy as np
df = pd.read_csv('Highway-Rail_Grade_Crossing_Accident_Data.csv')

#Create Function to make text uniform
def FixText(attribute):
    List = []
    for x in df[attribute]:
        #Check to see if entry is a NaN
        if not pd.isna(x):
            Entry = x.replace(" ","") #get rid of spaces
            Entry = Entry.replace(".","") #get rid of periods
            Entry = Entry.replace("-","") #get rid of dashes
            Entry = Entry.upper() #capitalize all text
            Entry = Entry.replace("ROAD","RD") #abbreviate
            Entry = Entry.replace("STREET","ST") #abbreviate
            Entry = Entry.replace("AVENUE","AVE") #abbreviate
            Entry = Entry.replace("DRIVE","DR") #abbreviate
            Entry = Entry.replace("COURT","CT") #abbreviate
            Entry = Entry.replace("HIGHWAY","HWY") #abbreviate
            Entry = Entry.replace("LANE","LN") #abbreviate
            List.append(Entry) 
        else: 
            List.append(x) #add original entry (NaN)
    df[attribute] = List #replace original attribute with revised
#Fix text for the following attributes
print("Number of unique instances before cleaning 'Street' attribute: " + str(df['Highway Name'].nunique()))
FixText('Highway Name')
print("Number of unique instances after cleaning 'Street' attribute: " + str(df['Highway Name'].nunique()))
FixText('Railroad Name')
FixText('Highway User')
FixText('Maintenance Railroad Name')
FixText('Nearest Station')
FixText('Reporting Parent Railroad Name')
FixText('Reporting Railroad Holding Company')
FixText('Maintenance Parent Railroad Name')
FixText('Maintenance Railroad Holding Company')

#Create list that checks to see if the railroad used Positive Train Control (PTC)
PTC_check_list = []
for x in df['Narrative']:
    if pd.isna(x):
        PTC_check_list.append(0) #If NaN, give 0
    elif 'PTC' in x or 'ptc' in x:
        PTC_check_list.append(1) #If PTC is mentioned, give 1
    else:
        PTC_check_list.append(0) #Else, give 0
df['PTC_check'] = PTC_check_list

df.to_csv('CleanedData.csv') #Save Cleaned Data

  df = pd.read_csv('Highway-Rail_Grade_Crossing_Accident_Data.csv')


Number of unique instances before cleaning 'Street' attribute: 107009
Number of unique instances after cleaning 'Street' attribute: 89374


Preprocess Data

In [18]:
#Using New Dataset (Pre-Processing)
import pandas as pd
import numpy as np
df = pd.read_csv('CleanedData.csv')

from sklearn.preprocessing import OneHotEncoder
#create instance of one-hot-encoder
encoder = OneHotEncoder(handle_unknown='ignore')
def OneHotFix(attribute,labels):
    colmns = df.columns.values.tolist() #Create list of columns
    encoder_df = pd.DataFrame(encoder.fit_transform(df[[attribute]]).toarray()) #Create array using OneHotEncoder of attribute
    final_df = df.join(encoder_df) #Join encoder dataframe to original
    final_df.drop(attribute,axis=1, inplace=True) #Drop the attribute
    newcolms = [x for x in colmns if x != attribute] #create list of new columns
    for x in labels:
        newcolms.append(labels[labels.index(x)]) #append labels to new column list
    final_df.columns = newcolms #replace column names with new list
    return final_df

df['View Obstruction Code'] = df['View Obstruction Code'].replace(9,np.nan) #Clean unassociated code 9 with NaN
df = OneHotFix('View Obstruction Code',['Permanent Structure','Railroad Equipment','Passing Train','Topography','Vegetation','Highway Vehicles','Other Obstruction','Not Obstructed','Unknown Obstruction'])

df['Crossing Illuminated'] = df['Crossing Illuminated'].replace(np.nan,'Unknown') #Combine unknowns and Nans
df = OneHotFix('Crossing Illuminated',['Illuminated-No','Illuminated-Yes','Illuminated-Unknown'])

df['Crossing Warning Location Code'] = df['Crossing Warning Location Code'].replace([np.nan,0,'N'],4.0) #Clean: replace all different null values with unknown code 4
df['Crossing Warning Location Code'] = df['Crossing Warning Location Code'].astype(int) #make all entries integers (some strings)
df = OneHotFix('Crossing Warning Location Code',['Both Sides','Side of Vehicle Approach','Opposite Side of Vehicle Approach','Unknown Side'])

df['Warning Connected To Signal'] = df['Warning Connected To Signal'].replace(np.nan,'Unknown') #combine NaNs and unknowns
df = OneHotFix('Warning Connected To Signal',['Connected To Signal', 'Not Connected To Signal','Unknown If Connected To Signal'])

df = OneHotFix('Public/Private Code',['Private','Public'])

df['Crossing Warning Expanded Code 12'] = df['Crossing Warning Expanded Code 12'].replace(np.nan,11) #Replace NaNs with unknown to combine categories
df = OneHotFix('Crossing Warning Expanded Code 12',['Gates','Cantilever FLS','Standard FLS','Wig wags','Traffic Signals','Audible','Crossbucks','Stop Signs','Watchman','Flagged','Other Warning','No Warning'])

df.to_csv('PreparedData.csv')

  df = pd.read_csv('CleanedData.csv')


Decision Tree (with Unknowns)

In [19]:
#Decision Tree Classification (Crossing Characterisitics)
# ============= With Unknowns ====================
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
# identify attributes desired to study
col_names = ['Total Injured Form 55A','Train Speed','Public','Private',
            'Connected To Signal', 'Not Connected To Signal','Unknown If Connected To Signal',
            'Both Sides','Side of Vehicle Approach','Opposite Side of Vehicle Approach','Unknown Side',
            'Illuminated-Yes','Illuminated-No','Illuminated-Unknown','Permanent Structure','Railroad Equipment',
            'Passing Train','Topography','Vegetation','Highway Vehicles','Other Obstruction','Not Obstructed',
            'Unknown Obstruction','Gates','Cantilever FLS','Standard FLS','Wig wags','Traffic Signals','Audible',
            'Crossbucks','Stop Signs','Watchman','Flagged','Other Warning','No Warning']
# load dataset
data = pd.read_csv('PreparedData.csv',usecols=col_names)
# split dataset between features and target variable
col_names.remove('Total Injured Form 55A')
data = data.dropna(axis=0,subset=col_names)
sortedData = data[col_names] # Features
targetVariables = data['Total Injured Form 55A'] # Target Variable

#sortedData.to_csv('Test.csv')

X_train, X_test, y_train, y_test = train_test_split(sortedData,
                                 targetVariables, test_size=0.2, 
                                 random_state=1) # 80% training and 20% test

#Modify test and train lists so that all number of injuries 3+ are in the same category (3)
ii = 0
y_test = y_test.tolist()
mod_y_test = y_test
for x in y_test:
    if x > 3: 
        mod_y_test[ii] = 3 #change to 3
    ii += 1 #cycle index

ii = 0
y_train = y_train.tolist()
mod_y_train = y_train
for x in y_train:
    if x > 3:
        mod_y_train[ii] = 3 #change to 3
    ii += 1 #cycle index

# Create Decision Tree classifier object
clf = DecisionTreeClassifier()
# Train Decision Tree Classifier
clf = clf.fit(X_train,y_train)
#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Generate image of decison tree
from sklearn.tree import export_graphviz
from six import StringIO  
from IPython.display import Image  
import pydotplus
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True,max_depth = 7,
                feature_names = col_names)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('C:\\Users\\andre\\OneDrive\\Documents\\CU Boulder\\Summer 2022\\CSCI 5502 - Data Mining\\Final Project\\Train_DecisionTree.png')
Image(graph.create_png())

from sklearn.metrics import classification_report, confusion_matrix

clf_cv_score = cross_val_score(clf, sortedData, targetVariables, cv = 10)

#Print out results
print("=== Confusion Matrix ===")
print(confusion_matrix(mod_y_test,y_pred))
print('\n')
print("=== Classification Report ===")
print(classification_report(mod_y_test, y_pred))
print('\n')
print("=== All AUC Scores ===")
print(clf_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Decision Tree: ", clf_cv_score.mean())

Accuracy: 0.7157782782782782




=== Confusion Matrix ===
[[34072   725    82    35]
 [10008   241    32    10]
 [ 1833    59     8     1]
 [  823    20     1     2]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.73      0.98      0.83     34914
           1       0.23      0.02      0.04     10291
           2       0.07      0.00      0.01      1901
           3       0.04      0.00      0.00       846

    accuracy                           0.72     47952
   macro avg       0.27      0.25      0.22     47952
weighted avg       0.58      0.72      0.62     47952



=== All AUC Scores ===
[0.71579913 0.7166333  0.71571572 0.71571572 0.71300467 0.71350517
 0.71500667 0.71253389 0.71495308 0.71345151]


=== Mean AUC Score ===
Mean AUC Score - Decision Tree:  0.7146318857961194


Decision Tree (without unknowns)

In [20]:
#Decision Tree Classification (Crossing Characterisitics)
# ============= Getting Rid of Rows with Unknowns ====================
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
# identify attributes desired to study
col_names = ['Total Injured Form 55A','Train Speed','Public', 'Connected To Signal', 
            'Both Sides','Side of Vehicle Approach','Opposite Side of Vehicle Approach',
            'Illuminated-Yes','Illuminated-No','Permanent Structure','Railroad Equipment',
            'Passing Train','Topography','Vegetation','Highway Vehicles','Not Obstructed',
            'Gates','Cantilever FLS','Standard FLS','Wig wags','Traffic Signals','Audible',
            'Crossbucks','Stop Signs','Watchman','Flagged','No Warning']
# load dataset
data = pd.read_csv('PreparedData.csv',usecols=col_names)
# split dataset between features and target variable
col_names.remove('Total Injured Form 55A')
data = data.dropna(axis=0,subset=col_names)
sortedData = data[col_names] # Features
targetVariables = data['Total Injured Form 55A'] # Target Variable

#sortedData.to_csv('Test.csv')

X_train, X_test, y_train, y_test = train_test_split(sortedData,
                                 targetVariables, test_size=0.2, 
                                 random_state=1) # 80% training and 20% test

#Modify test and train lists so that all number of injuries 3+ are in the same category (3)
ii = 0
y_test = y_test.tolist()
mod_y_test = y_test
for x in y_test:
    if x > 2:
        mod_y_test[ii] = 3
    ii += 1

ii = 0
y_train = y_train.tolist()
mod_y_train = y_train
for x in y_train:
    if x > 2:
        mod_y_train[ii] = 3
    ii += 1

# Create Decision Tree classifier object
clf = DecisionTreeClassifier()
# Train Decision Tree Classifier
clf = clf.fit(X_train,y_train)
#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Generate image of decison tree
from sklearn.tree import export_graphviz
from six import StringIO  
from IPython.display import Image  
import pydotplus
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True,max_depth = 7,
                feature_names = col_names)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('C:\\Users\\andre\\OneDrive\\Documents\\CU Boulder\\Summer 2022\\CSCI 5502 - Data Mining\\Final Project\\Train_DecisionTree_NoUnknowns.png')
Image(graph.create_png())

from sklearn.metrics import classification_report, confusion_matrix

clf_cv_score = cross_val_score(clf, sortedData, targetVariables, cv = 10)

#Print out results
print("=== Confusion Matrix ===")
print(confusion_matrix(mod_y_test,y_pred))
print('\n')
print("=== Classification Report ===")
print(classification_report(mod_y_test, y_pred, digits = 4))
print('\n')
print("=== All AUC Scores ===")
print(clf_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Decision Tree: ", clf_cv_score.mean())

Accuracy: 0.7164664664664665




=== Confusion Matrix ===
[[34115   679    78    42]
 [10018   232    35     6]
 [ 1834    59     7     1]
 [  822    19     3     2]]


=== Classification Report ===
              precision    recall  f1-score   support

           0     0.7291    0.9771    0.8351     34914
           1     0.2346    0.0225    0.0411     10291
           2     0.0569    0.0037    0.0069      1901
           3     0.0392    0.0024    0.0045       846

    accuracy                         0.7165     47952
   macro avg     0.2650    0.2514    0.2219     47952
weighted avg     0.5842    0.7165    0.6172     47952



=== All AUC Scores ===
[0.71692526 0.71646647 0.71659159 0.71446446 0.71429763 0.71367201
 0.71500667 0.71311783 0.71637122 0.71436913]


=== Mean AUC Score ===
Mean AUC Score - Decision Tree:  0.7151282276367436


Random Forest (with unknown)

In [22]:
#Random Forest Classification (Crossing Characterisitics)
# ============= With Unknowns ====================

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# identify attributes desired to study
col_names = ['Total Injured Form 55A','Train Speed','Public','Private',
            'Connected To Signal', 'Not Connected To Signal','Unknown If Connected To Signal',
            'Both Sides','Side of Vehicle Approach','Opposite Side of Vehicle Approach','Unknown Side',
            'Illuminated-Yes','Illuminated-No','Illuminated-Unknown','Permanent Structure','Railroad Equipment',
            'Passing Train','Topography','Vegetation','Highway Vehicles','Other Obstruction','Not Obstructed',
            'Unknown Obstruction','Gates','Cantilever FLS','Standard FLS','Wig wags','Traffic Signals','Audible',
            'Crossbucks','Stop Signs','Watchman','Flagged','Other Warning','No Warning']
# load dataset
data = pd.read_csv('PreparedData.csv',usecols=col_names)
# split dataset between features and target variable
col_names.remove('Total Injured Form 55A')
data = data.dropna(axis=0,subset=col_names)
X = data[col_names] # Features
y = data['Total Injured Form 55A'] # Target Variable

X_train, X_test, y_train, y_test = train_test_split(X,
                                 y, test_size=0.2, 
                                 random_state=1) # 80% training and 20% test

#Modify test and train lists so that all number of injuries 3+ are in the same category (3)
ii = 0
y_test = y_test.tolist()
mod_y_test = y_test
for x in y_test:
    if x > 2:
        mod_y_test[ii] = 3
    ii += 1

ii = 0
y_train = y_train.tolist()
mod_y_train = y_train
for x in y_train:
    if x > 2:
        mod_y_train[ii] = 3
    ii += 1

#Create Random Forest Model
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)

#Prediction
rfc_predict = rfc.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix

rfc_cv_score = cross_val_score(rfc, X, y, cv = 10)

#Print out results
print("=== Confusion Matrix ===")
print(confusion_matrix(mod_y_test,rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(mod_y_test, rfc_predict, digits = 4))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())



=== Confusion Matrix ===
[[33946   828    93    47]
 [ 9995   243    38    15]
 [ 1823    67     8     3]
 [  818    23     2     3]]


=== Classification Report ===
              precision    recall  f1-score   support

           0     0.7287    0.9723    0.8331     34914
           1     0.2093    0.0236    0.0424     10291
           2     0.0567    0.0042    0.0078      1901
           3     0.0441    0.0035    0.0066       846

    accuracy                         0.7132     47952
   macro avg     0.2597    0.2509    0.2225     47952
weighted avg     0.5785    0.7132    0.6161     47952



=== All AUC Scores ===
[0.71650817 0.71559059 0.71550717 0.71442276 0.71133634 0.71250417
 0.71442276 0.71111575 0.71411887 0.71199166]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.7137518235997555


Random Forest (without unknown)

In [23]:
#Random Forest Classification (Crossing Characterisitics)
# ============= Without Unknowns ====================

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# identify attributes desired to study
col_names = ['Total Injured Form 55A','Train Speed','Public','Private',
            'Connected To Signal', 'Not Connected To Signal',
            'Both Sides','Side of Vehicle Approach','Opposite Side of Vehicle Approach',
            'Illuminated-Yes','Illuminated-No','Permanent Structure','Railroad Equipment',
            'Passing Train','Topography','Vegetation','Highway Vehicles','Not Obstructed',
            'Gates','Cantilever FLS','Standard FLS','Wig wags','Traffic Signals','Audible',
            'Crossbucks','Stop Signs','Watchman','Flagged','No Warning']
# load dataset
data = pd.read_csv('PreparedData.csv',usecols=col_names)
# split dataset between features and target variable
col_names.remove('Total Injured Form 55A')
data = data.dropna(axis=0,subset=col_names)
X = data[col_names] # Features
y = data['Total Injured Form 55A'] # Target Variable

X_train, X_test, y_train, y_test = train_test_split(X,
                                 y, test_size=0.2, 
                                 random_state=1) # 80% training and 20% test

#Modify test and train lists so that all number of injuries 3+ are in the same category (3)
ii = 0
y_test = y_test.tolist()
mod_y_test = y_test
for x in y_test:
    if x > 2:
        mod_y_test[ii] = 3
    ii += 1

ii = 0
y_train = y_train.tolist()
mod_y_train = y_train
for x in y_train:
    if x > 2:
        mod_y_train[ii] = 3
    ii += 1

#Create Random Forest Model
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)

#Prediction
rfc_predict = rfc.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix

rfc_cv_score = cross_val_score(rfc, X, y, cv = 10)

#Print out results
print("=== Confusion Matrix ===")
print(confusion_matrix(mod_y_test,rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(mod_y_test, rfc_predict, digits = 4))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())



=== Confusion Matrix ===
[[33984   800    92    38]
 [10006   245    31     9]
 [ 1822    70     8     1]
 [  816    25     3     2]]


=== Classification Report ===
              precision    recall  f1-score   support

           0     0.7288    0.9734    0.8335     34914
           1     0.2149    0.0238    0.0429     10291
           2     0.0597    0.0042    0.0079      1901
           3     0.0400    0.0024    0.0045       846

    accuracy                         0.7140     47952
   macro avg     0.2609    0.2509    0.2222     47952
weighted avg     0.5799    0.7140    0.6165     47952



=== All AUC Scores ===
[0.71700868 0.71650817 0.71534034 0.71404738 0.71204538 0.71400567
 0.7151318  0.71157456 0.71428571 0.71307612]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.7143023812829166
