Pre-Process Data (using full dataset)

In [None]:
#Using New Dataset (Pre-Processing)
import pandas as pd
import numpy as np
# Load the full crossing dataset; the path is relative to the working directory.
df = pd.read_csv('FullCrossingData.csv')

import sklearn
from sklearn.preprocessing import OneHotEncoder
# Create the single one-hot encoder instance shared by OneHotFix below.
# OneHotFix calls fit_transform on every use, so the encoder is refit per column.
encoder = OneHotEncoder(handle_unknown='ignore')
def OneHotFix(attribute, labels, frame=None):
    """One-hot encode one column and return a new frame with named indicator columns.

    The module-level ``encoder`` is refit on the column, the original column is
    removed, and one indicator column per category is appended after the
    remaining columns.

    Parameters
    ----------
    attribute : str
        Name of the column to encode.
    labels : sequence of str
        Names for the indicator columns. They are assigned by position, so
        they must be listed in the same order as the encoder's categories.
    frame : pandas.DataFrame, optional
        Frame to encode. Defaults to the module-level ``df``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of encoded categories.
    """
    source = df if frame is None else frame
    labels = list(labels)

    # Build the indicator frame on the source's own index so rows stay aligned
    # even when the index is not the default 0..n-1 range.
    encoded_df = pd.DataFrame(
        encoder.fit_transform(source[[attribute]]).toarray(),
        index=source.index,
    )
    if encoded_df.shape[1] != len(labels):
        raise ValueError(
            "OneHotFix(%r): got %d labels for %d encoded categories"
            % (attribute, len(labels), encoded_df.shape[1])
        )
    encoded_df.columns = labels

    # Remaining original columns first, then the new indicators.
    return pd.concat([source.drop(columns=attribute), encoded_df], axis=1)

# ---- Recode missing/unknown values, then one-hot encode each categorical column ----
# Every OneHotFix call returns a fresh frame that replaces `df`, so the steps below
# must run in this order (each one consumes the previous step's output).
# NOTE(review): OneHotFix names the indicator columns purely by position, so each
# label list has to match the encoder's category order for that column -- confirm
# against the actual values (e.g. where 'Unknown' falls in 'Crossing Illuminated').

# View obstruction: code 9 is turned into NaN before encoding.
obstruction_labels = [
    'Permanent Structure', 'Railroad Equipment', 'Passing Train', 'Topography',
    'Vegetation', 'Highway Vehicles', 'Other Obstruction', 'Not Obstructed',
    'Unknown Obstruction',
]
df['View Obstruction Code'] = df['View Obstruction Code'].replace(9, np.nan)
df = OneHotFix('View Obstruction Code', obstruction_labels)

# Illumination: missing values become the string 'Unknown'.
illumination_labels = ['Illuminated-No', 'Illuminated-Yes', 'Illuminated-Unknown']
df['Crossing Illuminated'] = df['Crossing Illuminated'].replace(np.nan, 'Unknown')
df = OneHotFix('Crossing Illuminated', illumination_labels)

# Warning location: NaN, 0 and 'N' are all folded into code 4, then cast to int.
location_labels = [
    'Both Sides', 'Side of Vehicle Approach',
    'Opposite Side of Vehicle Approach', 'Unknown Side',
]
df['Crossing Warning Location Code'] = (
    df['Crossing Warning Location Code']
    .replace([np.nan, 0, 'N'], 4.0)
    .astype(int)
)
df = OneHotFix('Crossing Warning Location Code', location_labels)

# Signal connection: missing values become the string 'Unknown'.
signal_labels = [
    'Connected To Signal', 'Not Connected To Signal',
    'Unknown If Connected To Signal',
]
df['Warning Connected To Signal'] = df['Warning Connected To Signal'].replace(np.nan, 'Unknown')
df = OneHotFix('Warning Connected To Signal', signal_labels)

# Public/private: encoded as-is, no recoding step.
df = OneHotFix('Public/Private Code', ['Private', 'Public'])

# Warning device: missing values become code 11.
warning_labels = [
    'Gates', 'Cantilever FLS', 'Standard FLS', 'Wig wags', 'Traffic Signals',
    'Audible', 'Crossbucks', 'Stop Signs', 'Watchman', 'Flagged',
    'Other Warning', 'No Warning',
]
df['Crossing Warning Expanded Code 12'] = df['Crossing Warning Expanded Code 12'].replace(np.nan, 11)
df = OneHotFix('Crossing Warning Expanded Code 12', warning_labels)

# Persist the encoded frame; the modelling cells read this file back.
df.to_csv('RevisedData.csv')

Decision Tree (with Unknowns)

In [None]:
#Decision Tree Classification (Crossing Characteristics)
# ============= With Unknowns ====================
# Every import this cell needs is listed here so it runs on a fresh kernel.
from io import StringIO  # stdlib buffer; replaces the third-party six.StringIO

import pandas as pd
import pydotplus
from IPython.display import Image, display
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# identify attributes desired to study: one target column plus the feature columns
target_col = 'Total Injured Form 55A'
col_names = ['Train Speed','Public','Private',
            'Connected To Signal', 'Not Connected To Signal','Unknown If Connected To Signal',
            'Both Sides','Side of Vehicle Approach','Opposite Side of Vehicle Approach','Unknown Side',
            'Illuminated-Yes','Illuminated-No','Illuminated-Unknown','Permanent Structure','Railroad Equipment',
            'Passing Train','Topography','Vegetation','Highway Vehicles','Other Obstruction','Not Obstructed',
            'Unknown Obstruction','Gates','Cantilever FLS','Standard FLS','Wig wags','Traffic Signals','Audible',
            'Crossbucks','Stop Signs','Watchman','Flagged','Other Warning','No Warning']
# load dataset (only the columns studied here)
data = pd.read_csv('RevisedData.csv', usecols=[target_col] + col_names)
# drop rows with a missing feature value, then split features from target
data = data.dropna(axis=0, subset=col_names)
sortedData = data[col_names] # Features
targetVariables = data[target_col] # Target Variable

X_train, X_test, y_train, y_test = train_test_split(sortedData,
                                 targetVariables, test_size=0.2,
                                 random_state=1) # 80% training and 20% test

# Collapse every injury count above 2 into a single class labelled 3.
# The capped labels are built as new objects and used explicitly for both
# fitting and evaluation (previously this happened only through list aliasing).
mod_y_train = y_train.mask(y_train > 2, 3)
mod_y_test = y_test.mask(y_test > 2, 3)

# Create Decision Tree classifier object (seeded so reruns give the same tree)
clf = DecisionTreeClassifier(random_state=1)
# Train Decision Tree Classifier
clf = clf.fit(X_train, mod_y_train)
# Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(mod_y_test, y_pred))

# Generate image of decision tree; max_depth here only limits the drawing
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True,max_depth = 7,
                feature_names = col_names)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# Written relative to the working directory, like the CSV files above,
# instead of a machine-specific absolute path.
graph.write_png('Train_DecisionTree.png')
# display() is required: a bare Image(...) only renders as a cell's last expression.
display(Image(graph.create_png()))

# cross_val_score uses the estimator's default scorer, which is accuracy for classifiers.
# NOTE(review): this still scores the raw (uncapped) injury counts, unlike the
# hold-out model above -- confirm which target definition is intended.
clf_cv_score = cross_val_score(clf, sortedData, targetVariables, cv = 10)

#Print out results
print("\n\n=== Results for Decision Tree === \n")
print("=== Confusion Matrix ===")
print(confusion_matrix(mod_y_test,y_pred))
print('\n')
print("=== Classification Report ===")
print(classification_report(mod_y_test, y_pred))
print('\n')
print("=== Cross-Validation Accuracy Scores ===")
print(clf_cv_score)
print('\n')
print("=== Mean Cross-Validation Accuracy ===")
print("Mean CV Accuracy - Decision Tree: ", clf_cv_score.mean())

Decision Tree (without unknowns)

In [None]:
#Decision Tree Classification (Crossing Characteristics)
# ============= Without the Unknown/Other Indicator Columns ====================
# NOTE(review): this cell leaves out the 'Unknown ...' / 'Other ...' indicator
# columns but keeps the rows they flagged; dropna below only removes rows with a
# missing value in the listed features. Confirm whether those rows should also
# be filtered out.
# Every import this cell needs is listed here so it runs on a fresh kernel.
from io import StringIO  # stdlib buffer; replaces the third-party six.StringIO

import pandas as pd
import pydotplus
from IPython.display import Image, display
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# identify attributes desired to study: one target column plus the feature columns
target_col = 'Total Injured Form 55A'
col_names = ['Train Speed','Public', 'Connected To Signal',
            'Both Sides','Side of Vehicle Approach','Opposite Side of Vehicle Approach',
            'Illuminated-Yes','Illuminated-No','Permanent Structure','Railroad Equipment',
            'Passing Train','Topography','Vegetation','Highway Vehicles','Not Obstructed',
            'Gates','Cantilever FLS','Standard FLS','Wig wags','Traffic Signals','Audible',
            'Crossbucks','Stop Signs','Watchman','Flagged','No Warning']
# load dataset (only the columns studied here)
data = pd.read_csv('RevisedData.csv', usecols=[target_col] + col_names)
# drop rows with a missing feature value, then split features from target
data = data.dropna(axis=0, subset=col_names)
sortedData = data[col_names] # Features
targetVariables = data[target_col] # Target Variable

X_train, X_test, y_train, y_test = train_test_split(sortedData,
                                 targetVariables, test_size=0.2,
                                 random_state=1) # 80% training and 20% test

# Collapse every injury count above 2 into a single class labelled 3.
# The capped labels are built as new objects and used explicitly for both
# fitting and evaluation (previously this happened only through list aliasing).
mod_y_train = y_train.mask(y_train > 2, 3)
mod_y_test = y_test.mask(y_test > 2, 3)

# Create Decision Tree classifier object (seeded so reruns give the same tree)
clf = DecisionTreeClassifier(random_state=1)
# Train Decision Tree Classifier
clf = clf.fit(X_train, mod_y_train)
# Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(mod_y_test, y_pred))

# Generate image of decision tree; max_depth here only limits the drawing
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True,max_depth = 7,
                feature_names = col_names)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# Written relative to the working directory, like the CSV files above,
# instead of a machine-specific absolute path.
graph.write_png('Train_DecisionTree_NoUnknowns.png')
# display() is required: a bare Image(...) only renders as a cell's last expression.
display(Image(graph.create_png()))

# cross_val_score uses the estimator's default scorer, which is accuracy for classifiers.
# NOTE(review): this still scores the raw (uncapped) injury counts, unlike the
# hold-out model above -- confirm which target definition is intended.
clf_cv_score = cross_val_score(clf, sortedData, targetVariables, cv = 10)

#Print out results
print("\n\n=== Results for Decision Tree === \n")
print("=== Confusion Matrix ===")
print(confusion_matrix(mod_y_test,y_pred))
print('\n')
print("=== Classification Report ===")
print(classification_report(mod_y_test, y_pred))
print('\n')
print("=== Cross-Validation Accuracy Scores ===")
print(clf_cv_score)
print('\n')
print("=== Mean Cross-Validation Accuracy ===")
print("Mean CV Accuracy - Decision Tree: ", clf_cv_score.mean())

Random Forest (with unknowns)

In [None]:
#Random Forest Classification (Crossing Characteristics)
# ============= With Unknowns ====================
# Every import this cell needs is listed here so it runs on a fresh kernel.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

# identify attributes desired to study: one target column plus the feature columns
target_col = 'Total Injured Form 55A'
col_names = ['Train Speed','Public','Private',
            'Connected To Signal', 'Not Connected To Signal','Unknown If Connected To Signal',
            'Both Sides','Side of Vehicle Approach','Opposite Side of Vehicle Approach','Unknown Side',
            'Illuminated-Yes','Illuminated-No','Illuminated-Unknown','Permanent Structure','Railroad Equipment',
            'Passing Train','Topography','Vegetation','Highway Vehicles','Other Obstruction','Not Obstructed',
            'Unknown Obstruction','Gates','Cantilever FLS','Standard FLS','Wig wags','Traffic Signals','Audible',
            'Crossbucks','Stop Signs','Watchman','Flagged','Other Warning','No Warning']
# load dataset (only the columns studied here)
data = pd.read_csv('RevisedData.csv', usecols=[target_col] + col_names)
# drop rows with a missing feature value, then split features from target
data = data.dropna(axis=0, subset=col_names)
X = data[col_names] # Features
y = data[target_col] # Target Variable

X_train, X_test, y_train, y_test = train_test_split(X,
                                 y, test_size=0.2,
                                 random_state=1) # 80% training and 20% test

# Create Random Forest Model (seeded so reruns give the same forest);
# it is trained on the raw injury counts.
rfc = RandomForestClassifier(random_state=1)
rfc.fit(X_train,y_train)

# Prediction
rfc_predict = rfc.predict(X_test)
# Sum of the raw predictions -- presumably a quick sanity check.
print(rfc_predict.sum())

# For reporting, collapse every count above 2 into a single class labelled 3,
# in both the predictions and the true labels. Copies are used so the raw
# predictions and y_test are left untouched.
mod_rfc_predict = rfc_predict.copy()
mod_rfc_predict[mod_rfc_predict > 2] = 3
mod_y_test = y_test.mask(y_test > 2, 3)

# cross_val_score uses the estimator's default scorer, which is accuracy for classifiers.
rfc_cv_score = cross_val_score(rfc, X, y, cv = 10)

#Print out results
print("\n\n=== Results for Random Forest === \n")
print("=== Confusion Matrix ===")
print(confusion_matrix(mod_y_test,mod_rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(mod_y_test, mod_rfc_predict))
print('\n')
print("=== Cross-Validation Accuracy Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean Cross-Validation Accuracy ===")
print("Mean CV Accuracy - Random Forest: ", rfc_cv_score.mean())

Random Forest (without unknowns)

In [None]:
#Random Forest Classification (Crossing Characteristics)
# ============= Without the Unknown/Other Indicator Columns ====================
# NOTE(review): this cell leaves out the 'Unknown ...' / 'Other ...' indicator
# columns but keeps the rows they flagged; dropna below only removes rows with a
# missing value in the listed features. Confirm whether those rows should also
# be filtered out.
# NOTE(review): unlike the decision-tree cell without unknowns, this feature
# list keeps 'Private' and 'Not Connected To Signal' -- confirm that is intended.
# Every import this cell needs is listed here so it runs on a fresh kernel.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

# identify attributes desired to study: one target column plus the feature columns
target_col = 'Total Injured Form 55A'
col_names = ['Train Speed','Public','Private',
            'Connected To Signal', 'Not Connected To Signal',
            'Both Sides','Side of Vehicle Approach','Opposite Side of Vehicle Approach',
            'Illuminated-Yes','Illuminated-No','Permanent Structure','Railroad Equipment',
            'Passing Train','Topography','Vegetation','Highway Vehicles','Not Obstructed',
            'Gates','Cantilever FLS','Standard FLS','Wig wags','Traffic Signals','Audible',
            'Crossbucks','Stop Signs','Watchman','Flagged','No Warning']
# load dataset (only the columns studied here)
data = pd.read_csv('RevisedData.csv', usecols=[target_col] + col_names)
# drop rows with a missing feature value, then split features from target
data = data.dropna(axis=0, subset=col_names)
X = data[col_names] # Features
y = data[target_col] # Target Variable

X_train, X_test, y_train, y_test = train_test_split(X,
                                 y, test_size=0.2,
                                 random_state=1) # 80% training and 20% test

# Create Random Forest Model (seeded so reruns give the same forest);
# it is trained on the raw injury counts.
rfc = RandomForestClassifier(random_state=1)
rfc.fit(X_train,y_train)

# Prediction
rfc_predict = rfc.predict(X_test)
# Sum of the raw predictions -- presumably a quick sanity check.
print(rfc_predict.sum())

# For reporting, collapse every count above 2 into a single class labelled 3,
# in both the predictions and the true labels. Copies are used so the raw
# predictions and y_test are left untouched.
mod_rfc_predict = rfc_predict.copy()
mod_rfc_predict[mod_rfc_predict > 2] = 3
mod_y_test = y_test.mask(y_test > 2, 3)

# cross_val_score uses the estimator's default scorer, which is accuracy for classifiers.
rfc_cv_score = cross_val_score(rfc, X, y, cv = 10)

#Print out results
print("\n\n=== Results for Random Forest === \n")
print("=== Confusion Matrix ===")
print(confusion_matrix(mod_y_test,mod_rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(mod_y_test, mod_rfc_predict))
print('\n')
print("=== Cross-Validation Accuracy Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean Cross-Validation Accuracy ===")
print("Mean CV Accuracy - Random Forest: ", rfc_cv_score.mean())