# Data Science Capstone Week 1

In [139]:
!conda install -c conda-forge python-graphviz -y

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.8.3
  latest version: 4.8.4

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - python-graphviz


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1g             |       h516909a_1         2.1 MB  conda-forge
    python-graphviz-0.14.1     |     pyh9f0ad1d_0          19 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.1 MB

The following NEW packages will be INSTALLED:

  python-graphviz    conda-forge/noarch::python-graphviz-0.14.1-pyh9f0ad1d_0

The following packages will be UPDATED:

  openssl                                 1.1.1g-h516909a_

In [140]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, metrics
import matplotlib.pyplot as plt
import graphviz
import pydotplus
%matplotlib inline

In [110]:
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


# Capstone Week 2

In [111]:
df = pd.read_csv('Data-Collisions.csv')

## Drop features that have no impact on outcome, such as arbitrary report numbers, etc.

In [112]:
df.drop(['OBJECTID', 'INCKEY', 'INTKEY', 'COLDETKEY', 'REPORTNO', 'EXCEPTRSNCODE', 'EXCEPTRSNDESC', 'INCDATE', 'SEVERITYCODE.1', 'SEVERITYDESC', 'X', 'Y'], 1, inplace = True)

In [113]:
df.shape

(194673, 26)

## Check datatypes of imported data

In [114]:
df.dtypes

SEVERITYCODE        int64
STATUS             object
ADDRTYPE           object
LOCATION           object
COLLISIONTYPE      object
PERSONCOUNT         int64
PEDCOUNT            int64
PEDCYLCOUNT         int64
VEHCOUNT            int64
INCDTTM            object
JUNCTIONTYPE       object
SDOT_COLCODE        int64
SDOT_COLDESC       object
INATTENTIONIND     object
UNDERINFL          object
WEATHER            object
ROADCOND           object
LIGHTCOND          object
PEDROWNOTGRNT      object
SDOTCOLNUM        float64
SPEEDING           object
ST_COLCODE         object
ST_COLDESC         object
SEGLANEKEY          int64
CROSSWALKKEY        int64
HITPARKEDCAR       object
dtype: object

## Preprocessing - Review each feature and fill in the NA's.

In [115]:
df['SPEEDING'].fillna('N', inplace =True)
df['INATTENTIONIND'].fillna('N', inplace =True)
df['ROADCOND'].fillna('Unknown', inplace = True)
df['LIGHTCOND'].fillna('Unknown', inplace = True)
df['PEDROWNOTGRNT'].fillna('N', inplace = True)
df['SDOTCOLNUM'].fillna(0, inplace = True)
df['ST_COLCODE'].fillna(31, inplace = True)
df['JUNCTIONTYPE'].fillna('Unknown', inplace = True)
df['ADDRTYPE'].fillna('Block', inplace = True)
df['WEATHER'].fillna('Other', inplace = True)
df['UNDERINFL'].replace({'N':'0', 'Y':'1'}, inplace = True)
df['COLLISIONTYPE'].fillna('Other', inplace = True)

## Filling in the remaining NA's with '0'

In [116]:
df.fillna(-99999, inplace = True)

## Checking how many more features have NA's

In [117]:
df.isnull().sum(axis = 0)

SEVERITYCODE      0
STATUS            0
ADDRTYPE          0
LOCATION          0
COLLISIONTYPE     0
PERSONCOUNT       0
PEDCOUNT          0
PEDCYLCOUNT       0
VEHCOUNT          0
INCDTTM           0
JUNCTIONTYPE      0
SDOT_COLCODE      0
SDOT_COLDESC      0
INATTENTIONIND    0
UNDERINFL         0
WEATHER           0
ROADCOND          0
LIGHTCOND         0
PEDROWNOTGRNT     0
SDOTCOLNUM        0
SPEEDING          0
ST_COLCODE        0
ST_COLDESC        0
SEGLANEKEY        0
CROSSWALKKEY      0
HITPARKEDCAR      0
dtype: int64

## Handling the non-numerical data

In [118]:
def handle_non_numerical_data(df):
    columns = df.columns.values
    
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]
        
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1
            df[column] = list(map(convert_to_int, df[column]))
    return df
df = handle_non_numerical_data(df)
df.head

<bound method NDFrame.head of         SEVERITYCODE  STATUS  ADDRTYPE  LOCATION  COLLISIONTYPE  PERSONCOUNT  \
0                  2       0         2     20407              8            2   
1                  1       0         1      4802              1            2   
2                  1       0         1      2921              5            4   
3                  1       0         1      7967              0            3   
4                  2       0         2      5565              8            2   
...              ...     ...       ...       ...            ...          ...   
194668             2       0         1     10705              9            3   
194669             1       0         1     17135              7            2   
194670             2       0         2     19032              2            3   
194671             2       0         2     20600              4            2   
194672             1       0         1       873              7            2   

        P

In [119]:
df['SEVERITYCODE'].value_counts() #Even though the metadata description shows 5 codes to classify severity, how many are actually present in the data?

1    136485
2     58188
Name: SEVERITYCODE, dtype: int64

In [120]:
df.columns

Index(['SEVERITYCODE', 'STATUS', 'ADDRTYPE', 'LOCATION', 'COLLISIONTYPE',
       'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'INCDTTM',
       'JUNCTIONTYPE', 'SDOT_COLCODE', 'SDOT_COLDESC', 'INATTENTIONIND',
       'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND', 'PEDROWNOTGRNT',
       'SDOTCOLNUM', 'SPEEDING', 'ST_COLCODE', 'ST_COLDESC', 'SEGLANEKEY',
       'CROSSWALKKEY', 'HITPARKEDCAR'],
      dtype='object')

## Setting up the Decision Tree Classifier

In [131]:
X = df.drop(['SEVERITYCODE'], 1).values
'''X = df[['ADDRTYPE', 'STATUS', 'COLLISIONTYPE', 'PERSONCOUNT', 'PEDCOUNT',
       'PEDCYLCOUNT', 'VEHCOUNT', 'INCDTTM', 'JUNCTIONTYPE', 'SDOT_COLCODE',
       'SDOT_COLDESC', 'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND',
       'LIGHTCOND', 'PEDROWNOTGRNT', 'SDOTCOLNUM', 'SPEEDING', 'ST_COLCODE',
       'ST_COLDESC', 'SEGLANEKEY', 'CROSSWALKKEY', 'HITPARKEDCAR']].values'''
y = df['SEVERITYCODE'].values

In [132]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 4)
X = preprocessing.StandardScaler().fit(X).transform

In [133]:
clf = DecisionTreeClassifier(criterion = "entropy")

In [135]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [136]:
prediction = clf.predict(X_test)

In [137]:
print (prediction [0:50])
print (y_test [0:50])

[2 2 1 1 1 1 2 2 1 1 1 2 1 1 2 2 2 2 1 1 1 1 2 1 2 1 2 1 1 1 2 2 1 1 2 2 2
 1 1 1 2 1 1 1 1 1 1 2 2 1]
[2 1 1 2 2 1 2 2 2 1 1 2 1 1 1 1 1 1 2 1 2 1 1 1 2 1 1 1 1 1 2 1 1 2 2 2 1
 1 1 1 1 2 1 1 1 1 2 1 1 1]


In [138]:
accuracy = clf.score(X_test, y_test) 
accuracy

0.6888633950892092

In [None]:
from IPython.display import display
display(graphviz.Source(export_graphviz(clf)))