<a href="https://colab.research.google.com/github/simonarahi/MachineLearning-TensorFlow/blob/master/latest_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simona Rahi
# Loading Data and Classifying 

In [0]:
import tensorflow as tf
from tensorflow import keras
import functools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

Loading test and train data (each is 50% of data)

In [2]:
data_URL = "https://raw.githubusercontent.com/simonarahi/MachineLearning-TensorFlow/master/nassCDS.csv"

train_data_URL = "https://raw.githubusercontent.com/simonarahi/MachineLearning-TensorFlow/master/train.csv"
test_data_URL = "https://raw.githubusercontent.com/simonarahi/MachineLearning-TensorFlow/master/test.csv"

train_file_path = tf.keras.utils.get_file("train.csv", train_data_URL)
test_file_path = tf.keras.utils.get_file("test.csv", test_data_URL)

Downloading data from https://raw.githubusercontent.com/simonarahi/MachineLearning-TensorFlow/master/train.csv
Downloading data from https://raw.githubusercontent.com/simonarahi/MachineLearning-TensorFlow/master/test.csv


In [0]:
np.set_printoptions(precision=3, suppress=True)

Inspecting the data

In [4]:
df = pd.read_csv(train_file_path)
df.head()
#df['injSeverity'].min()
#df['injSeverity'].max()

Unnamed: 0.1,Unnamed: 0,dvcat,weight,dead,airbag,seatbelt,frontal,sex,ageOFocc,yearacc,yearVeh,abcat,occRole,deploy,injSeverity,caseid
0,1,25-39,25.069,alive,none,belted,1,f,26,1997,1990.0,unavail,driver,0,3.0,2:3:1
1,3,10-24,32.379,alive,none,none,1,f,69,1997,1988.0,unavail,driver,0,4.0,2:5:1
2,6,40-54,25.069,alive,none,belted,1,f,22,1997,1985.0,unavail,driver,0,3.0,2:11:2
3,7,55+,27.078,alive,none,belted,1,m,22,1997,1984.0,unavail,driver,0,3.0,2:13:1
4,9,10-24,812.869,alive,none,belted,0,m,40,1997,1984.0,unavail,driver,0,1.0,2:14:1


Specifying column to be classified

In [0]:
LABEL_COLUMN = 'injSeverity'
LABELS = [0, 1, 2, 3, 4, 5, 6]

Read csv data from file and create dataset

In [0]:
def get_dataset(file_path, **kwargs):
  dataset = tf.data.experimental.make_csv_dataset(
      file_path,
      batch_size=5, # Artificially small to make examples easier to show.
      label_name=LABEL_COLUMN,
      na_value="?",
      select_columns = ['dvcat', 'weight', 'dead', 'airbag', 'seatbelt', 'frontal', 'sex', 'ageOFocc', 'yearacc', 'yearVeh', 'abcat', 'occRole', 'deploy', 'injSeverity'],
      num_epochs=1,
      ignore_errors=True, 
      **kwargs)
  return dataset

raw_train_data = get_dataset(train_file_path)
raw_test_data = get_dataset(test_file_path)

In [8]:
def show_batch(dataset):
  for batch, label in dataset.take(1):
    for key, value in batch.items():
      print("{:20s}: {}".format(key,value.numpy()))

show_batch(raw_train_data)

dvcat               : [b'25-39' b'25-39' b'10-24' b'10-24' b'10-24']
weight              : [ 116.202    3.028   10.31  2089.741   30.29 ]
dead                : [b'dead' b'alive' b'alive' b'alive' b'alive']
airbag              : [b'none' b'none' b'airbag' b'none' b'none']
seatbelt            : [b'none' b'belted' b'belted' b'none' b'none']
frontal             : [1 1 0 0 1]
sex                 : [b'm' b'f' b'f' b'm' b'm']
ageOFocc            : [25 44 54 32 19]
yearacc             : [2000 1997 2000 1998 1997]
yearVeh             : [1991 1980 1998 1985 1975]
abcat               : [b'unavail' b'unavail' b'nodeploy' b'unavail' b'unavail']
occRole             : [b'driver' b'driver' b'driver' b'driver' b'driver']
deploy              : [0 0 0 0 0]


# **Data Preprocessing**


Since we have mixed data types, we will start by seperating the numeric features and pack them into one column.

In [0]:
class PackNumericFeatures(object):
  def __init__(self, names):
    self.names = names

  def __call__(self, features, labels):
    numeric_features = [features.pop(name) for name in self.names]
    numeric_features = [tf.cast(feat, tf.float32) for feat in numeric_features]
    numeric_features = tf.stack(numeric_features, axis=-1)
    features['numeric'] = numeric_features

    return features, labels

In [0]:
NUMERIC_FEATURES = ['weight','frontal','ageOFocc', 'yearacc', 'yearVeh', 'deploy']

packed_train_data = raw_train_data.map(
    PackNumericFeatures(NUMERIC_FEATURES))

packed_test_data = raw_test_data.map(
    PackNumericFeatures(NUMERIC_FEATURES))

In [11]:
show_batch(packed_train_data)

dvcat               : [b'25-39' b'25-39' b'25-39' b'10-24' b'25-39']
dead                : [b'alive' b'alive' b'alive' b'alive' b'alive']
airbag              : [b'airbag' b'airbag' b'airbag' b'none' b'none']
seatbelt            : [b'belted' b'belted' b'none' b'belted' b'belted']
sex                 : [b'f' b'f' b'f' b'm' b'f']
abcat               : [b'deploy' b'deploy' b'nodeploy' b'unavail' b'unavail']
occRole             : [b'driver' b'pass' b'pass' b'driver' b'pass']
numeric             : [[   2.787    0.      63.    1998.    1995.       1.   ]
 [  66.618    1.      39.    1997.    1996.       1.   ]
 [ 253.492    0.      48.    2001.    1991.       0.   ]
 [  21.428    1.      33.    1999.    1986.       0.   ]
 [  70.308    1.      80.    1998.    1990.       0.   ]]


In [0]:
example_batch, labels_batch = next(iter(packed_train_data))

Normalizing our continuous data

In [13]:
import pandas as pd
desc = pd.read_csv(train_file_path)[NUMERIC_FEATURES].describe()
desc

Unnamed: 0,weight,frontal,ageOFocc,yearacc,yearVeh,deploy
count,13109.0,13109.0,13109.0,13109.0,13108.0,13109.0
mean,469.76129,0.641315,37.173697,1999.555496,1992.813168,0.340835
std,1609.146959,0.479633,17.879764,1.702573,5.628077,0.474008
min,0.0,0.0,16.0,1997.0,1953.0,0.0
25%,33.185,0.0,22.0,1998.0,1989.0,0.0
50%,86.986,1.0,33.0,2000.0,1994.0,0.0
75%,362.13,1.0,48.0,2001.0,1997.0,1.0
max,57871.595,1.0,97.0,2002.0,2003.0,1.0


In [0]:
MEAN = np.array(desc.T['mean'])
STD = np.array(desc.T['std'])

def normalize_numeric_data(data, mean, std):
  # Center the data
  return (data-mean)/std

Creating a numeric column with the normalized data

In [15]:
normalizer = functools.partial(normalize_numeric_data, mean=MEAN, std=STD)

numeric_column = tf.feature_column.numeric_column('numeric', normalizer_fn=normalizer, shape=[len(NUMERIC_FEATURES)])
numeric_columns = [numeric_column]
numeric_column

NumericColumn(key='numeric', shape=(6,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function normalize_numeric_data at 0x7fdca0137730>, mean=array([ 469.761,    0.641,   37.174, 1999.555, 1992.813,    0.341]), std=array([1609.147,    0.48 ,   17.88 ,    1.703,    5.628,    0.474])))

We will include this feature column 'numeric' in our training process 

In [16]:
example_batch['numeric']

<tf.Tensor: shape=(5, 6), dtype=float32, numpy=
array([[1437.958,    0.   ,   21.   , 1998.   , 1985.   ,    0.   ],
       [  94.226,    0.   ,   80.   , 1997.   , 1990.   ,    0.   ],
       [  18.319,    1.   ,   43.   , 1999.   , 1988.   ,    0.   ],
       [  29.757,    1.   ,   48.   , 2000.   , 1994.   ,    0.   ],
       [  10.917,    0.   ,   48.   , 1999.   , 1998.   ,    0.   ]],
      dtype=float32)>

In [17]:
numeric_layer = tf.keras.layers.DenseFeatures(numeric_columns)
numeric_layer(example_batch).numpy()

array([[ 0.602, -1.337, -0.905, -0.914, -1.388, -0.719],
       [-0.233, -1.337,  2.395, -1.501, -0.5  , -0.719],
       [-0.281,  0.748,  0.326, -0.326, -0.855, -0.719],
       [-0.273,  0.748,  0.606,  0.261,  0.211, -0.719],
       [-0.285, -1.337,  0.606, -0.326,  0.922, -0.719]], dtype=float32)

Now dealing with our categorical variables

In [0]:
CATEGORIES = {
    'dvcat': ['1-9km/h', '10-24', '25-39', '40-54', '55+'],
    'dead' : ['alive', 'dead'],
    'airbag' : ['airbag', 'none'],
    'seatbelt' : ['belted', 'none'],
    'sex' : ['f', 'm'],
    'abcat' : ['deploy', 'nodeploy', 'unavail'],
    'occRole' : ['driver', 'pass']
}

In [19]:
categorical_columns = []
for feature, vocab in CATEGORIES.items():
  cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key=feature, vocabulary_list=vocab)
  categorical_columns.append(tf.feature_column.indicator_column(cat_col))

categorical_columns

[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='dvcat', vocabulary_list=('1-9km/h', '10-24', '25-39', '40-54', '55+'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='dead', vocabulary_list=('alive', 'dead'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='airbag', vocabulary_list=('airbag', 'none'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='seatbelt', vocabulary_list=('belted', 'none'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('f', 'm'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='abcat', vocabulary_list=('deploy', 'nodeplo

This layer below will be part of the data processing input layer in our model

In [20]:
categorical_layer = tf.keras.layers.DenseFeatures(categorical_columns)
print(categorical_layer(example_batch).numpy()[0])

[0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0.]


**Combined Preprocessing Layer:**
Adding the two feature column we created (continuous and categorical) to create an input layer that will extract and preprocess both input types

In [21]:
preprocessing_layer = tf.keras.layers.DenseFeatures(categorical_columns+numeric_columns)
print(preprocessing_layer(example_batch).numpy()[0])

[ 0.     0.     1.     0.     1.     1.     0.     0.     0.     1.
  0.     0.     0.602 -1.337 -0.905 -0.914 -1.388 -0.719  1.     0.
  0.     1.     1.     0.   ]


# **Build the Model**

In [0]:
model = tf.keras.Sequential([
  preprocessing_layer,
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(1),
])

model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])

now we can start training

In [0]:
train_data = packed_train_data.shuffle(500)
test_data = packed_test_data

In [24]:
model.fit(train_data, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fdca004be10>

Now that we trained our model, we can test the accuracy on the test set

In [0]:
test_loss, test_accuracy = model.evaluate(test_data)

print('\n\nTest Loss {}, Test Accuracy {}'.format(test_loss, test_accuracy))



Test Loss 0.0, Test Accuracy 0.21833525598049164


Test accuracy is almost 21.8% which is very low

In [31]:
predictions = model.predict(test_data)
print(predictions)

[[5.386e+09]
 [5.305e+09]
 [5.702e+09]
 ...
 [4.960e+09]
 [5.851e+09]
 [5.582e+09]]


Confusion Matrix

In [36]:
df1 = pd.read_csv(test_file_path)
cm = metrics.confusion_matrix(df1['injSeverity'], predictions)
print(cm)

ValueError: ignored