In [None]:
# Compass using tensorflow

In [1]:
# loading libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# For preprocessing the data
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
# Standardizing
from sklearn.preprocessing import StandardScaler
# To split the dataset into train and test datasets
from sklearn.model_selection import train_test_split
# To calculate the accuracy score of the model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
#
from datetime import datetime
from datetime import date
#
import collections
#
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [11]:
# load dataset

path = 'C:\\Users\\rivas\\OneDrive\\Documents\\JMR\\Education\\Springboard\\Projects\\Capstone2\\'
int_fnm = path + 'data\\compass\\compas-scores-raw.csv'
df = pd.read_csv(int_fnm)

In [12]:
# Review data and Manipulate Data

# update 'Ethnic_Code_Text' to have conistent values for African Americans
df.loc[df['Ethnic_Code_Text'] == 'African-Am', 'Ethnic_Code_Text'] = 'African-American'
print(pd.value_counts(df['Ethnic_Code_Text']))

African-American    27069
Caucasian           21783
Hispanic             8742
Other                2592
Asian                 324
Native American       219
Arabic                 75
Oriental               39
Name: Ethnic_Code_Text, dtype: int64


In [13]:
# DecileScore should be between 1 & 10, delete otherwise
df.DecileScore.unique()
print((df['DecileScore'] < 1).sum())
# remove DecileScore < 1
df = df[df.DecileScore >= 1]
(df['DecileScore'] < 1).sum()
print(pd.value_counts(df['DecileScore']))

45
1     18465
2      9192
3      8492
4      5338
5      4831
6      4319
7      3338
8      2799
9      2386
10     1638
Name: DecileScore, dtype: int64


In [14]:
# Add column 'Age' from DateofBirth
agelist = []
currdate = date.today()
for dte in df['DateOfBirth']:
    brthdte = datetime.strptime(dte, '%m/%d/%y')
    mnthday = (currdate.month, currdate.day) < (brthdte.month, brthdte.day)
    if currdate.year > brthdte.year:
        agelist.append(currdate.year - brthdte.year - (mnthday))
    else:
        agelist.append(-1)
    

print(len(agelist), len(df))
df['Age'] = agelist
print(df.columns)

60798 60798
Index(['Person_ID', 'AssessmentID', 'Case_ID', 'Agency_Text', 'LastName',
       'FirstName', 'MiddleName', 'Sex_Code_Text', 'Ethnic_Code_Text',
       'DateOfBirth', 'ScaleSet_ID', 'ScaleSet', 'AssessmentReason',
       'Language', 'LegalStatus', 'CustodyStatus', 'MaritalStatus',
       'Screening_Date', 'RecSupervisionLevel', 'RecSupervisionLevelText',
       'Scale_ID', 'DisplayText', 'RawScore', 'DecileScore', 'ScoreText',
       'AssessmentType', 'IsCompleted', 'IsDeleted', 'Age'],
      dtype='object')


In [15]:
# cleanup bad Ages
# remove Ages < 1
(df['Age'] < 1).sum()

df = df[df.Age >= 1]
(df['Age'] < 1).sum()

0

In [16]:
# Slice by 'DisplayText' for Risk
RiskAppear = df.loc[df['DisplayText'] == 'Risk of Failure to Appear']
RiskViolence = df.loc[df['DisplayText'] == 'Risk of Violence']
RiskRecidivism = df.loc[df['DisplayText'] == 'Risk of Recidivism']
print('Appear:', RiskAppear.shape, ' Violence: ', RiskViolence.shape,  ' Recidivism:',RiskRecidivism.shape)

Appear: (16016, 29)  Violence:  (16010, 29)  Recidivism: (15990, 29)


In [17]:
# Define prepare_data_for_ml_model_1:
def prepare_data_for_ml_model_1(dfx, target_loc):
    # Create new Dataset of selected columns to get prepare TEST and Training data for  ML model 
     
    """
    Columns
    0 - 4  : 'Person_ID','AssessmentID','Case_ID','Agency_Text', 'LastName',
    5 - 9  : 'FirstName', 'MiddleName', 'Sex_Code_Text', 'Ethnic_Code_Text','DateOfBirth',
    10 - 14: 'ScaleSet_ID', 'ScaleSet', 'AssessmentReason','Language', 'LegalStatus',
    15 - 19: 'CustodyStatus', 'MaritalStatus','Screening_Date', 'RecSupervisionLevel', 'RecSupervisionLevelText',
    20 - 24: 'Scale_ID', 'DisplayText', 'RawScore', 'DecileScore', 'ScoreText',
    25 - 28: 'AssessmentType', 'IsCompleted', 'IsDeleted','Age'
    """

    x_df = dfx.iloc[:, [7,8,14,15,16,19]] #features
    tmp_age = dfx.iloc[:,28].as_matrix() #age feature, convert numpy array
    x_age = tmp_age.reshape(tmp_age.size,1)
    

    y = dfx.iloc[:,target_loc].as_matrix() #target convert numpy array


    #  lable encoder. It encodes the data into integers
    le = LabelEncoder()

    Sex_Code_Text_cat = le.fit_transform(x_df.Sex_Code_Text)
    Ethnic_Code_Text_cat = le.fit_transform(x_df.Ethnic_Code_Text)
    LegalStatus_cat = le.fit_transform(x_df.LegalStatus)
    CustodyStatus_cat = le.fit_transform(x_df.CustodyStatus)
    MaritalStatus_cat = le.fit_transform(x_df.MaritalStatus)
    RecSupervisionLevelText_cat = le.fit_transform(x_df.RecSupervisionLevelText)

    Sex_Code_Text_cat = Sex_Code_Text_cat.reshape(len(Sex_Code_Text_cat),1)
    Ethnic_Code_Text_cat = Ethnic_Code_Text_cat.reshape(len(Ethnic_Code_Text_cat),1)
    LegalStatus_cat = LegalStatus_cat.reshape(len(LegalStatus_cat),1)
    CustodyStatus_cat = CustodyStatus_cat.reshape(len(CustodyStatus_cat),1)
    MaritalStatus_cat = MaritalStatus_cat.reshape(len(MaritalStatus_cat),1)
    RecSupervisionLevelText_cat = RecSupervisionLevelText_cat.reshape(len(RecSupervisionLevelText_cat),1)

#  One-Hot encoder. It encodes the data into binary format
    onehote = OneHotEncoder(sparse=False)
    
    Sex_Code_Text_oh = onehote.fit_transform(Sex_Code_Text_cat)
    Ethnic_Code_Text_oh = onehote.fit_transform(Ethnic_Code_Text_cat)
    LegalStatus_oh = onehote.fit_transform(LegalStatus_cat)
    CustodyStatus_oh = onehote.fit_transform(CustodyStatus_cat)
    MaritalStatus_oh = onehote.fit_transform(MaritalStatus_cat)
    RecSupervisionLevelText_oh = onehote.fit_transform(RecSupervisionLevelText_cat)

# Build out feature dataset as numpy array, since One-Hot encoder creates numpy array
    X_feature =  Sex_Code_Text_oh
    X_feature = np.concatenate((X_feature,Ethnic_Code_Text_oh), axis=1)
    X_feature = np.concatenate((X_feature,LegalStatus_oh), axis=1)
    X_feature = np.concatenate((X_feature,CustodyStatus_oh), axis=1)
    X_feature = np.concatenate((X_feature,MaritalStatus_oh), axis=1)
    X_feature = np.concatenate((X_feature,RecSupervisionLevelText_oh), axis=1)
    X_feature = np.concatenate((X_feature,x_age), axis=1)

# Split data train and test
    X_train, X_test, y_train, y_test = train_test_split(X_feature, y, test_size=0.2)
    print('Length for X_train:', len(X_train), ' X_test:',len(X_test), ' y_train:',len(y_train) ,' y_test:',len(y_test))

    return X_train, X_test, y_train, y_test
    

In [18]:
# Define ml_model_1:
# takes in model Instantiate model (model)
# fits, predicts, and evaluates (prints results)
def ml_model_1(model, modelnm, dfnm, X_train, X_test, y_train, y_test, target):
    print('Running ', modelnm, ' model for :', dfnm, ' using target: ', target)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    #Evaluate
    print(modelnm,' score: ',model.score(X_test, y_test))
    print(' ')
    print('first 10 predicted values: ',y_pred[0:10])
    print('first 10 values of target: ')
    print(y_test[0:10])
    print(' ')

    print('mean of predicted values: ',np.mean(y_pred), ' STD of predicted values : ', np.std(y_pred) )
    print('mean of Target  values: ',np.mean(y_pred), ' STD of predicted  values : ', np.std(y_pred) )
    return y_pred

In [19]:
class dataReader(object):

    def __init__(self,*arrays,batch_size=1):
        self.arrays = arrays
        self.__check_equal_shape()
        self.num_examples = self.arrays[0].shape[0]
        self.batch_number = 0
        self.batch_size = batch_size
        self.num_batches = int(np.ceil(self.num_examples / batch_size))

    def __check_equal_shape(self):
        if any(self.arrays[0].shape[0] != arr.shape[0] for arr in self.arrays[1:]):
            raise ValueError("all arrays must be equal along first dimension")

    def next_batch(self):
        low_ix = self.batch_number*self.batch_size
        up_ix = (self.batch_number + 1)*self.batch_size
        if up_ix >= self.num_examples:
            up_ix = self.num_examples
            self.batch_number = 0 # reset batch_number to zero
        else:
            self.batch_number = self.batch_number + 1

        return [arr[low_ix:up_ix,:] for arr in self.arrays]

In [20]:
#  Tensorflow  Implementation #page 422

import tensorflow as tf
import os
import sys
from functools import partial
from sklearn.preprocessing import StandardScaler

In [21]:
# to make this notebook's output stable across runs
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

In [22]:
# RiskRecidivism dataset target RawScore (22)
# X_train.shape (12792, 35)
X_train, X_test, y_train, y_test = prepare_data_for_ml_model_1(RiskRecidivism,22)

Length for X_train: 12792  X_test: 3198  y_train: 12792  y_test: 3198


In [23]:
print(X_train.shape)

(12792, 35)


In [24]:
scaler = StandardScaler()
X_scaler = scaler.fit_transform(X_train)
X = tf.constant(X_scaler, dtype=tf.float32, name="X")

n_inputs = 35 # 28 * 28
n_hidden1 = 300
n_hidden2 = 150  
n_hidden3 = n_hidden1
n_outputs = n_inputs

learning_rate = 0.01
l2_reg = 0.0001

he_init = tf.contrib.layers.variance_scaling_initializer() # He initialization
#Equivalent to:
#he_init = lambda shape, dtype=tf.float32: tf.truncated_normal(shape, 0., stddev=np.sqrt(2/shape[0]))
l2_regularizer = tf.contrib.layers.l2_regularizer(l2_reg)
my_dense_layer = partial(tf.layers.dense,
                         activation=tf.nn.elu,
                         kernel_initializer=he_init,
                         kernel_regularizer=l2_regularizer)

hidden1 = my_dense_layer(X, n_hidden1)
hidden2 = my_dense_layer(hidden1, n_hidden2)
hidden3 = my_dense_layer(hidden2, n_hidden3)
outputs = my_dense_layer(hidden3, n_outputs, activation=None)

reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))

reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
loss = tf.add_n([reconstruction_loss] + reg_losses)

optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

  from ._conv import register_converters as _register_converters


In [25]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(12792, 35) (12792,) (3198, 35) (3198,)


In [26]:
# instatianate
data_reader = dataReader(X_train,y_train, batch_size=150)

In [27]:
print(data_reader.num_batches)


86


In [28]:
n_epochs = 5
batch_size = 150

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        #n_batches = mnist.train.num_examples // batch_size
        n_batches = data_reader.num_batches
        for iteration in range(n_batches):
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()                                          
            X_batch, y_batch = data_reader.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch})
        loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})   
        print("\r{}".format(epoch), "Train MSE:", loss_train)           
        saver.save(sess, "./my_model_all_layers.ckpt") 

0%

TypeError: next_batch() takes 1 positional argument but 2 were given