In [51]:
import pandas as pd
import logging

import numpy as np
import math

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.datasets import make_regression
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing

# from mla.datasets import *
# from mla.metrics.metrics import root_mean_squared_log_error, mean_squared_error
from mla.neuralnet import NeuralNet
from mla.neuralnet.constraints import MaxNorm, UnitNorm
from mla.neuralnet.layers import Activation, Dense, Dropout
from mla.neuralnet.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adam
from mla.neuralnet.parameters import Parameters
from mla.neuralnet.regularizers import *
from mla.utils import one_hot

In [52]:
# Configure the root logger at DEBUG level; the "INFO:root:..." lines in the
# training output below are emitted through this logger.
logging.basicConfig(level=logging.DEBUG)


def classification(X, y):
    '''Train a small dense network on (X, y) and print a held-out score.

    TO-DO: add an argument that is an example array, and make this function
    print out the classification of that example.

    Parameters
    ----------
    X : array-like
        Feature rows; split 85/15 into train/test with a fixed random seed.
    y : array-like
        Class labels in the form accepted by ``mla.utils.one_hot``; they are
        one-hot encoded before the split.

    Returns
    -------
    None
        The training-split shapes and the ROC AUC computed on the first class
        column of the test split are printed.
    '''
    y = one_hot(y)
    # Size the softmax layer from the encoded targets rather than assuming
    # exactly three classes.
    n_classes = y.shape[1]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=1111)

    model = NeuralNet(
        layers=[
            Dense(512, Parameters(init='uniform', regularizers={'W': L2(0.05)})),
            Activation('relu'),
            # NOTE(review): 0.9 is very aggressive if this argument is the
            # drop probability -- confirm against mla's Dropout layer.
            Dropout(0.9),
            Dense(128, Parameters(init='normal', constraints={'W': MaxNorm()})),
            Activation('relu'),
            Dense(n_classes),
            Activation('softmax'),
        ],
        loss='categorical_crossentropy',
        optimizer=Adadelta(),
        metric='accuracy',
        batch_size=256,
        max_epochs=25,
    )

    # Single-argument print() calls behave the same on Python 2 and Python 3.
    print(X_train.shape)
    print(y_train.shape)

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # The reported number is a ROC AUC for class column 0, not an accuracy,
    # so label it as such.
    score = roc_auc_score(y_test[:, 0], predictions[:, 0])
    print('ROC AUC (class 0): %s' % score)



In [53]:
def process_x_y(x_data, y_data, text_columns=(0, 2, 7), encoder=None):
    '''Convert text feature columns and the target into integer codes.

    Parameters
    ----------
    x_data : array-like, 2-D
        Feature matrix. The columns listed in ``text_columns`` are replaced
        by integer codes; missing values anywhere in the matrix become 0.
    y_data : array-like
        Target values; flattened to 1-D and label-encoded.
    text_columns : sequence of int, optional
        Indices of the columns of ``x_data`` to encode. Defaults to the
        columns the function previously hard-coded, ``(0, 2, 7)``.
    encoder : object with ``fit_transform``, optional
        Encoder used for every text column and for the target. When omitted
        a fresh ``preprocessing.LabelEncoder`` is created.

    Returns
    -------
    (x_encoded, y_encoded)
        ``x_encoded`` is an integer array with the shape of ``x_data``;
        ``y_encoded`` is a 1-D array of integer codes. The caller's
        ``x_data`` is left unmodified.
    '''
    if encoder is None:
        # Build the encoder here so the function does not depend on a global
        # `le` that only exists once a later notebook cell has been run.
        encoder = preprocessing.LabelEncoder()

    # Copy into an object array: encoded integers can then sit next to the
    # untouched columns, and the caller's array is not written to.
    x_encoded = np.array(x_data, dtype=object)

    for col in text_columns:
        x_encoded[:, col] = encoder.fit_transform(x_encoded[:, col])

    # Vectorised replacement of missing values (NaN / None) with 0.
    x_encoded[pd.isnull(x_encoded)] = 0
    x_encoded = x_encoded.astype(int)

    # Flatten so an (n, 1) column selected from a DataFrame is encoded as a
    # plain 1-D label vector.
    y_encoded = encoder.fit_transform(np.ravel(y_data))
    return x_encoded, y_encoded

In [54]:
# Load the COMPAS violent-recidivism scores; the path is relative to the
# directory the notebook is run from.
csv_file = 'compas-scores-two-years-violent.csv'
df = pd.read_csv(csv_file)

# Feature columns passed to the model, in this order.
keep = [
 'sex',
 'age',
 'race',
 'juv_fel_count',
 'juv_misd_count',
 'juv_other_count',
 'priors_count',
 'c_charge_degree']

# Column used as the prediction target.
target = ['v_score_text']

# Module-level label encoder, kept available under the name `le`.
le = preprocessing.LabelEncoder()

# `.values` replaces DataFrame.as_matrix(), which was deprecated in
# pandas 0.23 and removed in pandas 1.0; it returns the same ndarray.
y = df[target].values
x = df[keep].values

x_data, y_data = process_x_y(x, y)

# Single-argument print() calls behave the same on Python 2 and Python 3.
print(x_data.shape)
print(y_data.shape)

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.15, random_state=1111)



(4743, 8)
(4743,)


In [55]:
# Train on the encoded COMPAS features/targets; classification() does its own
# train/test split and prints the shapes and the final score.
classification(x_data, y_data)

INFO:root:Total parameters: 70659
Epoch progress:  44%|████▍     | 7/16 [00:00<00:00, 66.18it/s]

(4031, 8)
(4031, 3)


Epoch progress: 100%|██████████| 16/16 [00:00<00:00, 68.60it/s]
INFO:root:Epoch:0, train loss: 3.1602412218, train accuracy: 0.71619945423, elapsed: 0.268970012665 sec.
Epoch progress: 100%|██████████| 16/16 [00:00<00:00, 70.56it/s]
INFO:root:Epoch:1, train loss: 2.04212005224, train accuracy: 0.71743984123, elapsed: 0.261106967926 sec.
Epoch progress: 100%|██████████| 16/16 [00:00<00:00, 70.78it/s]
INFO:root:Epoch:2, train loss: 1.67225495332, train accuracy: 0.71743984123, elapsed: 0.260575056076 sec.
Epoch progress: 100%|██████████| 16/16 [00:00<00:00, 69.53it/s]
INFO:root:Epoch:3, train loss: 1.46102365802, train accuracy: 0.719176383032, elapsed: 0.265741109848 sec.
Epoch progress: 100%|██████████| 16/16 [00:00<00:00, 68.83it/s]
INFO:root:Epoch:4, train loss: 1.30145800432, train accuracy: 0.720168692632, elapsed: 0.267425060272 sec.
Epoch progress: 100%|██████████| 16/16 [00:00<00:00, 66.27it/s]
INFO:root:Epoch:5, train loss: 1.22149079889, train accuracy: 0.720664847432, elapsed

('classification accuracy', 0.83946864797928633)


In [None]:
''' 

Features needed to calculate your risk, in the order the model expects them:

[
 'sex',
 'age',
 'race',
 'juv_fel_count',
 'juv_misd_count',
 'juv_other_count',
 'priors_count',
 'c_charge_degree'
]

'''


''' TO-DO:
        Find out what the valid options for each feature are - get the set of distinct values for each column.
        
        Build in an option for people to find out their own risk score!
'''