In [1]:
# loading libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# load dataset
path = 'C:\\Users\\rivas\\OneDrive\\Documents\\JMR\\Education\\Springboard\\Projects\\Capstone2\\'
int_fnm = path + 'data\\compass\\compas-scores-raw.csv'
df = pd.read_csv(int_fnm)

# initial EDA
df.shape

(60843, 28)

In [2]:
# Data Cleanup

#  update 'Ethnic_Code_Text' to have conistent values for African Americans
print(df.Ethnic_Code_Text.unique())
df.loc[df['Ethnic_Code_Text'] == 'African-Am', 'Ethnic_Code_Text'] = 'African-American'
print(pd.value_counts(df['Ethnic_Code_Text']))

['Caucasian' 'African-American' 'Hispanic' 'Other' 'Asian' 'African-Am'
 'Native American' 'Oriental' 'Arabic']
African-American    27069
Caucasian           21783
Hispanic             8742
Other                2592
Asian                 324
Native American       219
Arabic                 75
Oriental               39
Name: Ethnic_Code_Text, dtype: int64


In [3]:
# DecileScore should be between 1 & 10, delete otherwise
df.DecileScore.unique()
print((df['DecileScore'] < 1).sum())
# remove DecileScore < 1
df = df[df.DecileScore >= 1]
(df['DecileScore'] < 1).sum()
print(pd.value_counts(df['DecileScore']))

45
1     18465
2      9192
3      8492
4      5338
5      4831
6      4319
7      3338
8      2799
9      2386
10     1638
Name: DecileScore, dtype: int64


In [4]:
# Slice by 'DisplayText' for Risk
RiskAppear = df.loc[df['DisplayText'] == 'Risk of Failure to Appear']
RiskViolence = df.loc[df['DisplayText'] == 'Risk of Violence']
RiskRecidivism = df.loc[df['DisplayText'] == 'Risk of Recidivism']
print('Appear:', RiskAppear.shape, ' Violence: ', RiskViolence.shape,  ' Recidivism:',RiskRecidivism.shape)

Appear: (20281, 28)  Violence:  (20272, 28)  Recidivism: (20245, 28)


In [5]:
####################################################################################
# Define prepare_data_for_regression_model:
def prepare_data_for_regression_model(dfx, target_loc):

    # Create new DataFrame of selected columns
    """
    Columns
    0 - 4  : 'Person_ID','AssessmentID','Case_ID','Agency_Text', 'LastName',
    5 - 9  : 'FirstName', 'MiddleName', 'Sex_Code_Text', 'Ethnic_Code_Text','DateOfBirth',
    10 - 14: 'ScaleSet_ID', 'ScaleSet', 'AssessmentReason','Language', 'LegalStatus',
    15 - 19: 'CustodyStatus', 'MaritalStatus','Screening_Date', 'RecSupervisionLevel', 'RecSupervisionLevelText',
    20 - 24: 'Scale_ID', 'DisplayText', 'RawScore', 'DecileScore', 'ScoreText',
    25 - 27: 'AssessmentType', 'IsCompleted', 'IsDeleted'
    """

    # slice for subset data features and targets
    df_new = dfx.iloc[:, [0,1,2,3,4,5,7,8,9,11,12,13,14,15,16,17,19]] #features
    y = dfx.iloc[:,target_loc] #target

    #  One-Hot encoder. It encodes the data into binary format
    le = preprocessing.LabelEncoder()
    Agency_Text_cat = le.fit_transform(df_new.Agency_Text)
    LastName_cat = le.fit_transform(df_new.LastName)
    FirstName_cat = le.fit_transform(df_new.FirstName)
    Sex_Code_Text_cat = le.fit_transform(df_new.Sex_Code_Text)
    Ethnic_Code_Text_cat = le.fit_transform(df_new.Ethnic_Code_Text)
    DateOfBirth_cat = le.fit_transform(df_new.DateOfBirth)
    ScaleSet_cat = le.fit_transform(df_new.ScaleSet)
    AssessmentReason_cat = le.fit_transform(df_new.AssessmentReason)
    Language_cat = le.fit_transform(df_new.Language)
    LegalStatus_cat = le.fit_transform(df_new.LegalStatus)
    CustodyStatus_cat = le.fit_transform(df_new.CustodyStatus)
    MaritalStatus_cat = le.fit_transform(df_new.MaritalStatus)
    Screening_Date_cat = le.fit_transform(df_new.Screening_Date)
    RecSupervisionLevelText_cat = le.fit_transform(df_new.RecSupervisionLevelText) 

    X_rev = df_new
    X_rev['Agency_Text_cat'] = Agency_Text_cat
    X_rev['LastName_cat'] = LastName_cat
    X_rev['FirstName_cat'] = FirstName_cat
    X_rev['Sex_Code_Text_cat'] = Sex_Code_Text_cat
    X_rev['Ethnic_Code_Text_cat'] = Ethnic_Code_Text_cat
    X_rev['DateOfBirth_cat'] = DateOfBirth_cat
    X_rev['ScaleSet_cat'] = ScaleSet_cat
    X_rev['AssessmentReason_cat'] = AssessmentReason_cat
    X_rev['Language_cat'] = Language_cat
    X_rev['LegalStatus_cat'] = LegalStatus_cat
    X_rev['CustodyStatus_cat'] = CustodyStatus_cat
    X_rev['MaritalStatus_cat'] = MaritalStatus_cat
    X_rev['Screening_Date_cat'] = Screening_Date_cat
    X_rev['RecSupervisionLevelText_cat'] = RecSupervisionLevelText_cat


    #drop the old categorical columns from dataframe
    dummy_fields = ['Agency_Text', 'LastName', 'FirstName', 'Sex_Code_Text',
                  'Ethnic_Code_Text', 'DateOfBirth', 'ScaleSet',
                  'AssessmentReason', 'Language', 'LegalStatus', 'CustodyStatus',
                  'MaritalStatus', 'Screening_Date', 'RecSupervisionLevelText' ]
    X_rev = X_rev.drop(dummy_fields, axis = 1)
    X_rev.columns

    # Standardizing
    X_std = StandardScaler().fit_transform(X_rev)


    # Split data train and test, first import library
    X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.2)
    print('Length for X_train:', len(X_train), ' X_test:',len(X_test), ' y_train:',len(y_train) ,' y_test:',len(y_test))

    return X_train, X_test, y_train, y_test



In [6]:
######
# Define linear_regression_model:
def linear_regression_model(dfx,dfnm, X_train, X_test, y_train, y_test, target):
    print('Running linear regression model for :', dfnm, ' using target: ', target)

    lm = LinearRegression()
    lm.fit(X_train, y_train)
    y_lm_pred = lm.predict(X_test)

    #Evaluate
    print('lm.score: ',lm.score(X_test, y_test))
    print(' ')
    print('first 10 predicted values: ',y_lm_pred[0:10])
    print('first 10 values of target: ')
    print(y_train.head(10))
    print(' ')

    print('mean of predicted of values: ',np.mean(y_lm_pred), ' STD of predicted of values : ', np.std(y_lm_pred) )
    print('describe of target')
    print(y_train.describe())

In [7]:
# LinearRegression

# Import libraries

# For preprocessing the data
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
# Standardizing
from sklearn.preprocessing import StandardScaler
# To split the dataset into train and test datasets
from sklearn.model_selection import train_test_split
# To calculate the accuracy score of the model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Linear Regression
from sklearn.linear_model import LinearRegression

"""
Columns
0 - 4  : 'Person_ID','AssessmentID','Case_ID','Agency_Text', 'LastName',
5 - 9  : 'FirstName', 'MiddleName', 'Sex_Code_Text', 'Ethnic_Code_Text','DateOfBirth',
10 - 14: 'ScaleSet_ID', 'ScaleSet', 'AssessmentReason','Language', 'LegalStatus',
15 - 19: 'CustodyStatus', 'MaritalStatus','Screening_Date', 'RecSupervisionLevel', 'RecSupervisionLevelText',
20 - 24: 'Scale_ID', 'DisplayText', 'RawScore', 'DecileScore', 'ScoreText',
25 - 27: 'AssessmentType', 'IsCompleted', 'IsDeleted'
RiskAppear = df.loc[df['DisplayText'] == 'Risk of Failure to Appear']
RiskViolence = df.loc[df['DisplayText'] == 'Risk of Violence']
RiskRecidivism = df.loc[df['DisplayText'] == 'Risk of Recidivism']
"""

In [9]:
# RiskAppear
X_train, X_test, y_train, y_test = prepare_data_for_regression_model(RiskAppear, 22)
linear_regression_model(RiskAppear,'RiskAppear', X_train, X_test, y_train, y_test, 'RawScore')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Length for X_train: 16224  X_test: 4057  y_train: 16224  y_test: 4057
Running linear regression model for : RiskAppear  using target:  RawScore
lm.score:  0.12033069087084336
 
first 10 predicted values:  [18.30440886 18.25648485 18.27028714 18.19986398 16.93111757 20.28808422
 17.49305886 17.13248453 15.85898209 16.67252579]
first 10 values of target: 
2495     26.0
31166    15.0
7559     19.0
34085    16.0
20258    21.0
8954     20.0
5492     16.0
16505    11.0
53897    22.0
17075    24.0
Name: RawScore, dtype: float64
 
mean of predicted of values:  18.516545005653402  STD of predicted of values :  1.8552913898739534
describe of target
count    16224.000000
mean        18.528353
std          5.487013
min         11.000000
25%         14.000000
50%         17.000000
75%         22.000000
max         51.000000
Name: RawScore, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [10]:
# RiskViolence
X_train, X_test, y_train, y_test = prepare_data_for_regression_model(RiskViolence, 22)
linear_regression_model(RiskViolence,'RiskViolence', X_train, X_test, y_train, y_test, 'RawScore')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Length for X_train: 16217  X_test: 4055  y_train: 16217  y_test: 4055
Running linear regression model for : RiskViolence  using target:  RawScore
lm.score:  0.3035364737217241
 
first 10 predicted values:  [-3.46316964 -1.78287782 -3.22033745 -3.41497973 -2.09105337 -2.0784668
 -2.20659107 -2.24494573 -1.26379528 -2.2948914 ]
first 10 values of target: 
52908   -2.57
38394   -2.56
39333   -3.00
57720   -1.92
51081   -3.60
34728   -1.60
28578   -2.37
37473   -1.98
15987   -3.26
56688   -4.16
Name: RawScore, dtype: float64
 
mean of predicted of values:  -2.529075696862802  STD of predicted of values :  0.49774427274333577
describe of target
count    16217.000000
mean        -2.521001
std          0.890216
min         -4.790000
25%         -3.180000
50%         -2.530000
75%         -1.890000
max          1.520000
Name: RawScore, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [11]:
# RiskRecidivism
X_train, X_test, y_train, y_test = prepare_data_for_regression_model(RiskRecidivism,22)
linear_regression_model(RiskRecidivism,'RiskRecidivism', X_train, X_test, y_train, y_test, 'RawScore')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Length for X_train: 16196  X_test: 4049  y_train: 16196  y_test: 4049
Running linear regression model for : RiskRecidivism  using target:  RawScore
lm.score:  0.3583396459232592
 
first 10 predicted values:  [-1.06350478 -0.26338379 -1.37580727 -0.95914639 -0.87504597 -0.67935778
 -0.59492577 -0.81226281 -1.43489892 -1.01544237]
first 10 values of target: 
6988    -1.03
11707    0.13
54424   -1.93
30472   -0.05
45100    0.44
757     -0.69
52690   -0.73
7204    -0.78
59716   -0.24
49918   -0.49
Name: RawScore, dtype: float64
 
mean of predicted of values:  -0.7557637665563441  STD of predicted of values :  0.5031698458879453
describe of target
count    16196.000000
mean        -0.768313
std          0.851797
min         -3.050000
25%         -1.390000
50%         -0.760000
75%         -0.140000
max          2.360000
Name: RawScore, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
