#### Correct Pipeline #1
##### Pipeline splits raw data into train and test at the beginning of data preparation
##### Pipeline uses an extract of the COMPAS dataset at datasets\compas-scores-two-years.csv

In [1]:
# All imports
import os
from pathlib import Path
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, label_binarize
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Getting the project root
project_root = Path.cwd().parent.parent
print(project_root)

c:\Users\Shreya\OneDrive\Desktop\data_prep_issues


#### Importing Data Files

In [3]:
# Getting the raw data file
raw_data_file = os.path.join(project_root, "datasets", "compas-scores-two-years.csv")
raw_data = pd.read_csv(raw_data_file)
print(raw_data.head().to_string())
print("Shape:", raw_data.shape)

   id                name   first         last compas_screening_date   sex         dob  age          age_cat              race  juv_fel_count  decile_score  juv_misd_count  juv_other_count  priors_count  days_b_screening_arrest            c_jail_in           c_jail_out  c_case_number c_offense_date c_arrest_date  c_days_from_compas c_charge_degree                   c_charge_desc  is_recid  r_case_number r_charge_degree  r_days_from_arrest r_offense_date                r_charge_desc   r_jail_in  r_jail_out  violent_recid  is_violent_recid vr_case_number vr_charge_degree vr_offense_date               vr_charge_desc  type_of_assessment  decile_score.1 score_text screening_date v_type_of_assessment  v_decile_score v_score_text v_screening_date  in_custody out_custody  priors_count.1  start   end  event  two_year_recid
0   1    miguel hernandez  miguel    hernandez            2013-08-14  Male  1947-04-18   69  Greater than 45             Other              0             1               0   

#### Data Splitting

In [4]:
# Data Splitting before preparation

train_data, test_data = train_test_split(raw_data, test_size=0.2, random_state=42)
print("Shape of training data:", train_data.shape)
print("Shape of testing data:", test_data.shape)

Shape of training data: (5771, 53)
Shape of testing data: (1443, 53)


#### Data Preparation

In [5]:
# Data Extraction
train_data = train_data[
    ['sex', 'dob', 'age', 'c_charge_degree', 'race', 'score_text', 'priors_count', 'days_b_screening_arrest',
     'decile_score', 'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out']]

test_data = test_data[
    ['sex', 'dob', 'age', 'c_charge_degree', 'race', 'score_text', 'priors_count', 'days_b_screening_arrest',
     'decile_score', 'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out']]

print("Training data:")
print(train_data.head().to_string())
print("Shape:", train_data.shape)

print("Testing data:")
print(test_data.head().to_string())
print("Shape:", test_data.shape)

Training data:
         sex         dob  age c_charge_degree              race score_text  priors_count  days_b_screening_arrest  decile_score  is_recid  two_year_recid            c_jail_in           c_jail_out
3307    Male  1987-04-29   28               F         Caucasian        Low             2                     -1.0             2         1               1  2013-10-03 03:57:15  2013-10-13 05:03:29
911     Male  1969-06-14   46               F  African-American        Low             2                      0.0             2         1               1  2013-01-22 02:31:25  2013-01-31 09:56:34
6532  Female  1993-09-04   22               M  African-American     Medium             0                     -1.0             6         0               0  2013-10-07 03:03:48  2013-10-08 08:59:33
6233    Male  1989-06-08   26               F  African-American        Low             1                     -1.0             4         0               0  2013-10-11 09:46:40  2013-10-12 07:56:28
3355 

In [6]:
# Data Filtering
train_data = train_data[(train_data['days_b_screening_arrest'] <= 30) & (train_data['days_b_screening_arrest'] >= -30)]
train_data = train_data[train_data['is_recid'] != -1]
train_data = train_data[train_data['c_charge_degree'] != "O"]
train_data = train_data[train_data['score_text'] != 'N/A']


train_data = train_data.replace('Medium', "Low")
test_data = test_data.replace('Medium', "Low")

print("Training data:")
print(train_data.head().to_string())
print("Shape:", train_data.shape)

print("Testing data:")
print(test_data.head().to_string())
print("Shape:", test_data.shape)

Training data:
         sex         dob  age c_charge_degree              race score_text  priors_count  days_b_screening_arrest  decile_score  is_recid  two_year_recid            c_jail_in           c_jail_out
3307    Male  1987-04-29   28               F         Caucasian        Low             2                     -1.0             2         1               1  2013-10-03 03:57:15  2013-10-13 05:03:29
911     Male  1969-06-14   46               F  African-American        Low             2                      0.0             2         1               1  2013-01-22 02:31:25  2013-01-31 09:56:34
6532  Female  1993-09-04   22               M  African-American        Low             0                     -1.0             6         0               0  2013-10-07 03:03:48  2013-10-08 08:59:33
6233    Male  1989-06-08   26               F  African-American        Low             1                     -1.0             4         0               0  2013-10-11 09:46:40  2013-10-12 07:56:28
3355 

In [7]:
# Binarizing labels
train_labels = label_binarize(train_data['score_text'], classes=['High', 'Low'])
test_labels = label_binarize(test_data['score_text'], classes=['High', 'Low'])

In [8]:
# Data Preparation Pipeline (Imputation, Encoding, Discretization)
impute1_and_onehot = Pipeline([('imputer1', SimpleImputer(strategy='most_frequent')),
                               ('onehot', OneHotEncoder(handle_unknown='ignore'))])
impute2_and_bin = Pipeline([('imputer2', SimpleImputer(strategy='mean')),
                            ('discretizer', KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform'))])

featurizer = ColumnTransformer(transformers=[
    ('impute1_and_onehot', impute1_and_onehot, ['is_recid']),
    ('impute2_and_bin', impute2_and_bin, ['age'])
])

pipeline = Pipeline([
    ('features', featurizer),
    ('classifier', LogisticRegression())
])

#### Model Training and Evaluation

In [9]:
# Model Evaluation
pipeline.fit(train_data, train_labels.ravel())
print(pipeline.score(test_data, test_labels.ravel()))

0.796950796950797


In [10]:
# Classification Report
print(classification_report(test_labels, pipeline.predict(test_data)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       293
           1       0.80      1.00      0.89      1150

    accuracy                           0.80      1443
   macro avg       0.40      0.50      0.44      1443
weighted avg       0.64      0.80      0.71      1443



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
