### Incorrect Pipeline #1
##### Pipeline splits raw data into train and test at the end of data preparation
##### Pipeline uses an extract of the COMPAS dataset at `datasets\compas-scores-two-years.csv`

In [25]:
# All imports

import os
from pathlib import Path
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, label_binarize
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [26]:
# Resolve the project root: two directory levels above the current working directory.
working_dir = Path.cwd()
project_root = working_dir.parent.parent
print(project_root)

c:\Users\Shreya\OneDrive\Desktop\data_prep_issues


In [27]:
# Load the raw COMPAS extract from <project_root>/datasets.
raw_data_file = os.path.join(
    project_root,
    "datasets",
    "compas-scores-two-years.csv",
)
raw_data = pd.read_csv(raw_data_file)

# Preview the first rows as plain text, then report the frame's dimensions.
print(raw_data.head().to_string())
print(f"Shape: {raw_data.shape}")

   id                name   first         last compas_screening_date   sex         dob  age          age_cat              race  juv_fel_count  decile_score  juv_misd_count  juv_other_count  priors_count  days_b_screening_arrest            c_jail_in           c_jail_out  c_case_number c_offense_date c_arrest_date  c_days_from_compas c_charge_degree                   c_charge_desc  is_recid  r_case_number r_charge_degree  r_days_from_arrest r_offense_date                r_charge_desc   r_jail_in  r_jail_out  violent_recid  is_violent_recid vr_case_number vr_charge_degree vr_offense_date               vr_charge_desc  type_of_assessment  decile_score.1 score_text screening_date v_type_of_assessment  v_decile_score v_score_text v_screening_date  in_custody out_custody  priors_count.1  start   end  event  two_year_recid
0   1    miguel hernandez  miguel    hernandez            2013-08-14  Male  1947-04-18   69  Greater than 45             Other              0             1               0   

##### Data Preparation

In [28]:
# Data preparation steps. Note: the data has not been split into train/test yet.

# Data Extraction: narrow the frame to the columns kept for data preparation.
selected_columns = [
    'sex', 'dob', 'age', 'c_charge_degree', 'race', 'score_text',
    'priors_count', 'days_b_screening_arrest', 'decile_score',
    'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out',
]
raw_data = raw_data[selected_columns]

print(raw_data.head().to_string())
print(f"Shape: {raw_data.shape}")

    sex         dob  age c_charge_degree              race score_text  priors_count  days_b_screening_arrest  decile_score  is_recid  two_year_recid            c_jail_in           c_jail_out
0  Male  1947-04-18   69               F             Other        Low             0                     -1.0             1         0               0  2013-08-13 06:03:42  2013-08-14 05:41:20
1  Male  1982-01-22   34               F  African-American        Low             0                     -1.0             3         1               1  2013-01-26 03:45:27  2013-02-05 05:36:53
2  Male  1991-05-14   24               F  African-American        Low             4                     -1.0             4         1               1  2013-04-13 04:58:34  2013-04-14 07:02:04
3  Male  1993-01-21   23               F  African-American       High             1                      NaN             8         0               0                  NaN                  NaN
4  Male  1973-01-22   43               F     

In [29]:
# Data Filtering
# Keep a row only if it passes every check below:
#   * days_b_screening_arrest lies within [-30, 30]; rows where it is missing
#     fail both comparisons and are dropped
#   * is_recid is not -1
#   * c_charge_degree is not "O"
#   * score_text is present
keep_rows = (
    (raw_data['days_b_screening_arrest'] <= 30)
    & (raw_data['days_b_screening_arrest'] >= -30)
    & (raw_data['is_recid'] != -1)
    & (raw_data['c_charge_degree'] != "O")
    # pandas.read_csv parses the literal 'N/A' as a missing value by default, and
    # NaN != 'N/A' evaluates to True, so the string comparison alone would let
    # such rows through. Test for missing values explicitly as well.
    & raw_data['score_text'].notna()
    & (raw_data['score_text'] != 'N/A')
)
raw_data = raw_data[keep_rows]

# Fold the 'Medium' score into 'Low', leaving 'High' and 'Low' as the values that are
# binarized into labels later. The replacement is restricted to score_text so that a
# matching value in any other column can never be rewritten by accident.
raw_data = raw_data.replace({'score_text': {'Medium': "Low"}})

print(raw_data.head().to_string())
print("Shape:", raw_data.shape)

    sex         dob  age c_charge_degree              race score_text  priors_count  days_b_screening_arrest  decile_score  is_recid  two_year_recid            c_jail_in           c_jail_out
0  Male  1947-04-18   69               F             Other        Low             0                     -1.0             1         0               0  2013-08-13 06:03:42  2013-08-14 05:41:20
1  Male  1982-01-22   34               F  African-American        Low             0                     -1.0             3         1               1  2013-01-26 03:45:27  2013-02-05 05:36:53
2  Male  1991-05-14   24               F  African-American        Low             4                     -1.0             4         1               1  2013-04-13 04:58:34  2013-04-14 07:02:04
5  Male  1971-08-22   44               M             Other        Low             0                      0.0             1         0               0  2013-11-30 04:50:18  2013-12-01 12:28:56
6  Male  1974-07-23   41               F     

In [30]:
# Data Splitting: hold out 20% of the prepared rows for testing, with a fixed seed
# so the partition is repeatable.
train_data, test_data = train_test_split(
    raw_data,
    test_size=0.2,
    random_state=42,
)
print(f"Shape of training data: {train_data.shape}")
print(f"Shape of testing data: {test_data.shape}")

Shape of training data: (4937, 13)
Shape of testing data: (1235, 13)


In [31]:
# Binarize the score_text column of each split into the target labels.
# Both splits share one class list so they are encoded identically.
score_classes = ['High', 'Low']
train_labels = label_binarize(train_data['score_text'], classes=score_classes)
test_labels = label_binarize(test_data['score_text'], classes=score_classes)

In [32]:
# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# Categories unseen during fit are ignored at transform time instead of raising.
impute1_and_onehot = Pipeline(steps=[
    ('imputer1', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: fill gaps with the column mean, then cut into 4 ordinal bins
# using the 'uniform' binning strategy.
impute2_and_bin = Pipeline(steps=[
    ('imputer2', SimpleImputer(strategy='mean')),
    ('discretizer', KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform')),
])

# Route each input column to its branch: is_recid is one-hot encoded, age is binned.
featurizer = ColumnTransformer(
    transformers=[
        ('impute1_and_onehot', impute1_and_onehot, ['is_recid']),
        ('impute2_and_bin', impute2_and_bin, ['age']),
    ]
)

# Full model: featurize, then fit a logistic regression with default settings.
pipeline = Pipeline(steps=[
    ('features', featurizer),
    ('classifier', LogisticRegression()),
])

In [33]:
# Fit on the training split; labels are flattened to 1-D before being passed in.
pipeline.fit(train_data, train_labels.ravel())

# Report the pipeline's score on the held-out split.
test_score = pipeline.score(test_data, test_labels.ravel())
print(test_score)

0.8186234817813766


In [35]:
# Per-class precision / recall / F1 on the held-out split.
# Labels were produced by label_binarize(classes=['High', 'Low']), so 0 = 'High', 1 = 'Low'.
# zero_division=0 reports 0 for a metric whose denominator is zero (e.g. the precision of
# a class the model never predicts) instead of emitting an undefined-metric warning.
test_predictions = pipeline.predict(test_data)
print(classification_report(test_labels.ravel(), test_predictions, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       224
           1       0.82      1.00      0.90      1011

    accuracy                           0.82      1235
   macro avg       0.41      0.50      0.45      1235
weighted avg       0.67      0.82      0.74      1235



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
