In [115]:
from sklearn.preprocessing import OneHotEncoder

In [116]:
import pandas as pd

# Read the three-year recidivism dataset; the relative path is resolved against the working directory.
df = pd.read_csv(filepath_or_buffer='3_year_recidivism_elaborated_2.csv')
df  # trailing bare expression so the notebook renders a preview of the frame

Unnamed: 0,Fiscal Year Released,Recidivism Reporting Year,Race - Ethnicity,Age At Release,Convicting Offense Classification,Convicting Offense Type,Main Supervising District,Release type: Paroled to Detainer united,Part of Target Population,Recidivism - Return to Prison numeric
0,2010,2013,White - Non-Hispanic,Under 25,D Felony,Violent,4JD,Parole,Yes,1
1,2010,2013,White - Non-Hispanic,55 and Older,D Felony,Public Order,7JD,Parole,Yes,1
2,2010,2013,White - Non-Hispanic,25-34,D Felony,Property,5JD,Parole,Yes,1
3,2010,2013,White - Non-Hispanic,55 and Older,C Felony,Drug,8JD,Parole,Yes,1
4,2010,2013,Black - Non-Hispanic,25-34,D Felony,Drug,3JD,Parole,Yes,1
...,...,...,...,...,...,...,...,...,...,...
26015,2015,2018,White - Hispanic,25-34,C Felony,Violent,Not Mentioned,Discharged End of Sentence,Yes,0
26016,2015,2018,White - Non-Hispanic,25-34,D Felony,Property,5JD,Paroled to Detainer,No,0
26017,2015,2018,Black - Non-Hispanic,Under 25,Aggravated Misdemeanor,Violent,1JD,Discharged End of Sentence,Yes,0
26018,2015,2018,White - Non-Hispanic,Under 25,D Felony,Drug,5JD,Parole,No,0


In [117]:
# Summarise each column's dtype and non-null count to spot missing values and non-numeric features.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26020 entries, 0 to 26019
Data columns (total 10 columns):
 #   Column                                    Non-Null Count  Dtype 
---  ------                                    --------------  ----- 
 0   Fiscal Year Released                      26020 non-null  int64 
 1   Recidivism Reporting Year                 26020 non-null  int64 
 2   Race - Ethnicity                          26020 non-null  object
 3   Age At Release                            26020 non-null  object
 4   Convicting Offense Classification         26020 non-null  object
 5   Convicting Offense Type                   26020 non-null  object
 6   Main Supervising District                 26020 non-null  object
 7   Release type: Paroled to Detainer united  26020 non-null  object
 8   Part of Target Population                 26020 non-null  object
 9   Recidivism - Return to Prison numeric     26020 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 2.0+ MB


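Here, I can see that most of the variables are string-typed (categorical), and that every column has the same number of non-null values as there are rows, so fortunately there is no missing data to handle. Because the models below need numeric input, I will need to do some simple feature engineering: one-hot encoding the categorical columns.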

In [129]:
# Features are every column except the last; the last column
# ('Recidivism - Return to Prison numeric') is the binary target.
x = df.iloc[:, :-1].values  # features
y = df.iloc[:, -1].values   # target

# Every feature column, including the two integer year columns, is treated as
# categorical and expanded into indicator columns.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the current name first and fall back for older installs.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)
one_hot_x = enc.fit_transform(x)


In [130]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing. A fixed random_state makes the split
# (and every score computed from it) repeatable across kernel restarts.
train_x, test_x, train_y, test_y = train_test_split(
    one_hot_x, y, test_size=0.4, random_state=42
)
test_x

array([[0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 0., 1.]])

In [131]:
train_x.shape
# Shape is (rows, columns): we want the number of rows (samples) to be much larger
# than the number of columns (one-hot features).

(15612, 68)

In [132]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression with its iteration cap set to 1000,
# fitted on the training split.
clf = LogisticRegression(max_iter=1000)
clf.fit(X=train_x, y=train_y)

LogisticRegression(max_iter=1000)

In [133]:
# Predicted labels from the baseline model for the held-out test rows.
hyp = clf.predict(X=test_x)
hyp

array([0, 0, 0, ..., 0, 0, 1])

In [134]:
from sklearn.metrics import accuracy_score

# Score the stored baseline predictions (`hyp`) rather than calling predict() a
# second time; this also keeps the number tied to the baseline model even after
# `clf` is rebound to the grid search further down.
accuracy_score(test_y, hyp)

0.6834166026133743

In [135]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels and columns the predicted labels, both in sorted
# label order (0 first, then 1).
confusion_matrix(y_true=test_y, y_pred=hyp)

array([[6521,  485],
       [2810,  592]])

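Treating a return to prison (label 1) as the positive class: when the model predicts that someone will not recidivate, it is right 6521 times (true negatives) and wrong 2810 times (false negatives). It correctly identifies only 592 people who did return to prison (true positives), and it wrongly flags 485 people who did not (false positives). There are a lot of false negatives: the model misses most of the people who actually recidivated.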

In [136]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the penalised logistic regression.
parameters = {
    'penalty': ['l1', 'l2'],  # l1 corresponds to lasso, l2 corresponds to ridge
    # C is the inverse of the regularisation strength: smaller values penalise harder.
    'C': [1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100, 1000, 10000],
}

# liblinear supports both penalties; it shuffles the data internally, so pin
# random_state to make the fits repeatable.
lr = LogisticRegression(solver='liblinear', random_state=42)
# 5-fold cross-validated search over every penalty/C combination.
# NOTE: this rebinds `clf` from the baseline model to the grid-search object.
clf = GridSearchCV(lr, parameters, cv=5)
clf.fit(train_x, train_y)

GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000,
                               10000],
                         'penalty': ['l1', 'l2']})

In [137]:
# Mean cross-validated score of the best parameter combination found by the search
# (no `scoring` was passed, so this is the estimator's default score).
clf.best_score_

0.6682685323475699

In [138]:
# Held-out accuracy of the grid-searched model.
tuned_hyp = clf.predict(test_x)
accuracy_score(test_y, tuned_hyp)


0.6840891621829363

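Overall, we have about 68% accuracy on the held-out test set with logistic regression, both before and after tuning. For context, always predicting "no return to prison" would already score about 67% on this test set (7006 of 10408 rows), so the model is barely better than the majority-class baseline. Though this could be worse, it is not where I would like it to be.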