In [37]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn import preprocessing
import numpy as np
from sklearn.metrics import confusion_matrix

In [38]:
df = pd.read_csv(r'../Finalized Data/final_with_doc.csv')

In [39]:
del df['unanimous']

In [40]:
del df['File']

In [41]:
df['days_jail_doc'] = df['days_jail_doc'].apply(lambda x: int(x.split('days')[0]))

In [42]:
1- len(np.where(df['Y'] == 1)[0])/len(df)

0.7431028610354223

In [43]:
y = df['Y']
X = df.loc[:, df.columns != 'Y']

In [44]:
normalized_X = preprocessing.normalize(X)
X_Normalized = pd.DataFrame(normalized_X, columns = X.columns)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_Normalized, y, test_size=0.3,random_state=42)

    Baseline KNN Model

In [20]:
# instantiate learning model (k = 3)
knn = KNeighborsClassifier(n_neighbors=3)
# fitting the model
knn.fit(X_train, y_train)
# predict the response
pred = knn.predict(X_test)
# evaluate accuracy
print (accuracy_score(y_test, pred))

0.766414380322


In [21]:
# creating odd list of K for KNN
myList = list(range(1,70))

# subsetting just the odd ones
neighbors = list(filter(lambda x: x % 2 != 0, myList))

# empty list that will hold cv scores
cv_scores = []

# perform 10-fold cross validation
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

In [22]:
optimal_k = neighbors[cv_scores.index(max(cv_scores))]
print("optimal k : %s" % optimal_k)

optimal k : 23


In [23]:
# instantiate learning model (k = 3)
knn = KNeighborsClassifier(n_neighbors=optimal_k)
# fitting the model
knn.fit(X_train, y_train)
# predict the response
pred = knn.predict(X_test)
# evaluate accuracy

In [24]:
print("Accuracy: %s" % (accuracy_score(y_test, pred)))
cm = confusion_matrix(pred,y_test)
print(cm)

Accuracy: 0.804446546831
[[7577 1799]
 [ 268  926]]


    Understand features

In [25]:
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc

In [49]:
dt = DecisionTreeClassifier(criterion = 'entropy', max_depth = 20)
dt.fit(X_train, y_train)

#Now use built in feature importance attribute to get MI of each feature and Y
feature_mi = dt.feature_importances_
feature_mi_dict = dict(zip(X_train.columns.values, feature_mi))


In [50]:
def FeatureROC(preds, truth, label_string):
    '''
    preds is an nx1 array of predictions
    truth is an nx1 array of truth labels
    label_string is text to go into the plotting label
    '''
    #Student input code here
    #1. call the roc_curve function to get the ROC X and Y values
    fpr, tpr, thresholds = roc_curve(truth, preds)
    #2. Input fpr and tpr into the auc function to get the AUC
    roc_auc = auc(fpr, tpr)
    
    #we are doing this as a special case because we are sending unfitted predictions
    #into the function
    if roc_auc < 0.5:
        fpr, tpr, thresholds = roc_curve(truth, -1 * preds)
        roc_auc = auc(fpr, tpr)
    
    return roc_auc

In [51]:
feature_auc_dict = {}
for col in X_train.columns:
    feature_auc_dict[col] = FeatureROC(X_train[col], y_train, col)

In [52]:
df_auc = pd.DataFrame(pd.Series(feature_auc_dict), columns = ['auc'])
df_mi = pd.DataFrame(pd.Series(feature_mi_dict), columns = ['mi'])

#Now merge the two on the feature name
feat_imp_df = df_auc.merge(df_mi, left_index = True, right_index = True)

In [102]:
feat_ranks = feat_imp_df.rank(axis = 0, ascending = False)
#Now create lists of top 5 features for both auc and mi
top5_auc = list(feat_ranks[(feat_ranks.auc <= 30)].index.values)
top5_mi = list(feat_ranks[(feat_ranks.mi <= 30)].index.values)
top5_auc, top5_mi

(['CC',
  'Gap_days',
  'SC-AJ',
  'Unanimous',
  'a_Friday',
  'a_Tuesday',
  'contested_nan',
  'da_black_nan',
  'da_female_nan',
  'da_university_nan',
  'defense__nan',
  'degree_crime_nan',
  'election_nan',
  'f_Thursday',
  'female',
  'judge_APD_missing',
  'judge_CC_missing',
  'judge_Prof_missing',
  'judge_SC-AJ & SC_missing',
  'judge_SC-AJ_missing',
  'judge_SC_missing',
  'judge_elect_missing',
  'judge_female_missing',
  'judge_info_missing',
  'judge_law_school_missing',
  'judge_military_missing',
  'judge_startyear_missing',
  'nycourts',
  'start year',
  'type_crime_nan'],
 ['AppealD_2008',
  'AppealD_month_x',
  'AppealD_month_y',
  'FirstD_month_x',
  'FirstD_month_y',
  'Gap_days',
  'SC-AJ',
  'Unanimous',
  'a_Friday',
  'a_Monday',
  'a_Thursday',
  'a_Tuesday',
  'a_Wednesday',
  'appellant',
  'contested_nan',
  'da_2005',
  'days_jail_doc',
  'defense__legal aid society',
  'defense__nan',
  'degree_crime_nan',
  'election_nan',
  'f_Friday',
  'f_Monday',

In [85]:
for weights in ['uniform', 'distance']:
    clf = KNeighborsClassifier(n_neighbors=optimal_k, weights=weights)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    print("Accuracy for %s: %s" % (weights,accuracy_score(y_test, pred)))
 

Accuracy for uniform: 0.800473036897
Accuracy for distance: 0.803405865658


In [77]:
l = ['SC-AJ',
  'judge_APD_missing',
  'judge_CC_missing',
  'judge_Prof_missing',
  'judge_SC-AJ & SC_missing',
  'judge_SC-AJ_missing',
  'judge_SC_missing',
  'judge_elect_missing',
  'judge_female_missing',
  'judge_info_missing',
  'judge_law_school_missing',
  'judge_military_missing',
  'judge_startyear_missing',
  'nycourts']

X_train_1 = X_train.drop(l,axis = 1)
X_test_1 = X_test.drop(l,axis = 1)