In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from itertools import product
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV

### Read data and split into train and test sets

In [None]:
# Load the finalized dataset from its CSV export into a DataFrame.
source_csv = '../Finalized Data/final_with_doc.csv'
data = pd.read_csv(source_csv)

In [None]:
# (rows, columns) of the loaded frame, as a quick sanity check on the read.
data.shape

In [None]:
# Preview the first rows to eyeball column names and value formats.
data.head()

In [None]:
# Share of rows whose label Y equals 1 (class balance of the target).
# Averaging the boolean mask is vectorised and equals count(Y == 1) / n_rows,
# without looping over the Series with the builtin sum().
(data.Y == 1).mean()

In [None]:
# Remove columns that are excluded from the feature matrix.
# The frame is modified in place, so this cell cannot be re-run without
# reloading `data` first.
excluded_columns = [
    'FirstD_month_x',
    'AppealD_month_x',
    'File',
    'Unanimous',
    'days_jail_doc',
]
data.drop(columns=excluded_columns, inplace=True)

In [None]:
# Show the column names currently present in `data` as an array.
data.columns.values

In [None]:
# Split the target off the frame: `y` keeps the Y column and Y is then
# removed from `data` in place, leaving only the remaining columns.
y = data['Y']
del data['Y']

In [None]:
# X is a second name for the same DataFrame object as `data` (no copy is
# made), so any later in-place change to one is visible through the other.
X = data

In [None]:
# Print the dtype of every feature column; the row limit is lifted only for
# the duration of this block so the listing is not truncated.
dtype_display_options = ('display.max_rows', None, 'display.max_columns', 3)
with pd.option_context(*dtype_display_options):
    print(X.dtypes)

In [None]:
# Hold out 25% of the rows for testing. A fixed random_state makes the split,
# and every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

### Build initial model for testing

In [None]:
# Baseline gradient-boosting classifier (100 boosting stages, learning rate
# 0.1), fitted on the training split before any tuning.
baseline_params = {'n_estimators': 100, 'learning_rate': 0.1}
model = GradientBoostingClassifier(**baseline_params)
model.fit(X_train, y_train)

In [None]:
# Score the held-out rows with the predicted probability of the positive
# class (second column of predict_proba). ROC/AUC need a continuous score:
# hard labels from predict() collapse the curve to a single operating point.
forecast = model.predict_proba(X_test)[:, 1]

In [None]:
plot_roc(y_test, forecast,'Gradient Boosting Untuned')
plt.show()

In [None]:
def plot_roc(actual,forecast,name):
    
    #ROC
    fpr, tpr, thresholds = metrics.roc_curve(actual, forecast)

    #AUC
    aucc = metrics.auc(fpr, tpr)

    plt.plot(fpr, tpr, label = name + ' AUC: '+str(round(aucc, 4)))
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.grid(True)
    plt.legend(loc = 'lower right', title='AUC')

In [None]:
def show_most_informative_features(model, names, n=20):
    """Print the `n` features with the largest importance, highest first.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_``.
    names : sequence
        Feature names, in the same order as ``model.feature_importances_``.
    n : int, default 20
        Maximum number of rows to print.

    Each row is printed as a tab-separated importance (4 decimals) and the
    feature name left-aligned in a 15-character field.
    """
    ranked = list(zip(model.feature_importances_, names))
    # Descending by importance; ties fall back to comparing the names.
    ranked.sort(reverse=True)
    for importance, feature in ranked[:n]:
        print("\t%.4f\t%-15s\t" % (importance, feature))

In [None]:
# Rank the baseline model's features by importance (top 20 by default).
show_most_informative_features(model, X_train.columns.values)

### Fine Tuning

In [None]:
# Grid search: candidate hyper-parameter values. Every combination of the
# lists below is evaluated (4 * 2 * 2 * 2 = 32 settings).
param_grid = [
    dict(
        n_estimators=[20, 50, 100, 200],
        learning_rate=[0.1, 0.05],
        max_depth=[2, 5],
        min_samples_split=[5, 10],
    )
]

In [None]:
# Cross-validated search over param_grid, ranking candidates by ROC AUC.
grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=param_grid,
    scoring='roc_auc',
)

In [None]:
# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination the search selected as best.
grid_search.best_params_

In [None]:
# Build the tuned model from whatever the search actually selected, instead of
# hand-copied values that go stale on a re-run and that left out
# min_samples_split even though it is part of the grid.
best_gbt = GradientBoostingClassifier(**grid_search.best_params_)

In [None]:
# Fit the tuned model on the training split.
best_gbt.fit(X_train, y_train)

In [None]:
# Score the held-out rows with the predicted probability of the positive
# class (second column of predict_proba). ROC/AUC need a continuous score:
# hard labels from predict() collapse the curve to a single operating point.
forecast = best_gbt.predict_proba(X_test)[:, 1]

In [None]:
# ROC curve for the tuned model. The legend label previously said "Untuned",
# copied from the baseline plot.
plot_roc(y_test, forecast, 'Gradient Boosting Tuned')
plt.show()

In [None]:
# Rank the tuned model's features by importance (top 20 by default).
show_most_informative_features(best_gbt, X_train.columns.values)

In [None]:
# NOTE(review): unfinished cell. drop('') looks up the label '' on the row
# index (axis=0 is the default) and the result is not assigned, so this either
# raises KeyError or has no effect. Presumably a column name was intended;
# complete the call or remove the cell.
X_train.drop('')