In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [None]:
# Load the raw churn training data and clean it with the project helper.
# NOTE(review): clean_churn_data is not defined or imported in the cells shown
# here — confirm it is brought into scope before this cell runs.
data = pd.read_csv('../data/churn_train.csv')
data_clean = clean_churn_data(data)

# Work on a copy so data_clean itself keeps the target column.
df_copy = data_clean.copy()
y = df_copy.pop('churn?').values  # target vector; pop() also removes it from df_copy
X = df_copy.values                # the remaining columns form the feature matrix

# Hold out a test set. Fixing random_state keeps the split (and every score
# and plot derived from it) identical across Restart & Run All.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)
X_train.shape, X_test.shape

In [None]:
# Baseline gradient boosting classifier, constructed with no arguments.
# fit() returns the fitted estimator, so construction and training chain.
model = GradientBoostingClassifier().fit(X_train, y_train)

# Predictions for the held-out rows.
y_hat = model.predict(X_test)

In [None]:
# Feature importances of the fitted model, one row per column of df_copy,
# sorted ascending so the largest bar ends up at the top of the barh plot.
importances = (
    pd.DataFrame({'Feature Importances': model.feature_importances_},
                 index=df_copy.columns)
    .sort_values(by='Feature Importances')
)

# Explicit figure/axes so the title and the saved file are tied to this plot
# rather than to whatever figure happens to be "current".
fig, ax = plt.subplots()
importances.plot(kind='barh', ax=ax)
# The model is a GradientBoostingClassifier, so the title says Classifier.
ax.set_title("Feature Importances in Gradient Boosting Classifier")
# NOTE(review): this cell writes to ../images/ while the later plotting cells
# write to ../img/ — confirm which directory is intended.
fig.savefig("../images/feature_import_gbc.png", bbox_inches='tight', dpi=350)

In [None]:

# First look at n_estimators: fit one large ensemble and score it.
n = 3000
model = GradientBoostingClassifier(n_estimators=n)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Pass n rather than repeating the literal, so the scored stage count always
# matches the fitted model when n is changed.
# NOTE(review): train_test_score is not defined in the cells shown here;
# presumably it returns per-stage train and test errors — confirm.
train_score, test_score = train_test_score(n, model, X_train, X_test, y_train, y_test)

In [None]:
# Position of the lowest test error across all scored stages. Computed before
# plotting so the marker line follows the current run instead of a value
# copied from an earlier one.
# NOTE(review): argmin is a 0-based position; if test_score[i] is the error
# after i + 1 trees, the tree count is opt_estimators + 1 — confirm.
opt_estimators = np.argmin(test_score)

# Only the first n_shown stages are drawn to keep the curves readable.
n_shown = 1000

fig, ax = plt.subplots()
ax.plot(train_score[:n_shown], label="Training Errors")
ax.plot(test_score[:n_shown], label="Testing Errors")
# Thin vertical marker at the best stage; if the minimum lies beyond n_shown
# the x-axis extends to include it.
ax.axvline(x=opt_estimators, color='black', linewidth=0.4)
ax.set_xlabel("Number of Estimators/Trees")
ax.set_ylabel("Average Errors")
ax.set_title("Testing and Training Errors vs N_estimators")
ax.legend()
fig.savefig("../img/test_train_errors_gbc.png", dpi=250)

print(opt_estimators)

In [None]:
# Now look at learning rate and plot the test-error curve for each value
lr = [0.01, 0.05, 0.1, 0.5, 1]
n_estimators = 1000  # same ensemble size for every learning rate
fig, ax = plt.subplots()

# Fit one model per learning rate and overlay its test-error curve.
# Note: train_score/test_score are rebound on every pass, so after this cell
# they hold the scores of the last learning rate, not of the n_estimators run.
for rate in lr:
    m = GradientBoostingClassifier(learning_rate=rate, n_estimators=n_estimators)
    m.fit(X_train, y_train)

    train_score, test_score = train_test_score(n_estimators, m, X_train, X_test, y_train, y_test)

    ax.plot(test_score, label=rate)

# Labels, legend and the saved file do not depend on the loop variable, so
# they are set once after all curves are drawn instead of on every pass.
ax.set_title("Test Errors by Learning Rate")
ax.set_xlabel("N Estimators")
ax.set_ylabel("Test Errors")
ax.legend()
fig.savefig('../img/lr_errors_gbc.png', dpi=250)