In [None]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
client = bigquery.Client()
import matplotlib.pylab as plt
import seaborn as sns
import sklearn
%matplotlib inline

# Importing Table

In [None]:
full_table = """
SELECT
  DATE_DIFF("2019-09-01", DTV_Last_Activation_Dt, month) AS DTV_LA_Mon_Before_2019_09_01,
  DATE_DIFF("2019-09-01", Sports_Last_Activation_Dt, month) AS Sports_LA_Mon_Before_2019_09_01,
  DATE_DIFF(Sports_Last_Activation_Dt, DTV_Last_Activation_Dt, month) AS DateDiff_months,
  Offers_Applied_Ever_Sports,
  h_age_coarse,
  h_number_of_adults,
  h_number_of_children_in_hh,
  DTV_Product_Holding,
  Curr_Offer_Amount_Sports,
  Curr_Offer_Length_Sports,
  Target_sports_downgrade
FROM
  `sky-uk-ids-analytics-prod.NPR13.Grad_Example_Propensity_Mart_Sports_Downgrades`
"""
full = client.query(full_table).to_dataframe()
full.head(5)

In [None]:
# Strip the trailing "M" from the offer length strings and store the column as floats
full['Curr_Offer_Length_Sports'] = (
    full['Curr_Offer_Length_Sports']
    .str.rstrip('M')
    .astype('float64')
)

In [None]:
full.info()

# Visualising

### Age

In [None]:
ax = full['h_age_coarse'].value_counts().reindex(["Unknown", "18-25", "26-35","36-45","46-55","56-65","66+"])
ax.plot(kind='bar', figsize=(10,5), title="Number of people in age range", xlabel = "Age Range", ylabel = "Frequency")
plt.show()

In [None]:
# Household composition, one bar chart per panel.
# Each entry is (grouping column, averaged column, x-axis label, y-axis label).
household_panels = [
    ('h_age_coarse', 'h_number_of_children_in_hh', 'Age', 'Average Children in hh'),
    ('h_age_coarse', 'h_number_of_adults', 'Age', 'Average Adults in hh'),
    ('h_number_of_children_in_hh', 'h_number_of_adults', 'Number children in hh', 'Average adults in hh'),
]

fig, panel_axes = plt.subplots(1, 3, figsize=(25, 7))
for panel_ax, (group_col, value_col, x_label, y_label) in zip(panel_axes, household_panels):
    # .mean() rather than .agg(np.mean): passing the NumPy callable to agg is deprecated in recent pandas
    full.groupby(group_col)[value_col].mean().plot(kind="bar", ax=panel_ax)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel(y_label)
plt.show()

Number of children in the household could help predict age, or vice versa (the relationship looks roughly Gaussian), so we may want to use an iterative imputer when imputing null values.

### Current offer length for sports

In [None]:
sns.histplot(full['Curr_Offer_Length_Sports'],bins = (12,24,100,800,1000))
plt.show()

We can see that there are many unrealistic offer lengths (i.e. over 5 years), so we shall consider bucketing these offers during pre-processing to make sure that the offers over 800 stand out.

In [None]:
Off_len_down = """
SELECT
  Curr_Offer_Length_Sports, SUM(Target_sports_downgrade) AS Downgrades
FROM
  `sky-uk-ids-analytics-prod.NPR13.Grad_Example_Propensity_Mart_Sports_Downgrades`
WHERE Target_sports_downgrade = 1
AND Curr_Offer_Length_Sports IS NOT NULL
GROUP BY Curr_Offer_Length_Sports
"""
offerlen_info = client.query(Off_len_down).to_dataframe()

In [None]:
# Re-run the query so this cell can be executed repeatedly: the cleaning below changes the
# column from strings to floats, after which the `.str` accessor would no longer work.
offerlen_info = client.query(Off_len_down).to_dataframe()
offerlen_info['Curr_Offer_Length_Sports'] = (
    offerlen_info['Curr_Offer_Length_Sports'].str.rstrip('M').astype('float64')
)
# Order rows by offer length so that the cumulative sum plotted afterwards accumulates from the
# shortest to the longest offer. Sorting on the value replaces a hard-coded row permutation,
# which only works if the query returns exactly ten rows in one particular order (the query
# has no ORDER BY, so that order is not guaranteed).
offerlen_info = offerlen_info.sort_values('Curr_Offer_Length_Sports').reset_index(drop=True)
offerlen_info

In [None]:
plt.plot(offerlen_info['Curr_Offer_Length_Sports'], offerlen_info['Downgrades'].cumsum(), color = 'blue', alpha = 1)
plt.xlabel("Offer Length")
plt.ylabel("Cumulative Downgrades")
plt.show()

### DTV Product Holding

In [None]:
holdings_downgrades = """
SELECT
  DTV_Product_Holding, SUM(Target_sports_downgrade) AS Total_downgrades, COUNT(*) as Num_with_holding
FROM
  `sky-uk-ids-analytics-prod.NPR13.Grad_Example_Propensity_Mart_Sports_Downgrades`
GROUP BY DTV_Product_Holding
"""
product_info = client.query(holdings_downgrades).to_dataframe()
product_info[['DTV_Product_Holding']] = product_info[['DTV_Product_Holding']].fillna("Unknown")
product_info

In [None]:
# Number of customers vs total downgrades
# Downgrades for each product holding
plt.figure(figsize=(15,10))
plt.subplot(1, 2, 1)
plt.pie(product_info['Total_downgrades'], labels = product_info['DTV_Product_Holding'], autopct='%1.1f%%')
plt.title("Proportion of downgrades from DTV product")
plt.subplot(1, 2, 2)
plt.pie(product_info['Num_with_holding'], labels = product_info['DTV_Product_Holding'], autopct='%1.1f%%')
plt.title("Proportion of customers with DTV product")
plt.show()

We will use a one-hot encoder to encode these categories, dropping one of them (most likely the unknown category), since the remaining indicator columns already carry all of the information about product holdings.

### Offers and downgrades

In [None]:
Offers_downgrades = """
SELECT
  Offers_Applied_Ever_Sports, AVG(Curr_Offer_Amount_Sports) AS Average_offer, SUM(Target_sports_downgrade) AS Total_downgrades,
FROM
  `sky-uk-ids-analytics-prod.NPR13.Grad_Example_Propensity_Mart_Sports_Downgrades`
GROUP BY Offers_Applied_Ever_Sports
"""
offer_info = client.query(Offers_downgrades).to_dataframe()
offer_info.head()

In [None]:
plt.bar(offer_info['Offers_Applied_Ever_Sports'],offer_info['Average_offer'])
plt.xlabel('Offers applied ever')
plt.ylabel('Average offer')
plt.show()

In [None]:
plt.bar(offer_info['Offers_Applied_Ever_Sports'],offer_info['Total_downgrades'])
plt.show()

### Last Activation date and DateDiff

In [None]:
f, axs = plt.subplots(1,3,figsize=(30,5))
plt.subplot(1, 3, 1)
sns.histplot(full['DateDiff_months'],binwidth = 20)
plt.subplot(1, 3, 2)
sns.histplot(full['DTV_LA_Mon_Before_2019_09_01'],binwidth = 20)
plt.subplot(1, 3, 3)
sns.histplot(full['Sports_LA_Mon_Before_2019_09_01'],binwidth = 20)
plt.show()

# Pre-Processing

In [None]:
# Replace each age band with a representative numeric age (the midpoint of the band, and 73 for
# the open-ended "66+" band) and set "Unknown" to NaN so that it can be imputed later.
age_band_to_years = {
    "18-25": 21.5,
    "26-35": 30.5,
    "36-45": 40.5,
    "46-55": 50.5,
    "56-65": 60.5,
    "66+": 73,
    "Unknown": np.nan,
}
# A single replace with a mapping covers all bands at once; pd.to_numeric then makes the float
# dtype explicit instead of relying on pandas inferring it from an object column, and raises if
# an unexpected age label is still present. Re-running the cell on already-converted values is a no-op.
full['h_age_coarse'] = pd.to_numeric(full['h_age_coarse'].replace(age_band_to_years))
full.head()

In [None]:
# Split into features and target.
# Select by column name rather than by position so the split stays correct even if the
# column list or order of the source query changes.
target_column = 'Target_sports_downgrade'
X = full.drop(columns=[target_column])
y = full[target_column]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.pipeline import Pipeline as iPipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from optbinning import BinningProcess

In [None]:
# Making this all into a pipeline
# Each section below pairs a list of column names with the Pipeline applied to those columns.

# Offer amount: missing values are filled with 0, without a missing-value indicator column.
Offer_am = ['Curr_Offer_Amount_Sports']
Offer_am_transformer = Pipeline(steps=[
    ('zero imputer', SimpleImputer(strategy='constant', fill_value=0, add_indicator=False))
    ])

# Offer length: missing values are filled with 0 and add_indicator=True appends an extra
# column flagging which rows were missing.
Offer_len = ['Curr_Offer_Length_Sports']
Offer_len_transformer = Pipeline(steps=[
    ('zero imputer', SimpleImputer(strategy='constant', fill_value=0, add_indicator=True))
    ])

# Household features: despite the "mean" naming, these are filled with an IterativeImputer,
# not a mean imputer.
mean_features = ['h_number_of_adults', 'h_number_of_children_in_hh', 'h_age_coarse']
mean_transformer = Pipeline(steps=[
    ('Iterative impute', IterativeImputer(n_nearest_features=3))
    ])

# Product holding: the string 'Unknown' is first replaced with NaN, then the column is
# one-hot encoded into a dense array with the first category dropped.
categorical_features = ['DTV_Product_Holding']
categorical_transformer = Pipeline(steps=[
    ('Unknown to null', SimpleImputer(missing_values='Unknown', strategy='constant', fill_value=np.nan)),
    ('One Hot Encoding', OneHotEncoder(sparse=False, drop='first'))
    ])

# Months since last Sports activation: missing values are filled with 0.
#'DTV_LA_Mon_Before_2019_09_01' removed due to collinearity
Dates_LA = ['Sports_LA_Mon_Before_2019_09_01']
Dates_LA_transformer = Pipeline(steps=[
    ('Null to zero', SimpleImputer(strategy='constant', fill_value=0))
    ])

# Activation date difference and number of offers ever applied: missing values are filled
# with the column median.
Date_Diff_OE = ['DateDiff_months', 'Offers_Applied_Ever_Sports']
Dates_Diff_OE_transformer = Pipeline(steps=[
    ('Null to median', SimpleImputer(strategy='median'))
    ])


In [None]:
# Pipeline to clean data and impute
Cleaner = ColumnTransformer([
    ("Offer Ammount", Offer_am_transformer, Offer_am),
    ("Offer Length Nulls", Offer_len_transformer, Offer_len),
    ("Means", mean_transformer, mean_features),
    ("Onehot", categorical_transformer, categorical_features),
    ("Dates LA", Dates_LA_transformer, Dates_LA),
    ("Dates Difference and OE", Dates_Diff_OE_transformer, Date_Diff_OE),
    ])

In [None]:
# Defining sampling functions and pipeline to deal with imbalance.
# Both samplers draw rows at random, so they are seeded to make the resampled training set
# (and every metric computed from models trained on it) reproducible across runs.
# 42 is the same seed that the train/test split uses.
oversample = RandomOverSampler(sampling_strategy=0.5, random_state=42)
undersample = RandomUnderSampler(sampling_strategy=0.5, random_state=42)

# NOTE(review): both steps target the same 0.5 minority/majority ratio, so after over-sampling
# the under-sampler presumably has nothing left to remove - confirm whether a different
# ratio was intended for the second step.
Sampler = iPipeline([
    ('Oversampling', oversample),
    ('Undersampling', undersample)
    ])

In [None]:
X_train_sampled, y_train_sampled = Sampler.fit_resample(X_train, y_train)
#X_train_sampled, y_train_sampled = X_train, y_train
X_train_t = Cleaner.fit_transform(X_train_sampled)
X_test_t = Cleaner.transform(X_test)

In [None]:
X_train_t.shape

In [None]:
column_names = ['Curr_Offer_Amount_Sports', 'Curr_Offer_Length_Sports', 'Curr_Off_Len_null_ind', 'hh_adults', 'hh_children', 'age_est', 'Or_ind', 'OrLg2017_ind', 'OrLg_ind', 'Ent_ind', 'SkyQ_ind', 'Var_ind', 'Unknown_ind', 'Sports_LA_Mon_Before_2019_09_01', 'DateDiff_months', 'Offers_Applied_Ever_Sports']

In [None]:
# Checking for correlated features
df_train = pd.DataFrame(X_train_t, columns = column_names)
df_test = pd.DataFrame(X_test_t, columns = column_names)
plt.figure(figsize=(20,10))
sns.heatmap(df_train.corr(),annot=True)
plt.show()

In [None]:
# Removing correlated features
cols_to_remove = ['Curr_Offer_Amount_Sports']
X_train_r = df_train.drop(columns = cols_to_remove)
X_test_r = df_test.drop(columns = cols_to_remove)

In [None]:
columns = [x for x in column_names if x not in cols_to_remove]
categorical_variables = ['Curr_Off_Len_null_ind', 'Or_ind', 'OrLg2017_ind', 'OrLg_ind', 'Ent_ind', 'SkyQ_ind', 'Var_ind', 'Unknown_ind']

# Models

In [None]:
from sklearn.feature_selection import RFE, SequentialFeatureSelector
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
binning_process = BinningProcess(columns,
                                 categorical_variables=categorical_variables)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Optbinning
rfcP = Pipeline(steps=[('binning_process', binning_process),
                     ('scaling', StandardScaler()),
                     ('Random_Forest', RandomForestClassifier(n_estimators = 100, max_depth=6))])

rfcP.fit(X_train_r, y_train_sampled)

predict_rfc = rfcP.predict(X_test_r)
predict_proba_rfc = rfcP.predict_proba(X_test_r)

In [None]:
print("AUC score for Random Forest Pipeline is " + str(sklearn.metrics.roc_auc_score(y_test, predict_proba_rfc[:,1])))
print(confusion_matrix(y_test, predict_rfc))
print(classification_report(y_test,predict_rfc))

In [None]:
binning_process.summary()

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Optbinning
lr = LogisticRegression(C=0.09, penalty='l1', solver='liblinear')

lrP = Pipeline(steps=[('binning_process', binning_process),
                     ('scaling', StandardScaler()),
                     ('regressor', lr)])

lrP.fit(X_train_r, y_train_sampled)

predict_lr = lrP.predict(X_test_r)
predict_proba_lr = lrP.predict_proba(X_test_r)

In [None]:
print("AUC score for Logistic Regression Pipeline is " + str(sklearn.metrics.roc_auc_score(y_test, predict_proba_lr[:,1])))
print(confusion_matrix(y_test, predict_lr))
print(classification_report(y_test,predict_lr))

## Decision Tree

In [None]:
from sklearn import tree

In [None]:
# Optbinning, Decision Tree Pipeline
dt = tree.DecisionTreeClassifier(max_depth=5)

dtP = Pipeline(steps=[('binning_process', binning_process),
                     ('scaling', StandardScaler()),
                     ('Decision_tree', dt)])

dtP.fit(X_train_r, y_train_sampled)

predict_dt = dtP.predict(X_test_r)
predict_proba_dt = dtP.predict_proba(X_test_r)

In [None]:
print("AUC score for Decision Tree Pipeline is " + str(sklearn.metrics.roc_auc_score(y_test, predict_proba_dt[:,1])))
print(confusion_matrix(y_test, predict_dt))
print(classification_report(y_test,predict_dt))

## Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Optbinning
gbcP = Pipeline(steps=[('binning_process', binning_process),
                     ('scaling', StandardScaler()),
                     ('GBC', GradientBoostingClassifier())])

gbcP.fit(X_train_r, y_train_sampled)

predict_gbc = gbcP.predict(X_test_r)
predict_proba_gbc = gbcP.predict_proba(X_test_r)

In [None]:
print("AUC score for GBC Pipeline is " + str(sklearn.metrics.roc_auc_score(y_test, predict_proba_gbc[:,1])))
print(confusion_matrix(y_test, predict_gbc))
print(classification_report(y_test,predict_gbc))

### ROC Curves, PR Curve and Lift

In [None]:
from sklearn import metrics, model_selection
from sklearn.metrics import roc_auc_score, precision_recall_curve, f1_score, auc

In [None]:
def lift(pred, y_test):
    """Return the lift of the top-scoring decile, rounded to two decimals.

    Lift is the positive rate among the 10% of rows with the highest predicted
    probability of class 1, divided by the positive rate over all rows.

    Parameters
    ----------
    pred : array-like of shape (n_samples, 2)
        Class probabilities as returned by ``predict_proba``; column 1 is used
        as the score.
    y_test : array-like of shape (n_samples,)
        True 0/1 labels, in the same row order as ``pred``.

    Returns
    -------
    float
        The first-decile lift, or ``nan`` when it is undefined (no rows, or no
        positive labels at all).
    """
    scores = np.asarray(pred)[:, 1]
    actual = np.asarray(y_test)

    # Lift is undefined without any rows or without any positives; return NaN
    # instead of failing with a division by zero.
    if actual.size == 0:
        return float('nan')
    base_rate = actual.mean()
    if base_rate == 0:
        return float('nan')

    # With fewer than ten rows the "decile" would be empty, so use at least one row.
    top_count = max(1, len(actual) // 10)
    # Stable sort on the negated scores ranks rows from highest to lowest probability.
    ranking = np.argsort(-scores, kind='stable')
    top_rate = actual[ranking[:top_count]].mean()
    return round(float(top_rate / base_rate), 2)

In [None]:
classifiers = [rfcP, lrP, dtP, gbcP]

# Legend prefix and positive-class test probabilities for each fitted pipeline.
roc_inputs = [
    ('RFC', predict_proba_rfc[:, 1]),
    ('LR', predict_proba_lr[:, 1]),
    ('DT', predict_proba_dt[:, 1]),
    ('GBC', predict_proba_gbc[:, 1]),
]

plt.figure(figsize=(7, 7))
ax = plt.gca()
# Draw the curves with metrics.roc_curve from the probabilities computed when each model was
# fitted: metrics.plot_roc_curve no longer exists in current scikit-learn releases, and this
# also avoids predicting on the test set a second time.
for model_label, positive_proba in roc_inputs:
    false_pos_rate, true_pos_rate, thresholds = metrics.roc_curve(y_test, positive_proba)
    ax.plot(false_pos_rate, true_pos_rate,
            label='%s (area = %0.2f)' % (model_label, metrics.roc_auc_score(y_test, positive_proba)))
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc='lower right')
plt.show()

In [None]:
AUCs = [metrics.roc_auc_score(y_test, predict_proba_rfc[:,1]), metrics.roc_auc_score(y_test, predict_proba_lr[:,1]), metrics.roc_auc_score(y_test, predict_proba_dt[:,1]), metrics.roc_auc_score(y_test, predict_proba_gbc[:,1])]
lifts = [lift(predict_proba_rfc, y_test), lift(predict_proba_lr, y_test), lift(predict_proba_dt, y_test), lift(predict_proba_gbc, y_test)]

In [None]:
Table = pd.DataFrame({'Model' : ["Random Forest", "Logistic Regression", "Decision Tree", "Gradient Boosting"], 'AUC' : AUCs, 'Lift on First Decile': lifts})

In [None]:
Table

In [None]:
rfc_precision, rfc_recall, _ = precision_recall_curve(y_test, predict_proba_rfc[:,1])

lr_precision, lr_recall, _ = precision_recall_curve(y_test, predict_proba_lr[:,1])

dt_precision, dt_recall, _ = precision_recall_curve(y_test, predict_proba_dt[:,1])

gbc_precision, gbc_recall, _ = precision_recall_curve(y_test, predict_proba_gbc[:,1])

# plot the precision-recall curves
no_skill = len(y_test[y_test==1]) / len(y_test)

plt.figure(figsize=(10, 7))
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
plt.plot(rfc_recall, rfc_precision, markersize=1, label='RFC')
plt.plot(lr_recall, lr_precision, markersize=1, label='LR')
plt.plot(dt_recall, dt_precision, markersize=1, label='DT')
plt.plot(gbc_recall, gbc_precision, markersize=1, label='GBC')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

## Without Sampling

In [None]:
X_train_n1 = Cleaner.fit_transform(X_train)
X_test_n1 = Cleaner.transform(X_test)

df_train_n = pd.DataFrame(X_train_n1, columns = column_names)
df_test_n = pd.DataFrame(X_test_n1, columns = column_names)

X_train_n = df_train_n.drop(columns = cols_to_remove)
X_test_n = df_test_n.drop(columns = cols_to_remove)

In [None]:
# Random Forest

rfcP_n = Pipeline(steps=[('binning_process', binning_process),
                     ('scaling', StandardScaler()),
                     ('Random_forest', RandomForestClassifier(n_estimators = 100, max_depth=6))])

rfcP_n.fit(X_train_n, y_train)

predict_rfc_n = rfcP_n.predict(X_test_n)
predict_proba_rfc_n = rfcP_n.predict_proba(X_test_n)

In [None]:
# Logistic Regression (trained on the unsampled data)
from sklearn.base import clone

# A Pipeline keeps a reference to the estimator objects it is given and fits them in place.
# `lr` is already the final step of the fitted sampled pipeline `lrP`, so fitting it again here
# would overwrite that model's coefficients. Fit an unfitted copy with the same
# hyper-parameters instead.
lrP_n = Pipeline(steps=[('binning_process', binning_process),
                     ('scaling', StandardScaler()),
                     ('Logistic_regression', clone(lr))])

lrP_n.fit(X_train_n, y_train)

predict_lr_n = lrP_n.predict(X_test_n)
predict_proba_lr_n = lrP_n.predict_proba(X_test_n)

In [None]:
# Decision Tree (trained on the unsampled data)
from sklearn.base import clone

# A Pipeline keeps a reference to the estimator objects it is given and fits them in place.
# `dt` is already the final step of the fitted sampled pipeline `dtP`, so fitting it again here
# would overwrite that model's tree and feature importances. Fit an unfitted copy with the
# same hyper-parameters instead.
dtP_n = Pipeline(steps=[('binning_process', binning_process),
                     ('scaling', StandardScaler()),
                     ('Decision_Tree', clone(dt))])

dtP_n.fit(X_train_n, y_train)

predict_dt_n = dtP_n.predict(X_test_n)
predict_proba_dt_n = dtP_n.predict_proba(X_test_n)

In [None]:
# Gradient Boosting

gbcP_n = Pipeline(steps=[('binning_process', binning_process),
                     ('scaling', StandardScaler()),
                     ('GBC', GradientBoostingClassifier())])

gbcP_n.fit(X_train_n, y_train)

predict_gbc_n = gbcP_n.predict(X_test_n)
predict_proba_gbc_n = gbcP_n.predict_proba(X_test_n)

In [None]:
AUCs_n = [metrics.roc_auc_score(y_test, predict_proba_rfc_n[:,1]), metrics.roc_auc_score(y_test, predict_proba_lr_n[:,1]), metrics.roc_auc_score(y_test, predict_proba_dt_n[:,1]), metrics.roc_auc_score(y_test, predict_proba_gbc_n[:,1])]
lifts_n = [lift(predict_proba_rfc_n, y_test), lift(predict_proba_lr_n, y_test), lift(predict_proba_dt_n, y_test), lift(predict_proba_gbc_n, y_test)]

In [None]:
Table_n = pd.DataFrame({'Model' : ["Random Forest", "Logistic Regression", "Decision Tree", "Gradient Boosting"], 'AUC' : AUCs_n, 'Lift on First Decile': lifts_n})
# Unsampled Results
Table_n

In [None]:
# Sampled Results
Table

# Exploring important features

In [None]:
# Feature Importance for sampled Data

In [None]:
plt.figure(figsize=(30, 7))

plt.subplot(1,4,1)
plt.title('Random Forest Feature Importances')
plt.bar(columns, rfcP[2].feature_importances_, color='b', alpha = 0.7)
plt.xticks(rotation=90)

plt.subplot(1,4,2)
plt.title('Logistic Regression Feature Coeffiecients (abs)')
plt.bar(columns, abs(lrP[2].coef_[0]), color='r', alpha = 0.7)
plt.xticks(rotation=90)

plt.subplot(1,4,3)
plt.title('Decision Tree Feature Importances')
plt.bar(columns, dtP[2].feature_importances_, color='g', alpha = 0.7)
plt.xticks(rotation=90)

plt.subplot(1,4,4)
plt.title('Gradient Boosting Classifier Feature Importances')
plt.bar(columns, gbcP[2].feature_importances_, color='y', alpha = 0.7)
plt.xticks(rotation=90)

plt.show()

From this it is clear that the most important features are the current offer length, the number of months between the last Sports activation and 2019-09-01, DateDiff, and whether or not the customer holds Sky Entertainment.

### Feature Selection Random Forest

In [None]:
features_rfc = ['Curr_Offer_Length_Sports', 'Curr_Off_Len_null_ind', 'Ent_ind', 'Sports_LA_Mon_Before_2019_09_01', 'DateDiff_months']
X_train_rfcfs = X_train_r[features_rfc]
X_test_rfcfs = X_test_r[features_rfc]

In [None]:
binning_process_rfcfs = BinningProcess(features_rfc,
                                 categorical_variables=['Curr_Off_Len_null_ind', 'Ent_ind'])

In [None]:
rfcP_fs = Pipeline(steps=[('binning_process', binning_process_rfcfs),
                     ('scaling', StandardScaler()),
                     ('Random_Forest', RandomForestClassifier(n_estimators = 100, max_depth=6))])

rfcP_fs.fit(X_train_rfcfs, y_train_sampled)

predict_rfc_fs = rfcP_fs.predict(X_test_rfcfs)
predict_proba_rfc_fs = rfcP_fs.predict_proba(X_test_rfcfs)

In [None]:
print("AUC score for Random Forest Pipeline with 5 features is " + str(sklearn.metrics.roc_auc_score(y_test, predict_proba_rfc_fs[:,1])))
print(confusion_matrix(y_test, predict_rfc_fs))
print("It has a lift of "+str(lift(predict_proba_rfc_fs, y_test)))

### Feature Selected Logistic Regression

In [None]:
features_lr = ['Curr_Offer_Length_Sports', 'Curr_Off_Len_null_ind', 'Ent_ind', 'Sports_LA_Mon_Before_2019_09_01', 'DateDiff_months']
X_train_lrfs = X_train_r[features_lr]
X_test_lrfs = X_test_r[features_lr]

In [None]:
binning_process_lrfs = BinningProcess(features_lr,
                                 categorical_variables=['Curr_Off_Len_null_ind', 'Ent_ind'])

In [None]:
lrP_fs = Pipeline(steps=[('binning_process', binning_process_lrfs),
                     ('scaling', StandardScaler()),
                     ('Linear_regression', lr)])

lrP_fs.fit(X_train_lrfs, y_train_sampled)

predict_lr_fs = lrP_fs.predict(X_test_lrfs)
predict_proba_lr_fs = lrP_fs.predict_proba(X_test_lrfs)

In [None]:
print("AUC score for Logistic Regression Pipeline with only these 5 features is " + str(sklearn.metrics.roc_auc_score(y_test, predict_proba_lr_fs[:,1])))
print(confusion_matrix(y_test, predict_lr_fs))
print("It has a lift of "+str(lift(predict_proba_lr_fs, y_test)))

### Tuning this logistic regression model

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
grid_values_lr = {'penalty': ['l1', 'l2'],'C':[0.001,.009,0.01,.09,1,5,10,25], 'solver':['liblinear']}
tuned_lrP_fs = Pipeline(steps=[('binning_process', binning_process_lrfs),
                     ('scaling', StandardScaler()),
                     ('Grid_search', GridSearchCV(LogisticRegression(), param_grid=grid_values_lr ,scoring = 'roc_auc'))])

tuned_lrP_fs.fit(X_train_lrfs, y_train_sampled)

tuned_predict_lr_fs = tuned_lrP_fs.predict(X_test_lrfs)
tuned_predict_proba_lr_fs = tuned_lrP_fs.predict_proba(X_test_lrfs)

In [None]:
print("AUC score for the tuned Logistic Regression Pipeline with only these 5 features is " + str(sklearn.metrics.roc_auc_score(y_test, tuned_predict_proba_lr_fs[:,1])))
print(confusion_matrix(y_test, tuned_predict_lr_fs))
print("It has a lift of "+str(lift(tuned_predict_proba_lr_fs, y_test)))

In [None]:
tuned_lrP_fs[2].best_params_

### Feature Selected Decision Tree

In [None]:
features_dt = ['Curr_Offer_Length_Sports', 'Ent_ind', 'Sports_LA_Mon_Before_2019_09_01', 'DateDiff_months']
X_train_dtfs = X_train_r[features_dt]
X_test_dtfs = X_test_r[features_dt]

In [None]:
binning_process_dtfs = BinningProcess(features_dt,
                                 categorical_variables=['Ent_ind'])

In [None]:
dtP_fs = Pipeline(steps=[('binning_process', binning_process_dtfs),
                     ('scaling', StandardScaler()),
                     ('decision_tree', tree.DecisionTreeClassifier(max_depth=6))])

dtP_fs.fit(X_train_dtfs, y_train_sampled)

predict_dt_fs = dtP_fs.predict(X_test_dtfs)
predict_proba_dt_fs = dtP_fs.predict_proba(X_test_dtfs)

In [None]:
print("AUC score for the Decision Tree with 4 features is " + str(sklearn.metrics.roc_auc_score(y_test, predict_proba_dt_fs[:,1])))
print(confusion_matrix(y_test, predict_dt_fs))
print("It has a lift of "+str(lift(predict_proba_dt_fs, y_test)))

### Tuning the Decision Tree

In [None]:
tr_fs = Pipeline(steps=[('binning_process', binning_process_dtfs),
                     ('scaling', StandardScaler())])

X_train_dtfs_tr = tr_fs.fit_transform(X_train_dtfs, y_train_sampled)
X_test_dtfs_tr = tr_fs.transform(X_test_dtfs)

In [None]:
dt = tree.DecisionTreeClassifier()
grid_values_dt = {'max_depth': [3,4,5], 'min_samples_leaf': [1, 2, 4]}
# Pick hyper-parameters by ROC AUC, the metric this notebook reports for every model and the
# one used for the logistic regression grid search, instead of GridSearchCV's default of the
# estimator's own score (accuracy for a classifier).
grid_search = GridSearchCV(dt, param_grid = grid_values_dt, scoring = 'roc_auc')

grid_search.fit(X_train_dtfs_tr, y_train_sampled)
tuned_predict_dt_fs = grid_search.predict(X_test_dtfs_tr)
tuned_predict_proba_dt_fs = grid_search.predict_proba(X_test_dtfs_tr)

In [None]:
print("AUC score for the tuned Decision Tree with 4 features is " + str(sklearn.metrics.roc_auc_score(y_test, tuned_predict_proba_dt_fs[:,1])))
print(confusion_matrix(y_test, tuned_predict_dt_fs))
print("It has a lift of "+str(lift(tuned_predict_proba_dt_fs, y_test)))

In [None]:
grid_search.best_params_

### Feature Selected GBC

In [None]:
features_gbc = ['Curr_Offer_Length_Sports', 'Curr_Off_Len_null_ind', 'Ent_ind', 'Sports_LA_Mon_Before_2019_09_01', 'DateDiff_months']
X_train_gbcfs = X_train_r[features_gbc]
X_test_gbcfs = X_test_r[features_gbc]

In [None]:
binning_process_gbcfs = BinningProcess(features_gbc,
                                 categorical_variables=['Curr_Off_Len_null_ind', 'Ent_ind'])

In [None]:
gbcP_fs = Pipeline(steps=[('binning_process', binning_process_gbcfs),
                     ('scaling', StandardScaler()),
                     ('GBC', GradientBoostingClassifier())])

gbcP_fs.fit(X_train_gbcfs, y_train_sampled)

predict_gbc_fs = gbcP_fs.predict(X_test_gbcfs)
predict_proba_gbc_fs = gbcP_fs.predict_proba(X_test_gbcfs)

In [None]:
print("AUC score for the Gradient Boosted Classifier Pipeline with 5 features is " + str(sklearn.metrics.roc_auc_score(y_test, predict_proba_gbc_fs[:,1])))
print(confusion_matrix(y_test, predict_gbc_fs))
print("It has a lift of "+str(lift(predict_proba_gbc_fs, y_test)))

In [None]:
AUCs_fs = [sklearn.metrics.roc_auc_score(y_test, predict_proba_rfc_fs[:,1]), sklearn.metrics.roc_auc_score(y_test, tuned_predict_proba_lr_fs[:,1]), sklearn.metrics.roc_auc_score(y_test, tuned_predict_proba_dt_fs[:,1]), sklearn.metrics.roc_auc_score(y_test, predict_proba_gbc_fs[:,1])]
lifts_fs = [lift(predict_proba_rfc_fs, y_test), lift(tuned_predict_proba_lr_fs, y_test), lift(tuned_predict_proba_dt_fs, y_test), lift(predict_proba_gbc_fs, y_test)]

## Model, Feature and Sample Selection

In [None]:
plt.figure(figsize=(14, 7))
x_labels = ['Random Forest Classifier', 'Logistic Regression', 'Decision Tree', 'Gradient Boosting Classifier']

plt.subplot(1,2,1)
plt.xticks(rotation=90)
plt.scatter(x_labels, AUCs, color='r', alpha = 0.7)
plt.scatter(x_labels, AUCs_n, color='g', alpha = 0.7)
plt.scatter(x_labels, AUCs_fs, color='b', alpha = 0.7)
plt.legend(['Sampled','Unsampled', 'Sampled and Feature Selected'])
plt.xlabel('Model')
plt.ylabel('AUC')
plt.grid()
plt.title('AUC for each model')

plt.subplot(1,2,2)
plt.xticks(rotation=90)
plt.scatter(x_labels, lifts, color='r', alpha = 0.7)
plt.scatter(x_labels, lifts_n, color='g', alpha = 0.7)
plt.scatter(x_labels, lifts_fs, color='b', alpha = 0.7)
plt.legend(['Sampled','Unsampled', 'Sampled and Feature Selected'])
plt.xlabel('Model')
plt.ylabel('Lift')
plt.grid()
plt.title('Lift for each model')

plt.show()