## Setup

In [None]:
%matplotlib inline
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, roc_curve, f1_score
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectPercentile
from analytics_utils.util.qubole import run_hive_as_spark, run_spark
from analytics_utils.ml.preprocessing import StringToSparse
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import copy
import matplotlib.pyplot as plt
from matplotlib.pyplot import show
import numpy as np
import pandas as pd
import scipy.sparse as sp
import random
from sklearn.feature_selection import VarianceThreshold
import re
import time
#import xgboost as xgb
from __builtin__ import round
from IPython.display import display


import os

# NOTE(review): this runs after `%matplotlib inline`; 'agg' is a non-interactive
# backend, so confirm that figures still render inline as intended.
plt.switch_backend('agg')
plt.style.use("ggplot")

# The API token passed to the Qubole helpers is read from the environment so
# that it is never stored in the notebook. The token that used to be committed
# here should be treated as leaked and rotated.
core_consumer_token = os.environ.get('QUBOLE_API_TOKEN')
if not core_consumer_token:
    raise RuntimeError("Set the QUBOLE_API_TOKEN environment variable before running this notebook.")

# Cluster label and Spark command-line options used when submitting jobs.
ids_cluster = 'spark_default'
spark_cli = '--num-executors 80 --executor-cores 5  --executor-memory 18G --conf spark.rdd.compress=true'

## Read In Data

In [None]:
from analytics_utils.util.qubole import run_hive

# Pull a random sample of features.fancyfeast_julydata. `rand() < 0.5` keeps each
# row with probability 0.5 and is unseeded, so every run returns a different sample.
query = """
SELECT * FROM features.fancyfeast_julydata
WHERE rand() < 0.5;
"""

# Run the query and bind the result to `df`.
# NOTE(review): presumably `return_data=True` yields a pandas DataFrame (later cells
# use `df.columns` and `df.iloc`) — confirm against `run_hive`.
df = run_hive(query, job_name = "fancyfest_sample", return_data=True, token = core_consumer_token, cluster = ids_cluster)

In [None]:
# Standardize column names.
# NOTE(review): assumes the query returns exactly these four columns in this order;
# `SELECT *` does not pin the order, so confirm against the table schema.
df.columns = ['hhid', 'flg', 'app_day_count', 'app_event_count']

Data Summaries

In [None]:
# NOTE(review): the record count below is a hard-coded literal, not computed from `df`.
print "The total number of records is 2200216"
# Sum of `flg` as a percentage of the row count (assumes flg is a 0/1 indicator — TODO confirm).
print "The positive flg ratio in this subsample is " + str(100.0*sum(df.flg)/df.shape[0]) + "%"

In [None]:
# NOTE(review): the message says "test set", but the value is computed on the whole `df`
# (same expression as the previous cell); in the visible cells the train/test split
# only happens later.
print "The positive flg ratio in the test set is " + str(100.0*sum(df.flg)/df.shape[0]) + "%"

In [None]:
# Preview the rows at positions 1-9 (the slice starts at 1, so the first row is not shown).
df.iloc[1:10]

Replace Stray Colons (colons not followed by a digit)

In [None]:
# Feature columns: everything except the household id and the label,
# i.e. 'app_day_count' and 'app_event_count'.
cols = [col for col in df.columns if col not in ["hhid", "flg"]]
# Feature frame. `.loc` replaces the deprecated `.ix` indexer (removed in pandas 1.0);
# with a full row slice and a list of column labels the two select the same data.
X = df.reset_index(drop=True).loc[:, cols]
# Label vector, re-indexed the same way so that X and Y share row labels 0..n-1.
Y = df.reset_index(drop=True).flg

In [None]:
def semi_colon_remove(X, n_columns=2):
    """Replace every ':' that is not followed by a digit with '*'.

    Only a ':' directly followed by one of the characters 0-9 is left in place.
    A ':' at the very end of a string is also replaced (the previous
    implementation raised IndexError there).

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose leading columns hold strings.
    n_columns : int, optional
        Number of leading columns (by position) to clean. Defaults to 2, the
        number of columns the original implementation processed; frames with
        fewer columns are handled without error.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``X``. The input frame is not modified, so the cell
        can be re-run safely.
    """
    # ':' followed by anything other than an ASCII digit, including end of string.
    stray_colon = re.compile(r':(?![0-9])')

    cleaned = X.copy()
    for position in range(min(n_columns, cleaned.shape[1])):
        fixed = cleaned.iloc[:, position].map(lambda value: stray_colon.sub('*', value))
        # Assign by position and by raw values so duplicate column names or index
        # labels cannot cause misalignment.
        cleaned.iloc[:, position] = fixed.values
    return cleaned

In [None]:
# Run the colon cleanup on the feature frame and rebind X to the returned frame.
X = semi_colon_remove(X)

## Split And Transform Data (StringToSparse)

In [None]:
def split_transform(X, Y, testset_ratio, seed):
    """Split the data into train/test and encode each string column as a sparse matrix.

    Parameters
    ----------
    X : pandas.DataFrame
        One string column per feature group.
    Y : array-like
        Labels aligned with the rows of ``X``.
    testset_ratio : float
        Passed to ``train_test_split`` as ``test_size``.
    seed : int
        Passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train_sparse, X_test_sparse, Y_train, Y_test, colnames)``: two CSR
        matrices (the per-column encodings stacked horizontally), the labels as
        numpy arrays, and a 1-D array of feature names of the form
        ``<feature>_<source column>``.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=testset_ratio, random_state=seed)

    train_blocks = []
    test_blocks = []
    colnames = []
    for col in list(X):
        # NOTE(review): presumably parses "key:value" pairs separated by "," — confirm
        # against StringToSparse. The encoder is fitted on the training rows only and
        # then reused for the test rows.
        delim_vec = StringToSparse(delim_1 = ",", delim_2 = ":")
        # `.values` replaces the deprecated `.as_matrix()` (removed in pandas 1.0).
        train_blocks.append(delim_vec.fit_transform(X_train[col].values))
        test_blocks.append(delim_vec.transform(X_test[col].values))
        # extend (not append) keeps the name list flat. The previous nested list only
        # flattened correctly when every column produced the same number of features.
        colnames.extend(name + "_" + col for name in delim_vec.transformer.get_feature_names())

    X_train_sparse = sp.csr_matrix(sp.hstack(train_blocks))
    X_test_sparse = sp.csr_matrix(sp.hstack(test_blocks))
    return X_train_sparse, X_test_sparse, np.array(Y_train), np.array(Y_test), np.array(colnames)

In [None]:
# Hold out 30% of the rows for testing; the seed fixes the split.
testset_ratio=0.3
# `seed` is also read as a global by the stability-selection branch of feat_selection.
seed=820
X_train_sparse, X_test_sparse, Y_train, Y_test, colnames = split_transform(X, Y, testset_ratio=testset_ratio, seed=seed)

## Feature Selection

In [None]:
def feat_selection(X_train, X_test, Y_train, colnames, stability_selection = False):
    %matplotlib inline
    from sklearn.feature_selection import SelectKBest, chi2
    from sklearn.linear_model import RandomizedLogisticRegression
    from IPython.display import display
    import numpy as np
    import scipy as sp
    
    print "Total # of parameters: " + str(len(colnames))
    
    '''STEP1: Remove Features with 0 Variance'''
    print "STEP1: Remove Features with 0 Variance"
    selector1 = VarianceThreshold(threshold=0)
    X1 = selector1.fit_transform(X_train)
    colnames1 = np.array(colnames)[selector1.get_support()]
    print "Total # of parameters: " + str(len(colnames1))

    '''STEP2: Remove Sparse Features'''
    print "STEP2: Remove Sparse Features"
    occurence_list = sp.csr_matrix.getnnz(X1, axis=0)
    threshold = sum(occurence_list)/float(len(occurence_list))/10
    keep_index = np.where(occurence_list >= threshold)[0].tolist()
    X2 = X1[:, keep_index]
    colnames2 = colnames1[keep_index]
    print "Total #of parameters: "+ str(len(colnames2))
    
    '''STEP3: Filter Method: Univariate Selection using Chi2'''
    print "STEP3: Filter Method: Univariate Selection using Chi2"
    cutoff = (chi2(X1, Y_train)[1] < 0.05).sum()
    
    plt.plot(sorted(chi2(X1, Y_train)[1]))
    plt.axvline(x=cutoff)
    plt.show()
    
    selector3 = SelectKBest(score_func=chi2, k=cutoff)
    X3 = selector3.fit_transform(X2, Y_train)
    colnames3 = np.array(colnames2)[selector3.get_support()]    
    print "Total # of parameters: " + str(len(colnames3))    
    
    if(stability_selection == True):
        '''STEP4: Stability Selection using Randomized Logistic Regression'''
        print "STEP4: Stability Selection using Randomized Logistic Regression"
        selector4 = RandomizedLogisticRegression(selection_threshold=0.0, random_state=seed)
        selector4.fit(X3, Y_train)
        X4 = selector4.transform(X3.toarray())
        colnames4 = np.array(colnames3)[selector4.get_support()]    
        print "Features sorted by their score:"
        non_zero_features = selector4.scores_[selector4.scores_ != 0]
        print sorted(zip(map(lambda x: round(x, 4), non_zero_features), colnames4), reverse=True)
        print "Total # of parameters: " + str(len(colnames4)) 
    
    '''FINAL STEP: Transform X_test'''
    print "start test transformation"
    X_test1 = selector1.transform(X_test)
    print "finish 1st transformation"
    X_test2 = X_test1[:, keep_index]
    print "finish 2nd transformation"
    X_test3 = selector3.transform(X_test2)
    print "finish 3rd transformation"
    #X_test4 = selector4.transform(X_test3)
    #print "finish 4th transformation"
    #return X4, X_test4, colnames4
    return X3, X_test3, colnames3

In [None]:
from sklearn.feature_selection import chi2
# Chi-squared p-value of every training feature against the label.
pcorr = chi2(X_train_sparse, Y_train)[1]
# Percentage of training rows in which each feature is non-zero. Kept in column
# order (the previous `[::-1]` reversed it) so that entry i describes the same
# feature as pcorr[i] when the two are paired.
sparsity = sp.csr_matrix.getnnz(X_train_sparse, axis=0)/float(X_train_sparse.shape[0])*100.0

In [None]:
# Combine the `pcorr` and `sparsity` arrays from the previous cell into one frame, row by row.
twoway_df = pd.DataFrame({'pcorr': pcorr, 'sparsity': sparsity})

In [None]:
def feat_selection(X_train, X_test, Y_train, colnames, stability_selection = False):
    %matplotlib inline
    from sklearn.feature_selection import SelectKBest, chi2
    from sklearn.linear_model import RandomizedLogisticRegression
    from IPython.display import display
    
    print "Total # of parameters: " + str(len(colnames))
    
    '''STEP1: Remove Features with 0 Variance'''
    print "STEP1: Remove Features with 0 Variance"
    selector1 = VarianceThreshold(threshold=0)
    X1 = selector1.fit_transform(X_train)
    colnames1 = np.array(colnames)[selector1.get_support()]
    print "Total # of parameters: " + str(len(colnames1))

    '''STEP2: Filter Method: Univariate Selection using Chi2'''
    print "STEP2: Filter Method: Univariate Selection using Chi2"
    #cutoff = (chi2(X1, Y_train)[1] < 0.05).sum()
    
    #plt.plot(sorted(chi2(X1, Y_train)[1]))
    #plt.axvline(x=cutoff)
    #plt.show()
    
    chi2(X1, Y_train)[1]
    selector2 = SelectKBest(score_func=chi2, k=5)
    X2 = selector2.fit_transform(X1, Y_train)
    colnames2 = np.array(colnames1)[selector2.get_support()]    
    print "Total # of parameters: " + str(len(colnames2))    
    
    if(stability_selection == True):
        '''STEP3: Stability Selection using Randomized Logistic Regression'''
        print "STEP3: Stability Selection using Randomized Logistic Regression"
        selector3 = RandomizedLogisticRegression(selection_threshold=0.0, random_state=seed)
        selector3.fit(X2, Y_train)
        X3 = selector3.transform(X2.toarray())
        colnames3 = np.array(colnames2)[selector3.get_support()]    
        print "Features sorted by their score:"
        non_zero_features = selector3.scores_[selector3.scores_ != 0]
        print sorted(zip(map(lambda x: round(x, 4), non_zero_features), colnames3), reverse=True)
        print "Total # of parameters: " + str(len(colnames3)) 
    
    '''FINAL STEP: Transform X_test'''
    print "start test transformation"
    X_test1 = selector1.transform(X_test)
    print "finish 1st transformation"
    X_test2 = selector2.transform(X_test1)
    print "finish 2nd transformation"
    #X_test3 = selector3.transform(X_test2)
    #return X3, X_test3, colnames3
    return X2, X_test2, colnames2

## Model Training

Logistic Regression

In [None]:
## train models ##
import copy
from analytics_utils.ml.modeling import train_class
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
# Shuffled 5-fold CV, seeded with the notebook-level `seed` so that the fold
# assignment is the same on every run.
cv = KFold(n_splits=5, shuffle=True, random_state=seed)

## sparse models ##
# Hyperparameter grid: compare L1 and L2 regularisation.
grid = {
    "model__penalty":['l1','l2']
}

pipe = Pipeline([
    ('model',LogisticRegression(n_jobs=8))
])

# A deep copy is handed to train_class so that `pipe` itself stays unfitted and
# can be reused by later cells.
predictions, model_logistic = train_class(model=copy.deepcopy(pipe), grid=grid, train=X_train_sparse, target=Y_train,
                                          cv=cv)
# `predict_proba` (not `predict_prob`) is the method the later cells call on this
# model; column 1 of its output is scored against the test labels.
preds_logistic = model_logistic.predict_proba(X_test_sparse)
print(roc_auc_score(Y_test, preds_logistic[:, 1]))

In [None]:
# Test-set ROC AUC using column 1 of the predicted probabilities (presumably the positive class).
roc_auc_score(Y_test, preds_logistic[:, 1])

In [None]:
# Score the test set with the fitted logistic model; the AUC print below is disabled.
preds_logistic = model_logistic.predict_proba(X_test_sparse)
#print roc_auc_score(Y_test, preds_logistic[:, 1])

In [None]:
# Use All Features: refit the logistic pipeline on the full sparse training
# matrix and report the test-set ROC AUC.
predictions, model_logistic = train_class(
    model=copy.deepcopy(pipe),
    grid=grid,
    train=X_train_sparse,
    target=Y_train,
    cv=cv,
)
preds_logistic = model_logistic.predict_proba(X_test_sparse)
print(roc_auc_score(Y_test, preds_logistic[:, 1]))

SVM

In [None]:
from sklearn import svm
# NOTE(review): X_train_final / X_test_final are not assigned in the visible cells —
# confirm which feat_selection output they are meant to hold.
# The class is `LinearSVC` (capital L); `svm.linearSVC` does not exist.
model_svm = svm.LinearSVC()
model_svm.fit(X_train_final, Y_train)
# LinearSVC exposes no probability estimates. The signed distance to the separating
# hyperplane is a valid ranking score, which is all ROC AUC needs.
preds_svm = model_svm.decision_function(X_test_final)
print(roc_auc_score(Y_test, preds_svm))

Random Forest

In [None]:
# Random forest with n_jobs=2 and a fixed random_state; all other settings are left at their defaults.
# NOTE(review): X_train_final / X_test_final are not assigned in the visible cells —
# confirm where they come from.
model_random_forest = RandomForestClassifier(n_jobs=2, random_state=0)
model_random_forest.fit(X_train_final, Y_train)
preds_random_forest = model_random_forest.predict_proba(X_test_final)
# Last expression of the cell: test-set ROC AUC from column 1 of the predicted probabilities.
roc_auc_score(Y_test, preds_random_forest[:, 1])

XGBoost

In [None]:
# NOTE(review): `import xgboost as xgb` is commented out in the setup cell, so `xgb`
# is undefined here unless it is imported somewhere outside the visible cells.
dtrain = xgb.DMatrix(X_train_final, Y_train)
dtest = xgb.DMatrix(X_test_final, Y_test)

# Booster parameters: binary logistic objective, evaluated with AUC.
xgb_params = {"n_jobs":-1,
              "eval_metric": "auc",
              "objective": "binary:logistic",
              "eta": 0.1,
              "max_depth": 6,
              "min_child_weight": 10,
              "gamma": 0.7,
              "subsample": 0.7,
              "colsample_bytree" : 0.6,
              "alpha": 0.1,
              "lambda": 10,
              "max_delta_step": 1,
              "seed": 820}

# Both sets are passed as evaluation sets, with progress reported every 10 rounds.
# NOTE(review): early_stopping_rounds (1000) is larger than num_boost_round (100), so
# early stopping presumably never triggers; the watched set is also the test set —
# confirm both are intended.
eval_set  = [(dtrain, "train"), (dtest, "test")]
model_xgb = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100, evals=eval_set, verbose_eval=10, early_stopping_rounds = 1000)
preds_xgb = model_xgb.predict(dtest)

Evaluation Plot

In [None]:
# Full evaluation report for the random-forest scores (column 1 of predict_proba).
# NOTE(review): this call sits above the cells that define eval_plot and its helpers, so
# on a fresh kernel run top to bottom it presumably raises NameError.
eval_plot(label=Y_test, pred=preds_random_forest[:, 1], top_segments=1, segments=10)

In [None]:
def eval_plot(label, pred, top_segments, segments):
    """Produce the evaluation report for one set of predictions.

    Draws a histogram of ``pred`` and then calls ``auc_precision_recall``,
    ``lift_table`` and ``score_distribution`` in that order, printing a heading
    before each part. ``top_segments`` and ``segments`` are forwarded to
    ``lift_table`` unchanged.
    """
    # Histogram of the raw scores
    print("Prediction Distribution")
    plt.hist(pred)

    # ROC and precision-recall curves
    print("AUC & Precision Recall")
    auc_precision_recall(label, pred)

    # Precision per segment and cumulative recall
    print("Lift Table")
    lift_table(label, pred, top_segments, segments)

    # Score histograms split by label
    print("Score Distribution")
    score_distribution(label, pred)

## Model Evaluation

AUC & Precision Recall

In [None]:
def auc_precision_recall(label, pred):
    """Plot the ROC curve and the precision-recall curve side by side.

    The left panel shows the ROC curve with the AUC in its legend and a dashed
    diagonal for random predictions. The right panel shows the precision-recall
    curve with the average precision in its title. The average precision is
    also printed.

    Parameters
    ----------
    label : array-like of true binary labels.
    pred : array-like of scores for the positive class.
    """
    from sklearn.metrics import average_precision_score, precision_recall_curve

    figure, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(12, 4.5))

    # --- ROC panel ---
    area = roc_auc_score(label, pred)
    false_pos_rate, true_pos_rate, _ = roc_curve(label, pred)
    roc_ax.plot(false_pos_rate, true_pos_rate, label="ROC, auc="+str(area))
    roc_ax.plot([0, 1], [0, 1], 'k--')  # diagonal = random predictions
    roc_ax.axis(xmin = 0.0, xmax = 1.0, ymin = 0.0, ymax = 1.0)
    roc_ax.set_xlabel('False Positive Rate or (1 - Specifity)')
    roc_ax.set_ylabel('True Positive Rate or (Sensitivity)')
    roc_ax.set_title('Receiver Operating Characteristic')
    roc_ax.legend(loc="lower right")

    # --- Precision-recall panel ---
    avg_prec = average_precision_score(label, pred)
    print('Average precision-recall score: {0:0.2f}'.format(avg_prec))
    prec, rec, _ = precision_recall_curve(label, pred)
    pr_ax.step(rec, prec, color='b', alpha=0.2, where='post')
    pr_ax.fill_between(rec, prec, step='post', alpha=0.2, color='b')
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.axis(xmin = 0.0, xmax = 1.0, ymin = 0.0, ymax = 1.05)
    pr_ax.set_title('2-class Precision-Recall curve: AP={0:0.2f}'.format(avg_prec))

    figure.tight_layout()
    plt.show()

Top Precision & Lift Table & Cumulative Recall Table 

In [None]:
def lift_table(label, pred, top_segments, segments):
    """Print top-segment precision and plot per-segment precision and cumulative recall.

    Rows are ranked by ``pred`` (highest first). The top ``top_segments``
    fraction of rows is kept and cut into ``segments`` equally sized buckets;
    rows left over after the last full bucket are not assigned to any bucket.

    Parameters
    ----------
    label : array-like of 0/1 labels.
    pred : array-like of scores, aligned with ``label``.
    top_segments : float
        Fraction of the ranked rows to analyse (1 means all rows). The previous
        implementation always used the top 20% regardless of this argument,
        while the printed message and the x-axis were already driven by it.
    segments : int
        Number of buckets.

    Raises
    ------
    ValueError
        If the selected slice has fewer rows than ``segments``.
    """
    scored = pd.DataFrame({"label": label, "pred": pred})
    ranked = scored.sort_values('pred', ascending = False).reset_index(drop=True)

    n_top = int(round(top_segments*len(ranked)))
    # Integer bucket size, so it can be used directly for positional slicing.
    segment_size = n_top // segments
    if segment_size < 1:
        raise ValueError("lift_table needs at least one row per segment "
                         "(%d rows selected for %d segments)" % (n_top, segments))
    top = ranked.iloc[:n_top]

    # Positives per bucket, best-scored bucket first.
    top_labels = top['label'].values
    pos_label = [top_labels[start:start + segment_size].sum()
                 for start in range(0, segment_size*segments, segment_size)]

    segment_percentile = np.linspace(0, top_segments, segments+1)*100
    segment_percentile_1 = segment_percentile[1:]    # right edge of every bucket
    segment_percentile_2 = segment_percentile[:-1]   # left edge of every bucket

    # Top Precision
    top_precision = top['label'].mean()
    print("The top" + "{:3.0f}%".format(top_segments*100) + " segments has" + "{:3.0f}%".format(top_precision*100) + " precision.")

    # Precision per bucket, compared against the positive rate of the whole population.
    total_positives = float(scored['label'].sum())
    segment_precision = [round(float(i)/segment_size, 3) for i in pos_label]
    baseline = round(total_positives/len(scored), 3)

    # Cumulative recall: share of all positives captured up to each bucket.
    cumulative_recall = [round(i, 3) for i in np.array(pos_label).cumsum()/total_positives]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4.5))
    ax1.plot(segment_percentile_1, segment_precision, 'b-', alpha = 0.5)
    ax1.axhline(y=baseline, c="red", linewidth=0.5, zorder=0, label = "Baseline")
    vals = ax1.get_yticks()
    ax1.set_yticklabels(['{:3.0f}%'.format(x*100) for x in vals])
    ax1.set_xlabel('Segment Percentile')
    ax1.set_ylabel('Segment Precision')
    ax1.set_title('Proportion of Buyers by Segments')
    ax1.legend()

    width = segment_percentile[1]-segment_percentile[0]
    ax2.bar(segment_percentile_2, cumulative_recall, color = 'red', alpha = 0.5, width=width, align='edge')
    vals2 = ax2.get_yticks()
    ax2.set_yticklabels(['{:3.0f}%'.format(x*100) for x in vals2])
    ax2.set_xlabel('Segment Percentile')
    ax2.set_ylabel('Cumulative Recall')
    ax2.set_title('Cumulative Proportion of Buyers')

    fig.tight_layout()
    plt.show()

Score Distribution

In [None]:
def _bucket_percentages(scores, edges):
    """Return, per edge, the percentage of ``scores`` whose bucket ends at that edge."""
    scores = np.asarray(scores, dtype=float)
    if scores.size == 0:
        # No rows for this class: draw a flat zero line instead of dividing by zero.
        return np.zeros(len(edges))
    # side='left' matches bisect.bisect_left: a score is assigned to the first edge
    # that is >= the score.
    positions = np.searchsorted(edges, scores, side='left')
    # Scores above the last edge are counted in the last bucket (the old lookup
    # raised IndexError for them).
    positions = np.clip(positions, 0, len(edges) - 1)
    counts = np.bincount(positions, minlength=len(edges))
    return counts / float(scores.size) * 100


def score_distribution(label, pred):
    """Plot the score distribution of buyers (label 1) and non-buyers (label 0).

    Scores are bucketed on a 0.00-1.00 grid with step 0.01. For each class, the
    percentage of that class falling into each bucket is drawn as a line.

    Parameters
    ----------
    label : array-like of 0/1 labels.
    pred : array-like of scores, aligned with ``label``.
    """
    # 101 edges 0.00, 0.01, ..., 1.00; replaces pylab.frange(0, 1, 0.01), which
    # newer matplotlib releases no longer provide.
    edges = np.arange(101) * 0.01
    labels = np.asarray(label)
    scores = np.asarray(pred)

    buyers_pct = _bucket_percentages(scores[labels == 1], edges)
    non_buyers_pct = _bucket_percentages(scores[labels == 0], edges)

    fig, ax = plt.subplots()
    ax.plot(edges, buyers_pct, color = 'red', alpha = 0.5)
    ax.plot(edges, non_buyers_pct, color = 'blue', alpha = 0.5)
    vals = ax.get_yticks()
    ax.set_yticklabels(['{:3.0f}%'.format(x) for x in vals])
    ax.legend(['Buyers', 'Non-Buyers'])
    ax.set_xlabel('Score')
    ax.set_ylabel('Frequency')
    ax.set_title('Score Distribution')

    plt.show()

Overall Evaluation Plot

In [None]:
def eval_plot(label, pred, top_segments, segments):
    """Produce the four-part evaluation report for one set of predictions.

    Draws a histogram of ``pred`` and then calls ``auc_precision_recall``,
    ``lift_table`` and ``score_distribution`` in that order, printing a numbered
    heading before each part. ``top_segments`` and ``segments`` are forwarded to
    ``lift_table`` unchanged.
    """
    # 1. Histogram of the raw scores
    print("Metric 1 : Prediction Distribution")
    plt.hist(pred)

    # 2. ROC and precision-recall curves
    print("Metric 2 : AUC & Precision Recall")
    auc_precision_recall(label, pred)

    # 3. Precision per segment and cumulative recall
    print("Metric 3 : Lift Table")
    lift_table(label, pred, top_segments, segments)

    # 4. Score histograms split by label
    print("Metric 4 : Score Distribution")
    score_distribution(label, pred)