In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import scipy.stats as scs
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates

# model imports
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import AdaBoostRegressor
# from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.metrics import mean_squared_error, r2_score
# from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier as KNN

import xgboost as xgb

# imports from starter code by SRK
import os
import sys
import operator
from scipy import sparse
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

### Import data

In [2]:
data_path = "../data/"
train_file = data_path + "churn_train.csv"
test_file = data_path + "churn_test.csv"
train_df = pd.read_csv(train_file, parse_dates=['last_trip_date', 'signup_date'], infer_datetime_format=True)
test_df = pd.read_csv(test_file, parse_dates=['last_trip_date', 'signup_date'], infer_datetime_format=True)

today =pd.to_datetime('2014-07-01')
train_df['days_since_trip'] =  (today - train_df['last_trip_date']).dt.days
test_df['days_since_trip'] =  (today - test_df['last_trip_date']).dt.days

y_train_continuous = train_df['days_since_trip'].values
y_test_continuous = test_df['days_since_trip'].values

y_train_binary = (train_df['days_since_trip'].values > 30) * 1
y_test_binary = (test_df['days_since_trip'].values > 30) * 1


In [3]:
# Let us extract some features like year, month, day, hour from date columns #
train_df["signup_year"] = train_df["signup_date"].dt.year
test_df["signup_year"] = test_df["signup_date"].dt.year
train_df["signup_month"] = train_df["signup_date"].dt.month
test_df["signup_month"] = test_df["signup_date"].dt.month
train_df["signup_day"] = train_df["signup_date"].dt.day
test_df["signup_day"] = test_df["signup_date"].dt.day
train_df["signup_day_of_week"] = train_df["signup_date"].dt.dayofweek
test_df["signup_day_of_week"] = test_df["signup_date"].dt.dayofweek

train_df.drop(['last_trip_date','days_since_trip','signup_date'], axis=1, inplace=True)
test_df.drop(['last_trip_date','days_since_trip','signup_date'], axis=1, inplace=True)

print(train_df.shape)
print(test_df.shape)
print(y_train_binary.shape)
print(y_test_binary.shape)

(40000, 14)
(10000, 14)
(40000,)
(10000,)


In [4]:
# what's the churn rate?
sum(y_train_binary) / len(y_train_binary)

0.62419999999999998

### Which columns have nulls?

In [5]:
has_nulls = []
for column in train_df.columns:
    if train_df[column].isnull().sum() > 0:
        has_nulls.append(column)
        print("column {} has {} nulls".format(column, train_df[column].isnull().sum()))
        print(train_df[column].describe())
        print("")

column avg_rating_by_driver has 162 nulls
count    39838.000000
mean         4.777434
std          0.448088
min          1.000000
25%               NaN
50%               NaN
75%               NaN
max          5.000000
Name: avg_rating_by_driver, dtype: float64

column avg_rating_of_driver has 6528 nulls
count    33472.000000
mean         4.601697
std          0.614810
min          1.000000
25%               NaN
50%               NaN
75%               NaN
max          5.000000
Name: avg_rating_of_driver, dtype: float64

column phone has 319 nulls
count      39681
unique         2
top       iPhone
freq       27628
Name: phone, dtype: object





In [6]:
has_nulls = []
for column in test_df.columns:
    if test_df[column].isnull().sum() > 0:
        has_nulls.append(column)
        print("column {} has {} nulls".format(column, test_df[column].isnull().sum()))
        print(test_df[column].describe())
        print("")
has_nulls

column avg_rating_by_driver has 39 nulls
count    9961.000000
mean        4.781056
std         0.440871
min         1.000000
25%              NaN
50%              NaN
75%              NaN
max         5.000000
Name: avg_rating_by_driver, dtype: float64

column avg_rating_of_driver has 1594 nulls
count    8406.000000
mean        4.601011
std         0.627343
min         1.000000
25%              NaN
50%              NaN
75%              NaN
max         5.000000
Name: avg_rating_of_driver, dtype: float64

column phone has 77 nulls
count       9923
unique         2
top       iPhone
freq        6954
Name: phone, dtype: object





['avg_rating_by_driver', 'avg_rating_of_driver', 'phone']

In [7]:
# Replacing nulls with means and adding their null status as a column
for column in ['avg_rating_by_driver', 'avg_rating_of_driver']:
    train_df[column+'_null'] = train_df[column].isnull()*1
    train_df[column] = train_df[column].fillna(train_df[column].mean())
    test_df[column+'_null'] = test_df[column].isnull()*1
    test_df[column] = test_df[column].fillna(test_df[column].mean())

# preserving the columns with missing phone info
train_df['phone'] = train_df['phone'].fillna('Other')
test_df['phone'] = test_df['phone'].fillna('Other')

# changing to int 
train_df['luxury_car_user'] = train_df['luxury_car_user']*1
test_df['luxury_car_user'] = test_df['luxury_car_user']*1

In [8]:
train_df.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,phone,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct,signup_year,signup_month,signup_day,signup_day_of_week,avg_rating_by_driver_null,avg_rating_of_driver_null
0,6.94,5.0,5.0,1.0,Astapor,Android,0.0,0,0,100.0,2014,1,12,6,0,0
1,8.06,5.0,5.0,1.0,Astapor,Android,0.0,2,1,0.0,2014,1,25,5,0,0
2,21.5,4.0,4.601697,1.0,Winterfell,iPhone,0.0,1,1,100.0,2014,1,2,3,0,1
3,9.46,5.0,4.601697,2.75,Winterfell,Android,100.0,1,0,100.0,2014,1,9,3,0,1
4,13.77,5.0,4.601697,1.0,Winterfell,iPhone,0.0,0,0,100.0,2014,1,31,4,0,1


In [9]:
# Changing categorical to dummies
obj_cols = [column for column in train_df.columns if train_df[column].dtype == 'O']
for column in obj_cols:
    dummies_df = pd.get_dummies(train_df[column])
    train_df = pd.concat([train_df, dummies_df], axis=1)
    dummies_df = pd.get_dummies(test_df[column])
    test_df = pd.concat([test_df, dummies_df], axis=1)

In [10]:
train_df.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,phone,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct,...,signup_day,signup_day_of_week,avg_rating_by_driver_null,avg_rating_of_driver_null,Astapor,King's Landing,Winterfell,Android,Other,iPhone
0,6.94,5.0,5.0,1.0,Astapor,Android,0.0,0,0,100.0,...,12,6,0,0,1.0,0.0,0.0,1.0,0.0,0.0
1,8.06,5.0,5.0,1.0,Astapor,Android,0.0,2,1,0.0,...,25,5,0,0,1.0,0.0,0.0,1.0,0.0,0.0
2,21.5,4.0,4.601697,1.0,Winterfell,iPhone,0.0,1,1,100.0,...,2,3,0,1,0.0,0.0,1.0,0.0,0.0,1.0
3,9.46,5.0,4.601697,2.75,Winterfell,Android,100.0,1,0,100.0,...,9,3,0,1,0.0,0.0,1.0,1.0,0.0,0.0
4,13.77,5.0,4.601697,1.0,Winterfell,iPhone,0.0,0,0,100.0,...,31,4,0,1,0.0,0.0,1.0,0.0,0.0,1.0


In [11]:
train_df.drop(['city','phone'], axis=1, inplace=True)
test_df.drop(['city','phone'], axis=1, inplace=True)

In [12]:
test_df.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct,signup_year,signup_month,signup_day,signup_day_of_week,avg_rating_by_driver_null,avg_rating_of_driver_null,Astapor,King's Landing,Winterfell,Android,Other,iPhone
0,2.48,5.0,5.0,1.0,0.0,2,1,100.0,2014,1,6,0,0,0,0.0,0.0,1.0,1.0,0.0,0.0
1,10.81,5.0,5.0,1.0,0.0,3,1,100.0,2014,1,6,0,0,0,0.0,0.0,1.0,0.0,0.0,1.0
2,12.95,5.0,5.0,1.0,0.0,1,1,100.0,2014,1,19,6,0,0,1.0,0.0,0.0,1.0,0.0,0.0
3,3.92,5.0,4.601011,1.0,0.0,0,0,0.0,2014,1,9,3,0,1,0.0,0.0,1.0,0.0,0.0,1.0
4,1.46,5.0,4.5,1.0,0.0,2,0,100.0,2014,1,7,1,0,0,1.0,0.0,0.0,0.0,0.0,1.0


In [13]:
train_df.describe()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct,signup_year,signup_month,signup_day,signup_day_of_week,avg_rating_by_driver_null,avg_rating_of_driver_null,Astapor,King's Landing,Winterfell,Android,Other,iPhone
count,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0
mean,5.791302,4.777434,4.601697,1.074956,8.857342,2.2807,0.37615,60.874382,2014.0,1.0,16.5395,3.31095,0.00405,0.1632,0.3309,0.2036,0.4655,0.301325,0.007975,0.6907
std,5.708056,0.44718,0.562407,0.222427,20.014008,3.811289,0.484424,37.089619,0.0,0.0,8.749512,1.87976,0.063511,0.369553,0.470543,0.40268,0.498815,0.458839,0.088947,0.462211
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2014.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.42,4.7,4.5,1.0,0.0,0.0,0.0,33.3,2014.0,1.0,9.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.88,5.0,4.7,1.0,0.0,1.0,0.0,66.7,2014.0,1.0,17.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,6.93,5.0,5.0,1.05,8.3,3.0,1.0,100.0,2014.0,1.0,24.0,5.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
max,160.96,5.0,5.0,8.0,100.0,125.0,1.0,100.0,2014.0,1.0,31.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


##### Data is all numerical now.  We are ready for some model building.
### Cross Validation - Binary Classifier

In [14]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.05
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 2
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['nthread'] = 4
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=False)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [15]:
#Testing with XGB
X_train = train_df.values
X_test = test_df.values
y_train = y_train_binary
y_test = y_test_binary

kf = model_selection.KFold(n_splits=10)

#dev is the current fold's train, and val is current fold's validation
cv_scores_xgb = []
for dev_index, val_index in kf.split(range(X_train.shape[0])):    # excluded since we are not stratefying ,train_y):
        print('.', end='', flush=True)
        dev_X, val_X = X_train[dev_index,:], X_train[val_index,:]
        dev_y, val_y = y_train[dev_index], y_train[val_index]
        preds, model = runXGB(dev_X, dev_y, val_X, val_y)
        cv_scores_xgb.append([log_loss(val_y, preds), accuracy_score(val_y, (preds[:,1] > 0.5)*1)])
cv_scores_xgb = np.array(cv_scores_xgb)
print("")
print("XGB mean score logloss {}, accuracy {}".format(np.mean(cv_scores_xgb[:,0]), np.mean(cv_scores_xgb[:,1])))

..........
XGB mean score logloss 0.445113198807626, accuracy 0.7930249999999999


In [16]:
test_df.shape

(10000, 20)

In [22]:
# tryin with random forest

# cv_scores_rf = []
# for dev_index, val_index in kf.split(range(X_train.shape[0])):    # excluded since we are not stratefying ,train_y):
#         print('.', end='', flush=True)
#         dev_X, val_X = X_train[dev_index,:], X_train[val_index,:]
#         dev_y, val_y = y_train[dev_index], y_train[val_index]
#         model = RF(dev_X, dev_y, val_X, val_y)
#         cv_scores_rf.append([log_loss(val_y, preds), accuracy_score(val_y, (preds[:,1] > 0.5)*1)])
# cv_scores_rf = np.array(cv_scores_xgb)
# print("")
# print("RF mean score logloss {}, accuracy {}".format(np.mean(cv_scores_rf[:,0]), np.mean(cv_scores_rf[:,1])))

### Creating final model and prediction

In [18]:
def standard_confusion_matrix(y_true, y_predict):
    # requires y_true and y_predict to be np arrays
    # returns the 
    tp = np.sum((y_true == 1) & (y_predict == 1))
    fp = np.sum((y_true == 0) & (y_predict == 1))
    fn = np.sum((y_true == 1) & (y_predict == 0))
    tn = np.sum((y_true == 0) & (y_predict == 0))
    return np.array([[tp,fp],[fn,tn]])

In [19]:
preds, model = runXGB(X_train, y_train, X_test, num_rounds=1000)
pred = (preds[:,1] > 0.5)*1

In [20]:
standard_confusion_matrix(y_test, pred)

array([[5353, 1277],
       [ 875, 2495]])

In [21]:
accuracy_score(y_test,pred)

0.78480000000000005