In [260]:

# Importing libraries
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import roc_curve, auc

In [261]:
#Loading data
with open('uber_data_challenge.json') as data_file:
    retention_data = json.load(data_file)
retention_data = pd.DataFrame(retention_data)
retention_data.shape

(50000, 12)

In [248]:
#Exploring data 
retention_data.head(10)

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,uber_black_user,weekday_pct
0,3.67,5.0,4.7,1.1,King's Landing,2014-06-17,iPhone,2014-01-25,15.4,4,True,46.2
1,8.26,5.0,5.0,1.0,Astapor,2014-05-05,Android,2014-01-29,0.0,0,False,50.0
2,0.77,5.0,4.3,1.0,Astapor,2014-01-07,iPhone,2014-01-06,0.0,3,False,100.0
3,2.36,4.9,4.6,1.14,King's Landing,2014-06-29,iPhone,2014-01-10,20.0,9,True,80.0
4,3.13,4.9,4.4,1.19,Winterfell,2014-03-15,Android,2014-01-27,11.8,14,False,82.4
5,10.56,5.0,3.5,1.0,Winterfell,2014-06-06,iPhone,2014-01-09,0.0,2,True,100.0
6,3.95,4.0,,1.0,Astapor,2014-01-25,Android,2014-01-24,0.0,1,False,100.0
7,2.04,5.0,5.0,1.0,Winterfell,2014-01-29,iPhone,2014-01-28,0.0,2,False,100.0
8,4.36,5.0,4.5,1.0,Winterfell,2014-02-01,Android,2014-01-21,0.0,2,False,100.0
9,2.37,5.0,,1.0,Winterfell,2014-01-05,Android,2014-01-03,0.0,1,False,0.0


In [249]:
retention_data.to_csv('retention_data.csv', sep=',')

In [250]:
#Checking missing values
retention_data.isnull().sum()


avg_dist                     0
avg_rating_by_driver       201
avg_rating_of_driver      8122
avg_surge                    0
city                         0
last_trip_date               0
phone                      396
signup_date                  0
surge_pct                    0
trips_in_first_30_days       0
uber_black_user              0
weekday_pct                  0
dtype: int64

In [262]:
#Since for avg_rating by driver and trips in first 30 days the number of mising values is <1% i am deleting the rows with missing
#values in these columns. For the avg rating of driver variable, i am creating a new variable which denotes if value is missing. 
#Then i am filling avg rating of driver  with median.
retention_data['rating_of_driver_miss']=np.where(retention_data['avg_rating_of_driver']>-1, 0, 1)
retention_data['avg_rating_of_driver'].fillna(retention_data['avg_rating_of_driver'].median(), inplace = True)
#Since we filled the avg rating of driver field , we can remove rows with all oher na values
retention_data= retention_data.dropna().reset_index(drop = True)


In [263]:
#Creating day of week of signup from signup_date variable and then removing it


retention_data['signup_date']=pd.to_datetime(retention_data['signup_date'])
retention_data['weekday']=retention_data['signup_date'].dt.dayofweek
del retention_data['signup_date']
retention_data.dtypes

avg_dist                  float64
avg_rating_by_driver      float64
avg_rating_of_driver      float64
avg_surge                 float64
city                       object
last_trip_date             object
phone                      object
surge_pct                 float64
trips_in_first_30_days      int64
uber_black_user              bool
weekday_pct               float64
rating_of_driver_miss       int32
weekday                     int64
dtype: object

In [264]:

#Creating Response variable retention_ind. If last_trip_dt >= '2014-06-01' then retention_ind=1 else 0
retention_data['retention_ind']=retention_data.last_trip_date >='2014-06-01'
del retention_data['last_trip_date']
retention_data['retention_ind'].mean()

0.37718090920131159

In [265]:
#Pre-processing- Creating dummy variables for phone and city. 
#Creating  training and test datasets Using 80:20 split
#Sice we see that rating 5 has actually a negative impact on retention, we keep a separate indicator for that


from sklearn import preprocessing
le = preprocessing.LabelEncoder()
#retention_data['city'] = le.fit_transform(retention_data['city'])
#retention_data['phone'] = le.fit_transform(retention_data["phone"])#Dividing into training and test data
retention_data['city_num'] = np.where(retention_data['city']=='Astapor', 0,np.where(retention_data['city']=='Winterfell', 1, 2) )
retention_data['phone_num'] = retention_data.phone=='Android'#Dividing into training and test data
retention_data['Highest_rating_driver']=retention_data.avg_rating_of_driver >4.5
retention_data['Highest_rating_customer']=retention_data.avg_rating_by_driver >4.5
#retention_data['avg_rating_of_driver']=np.where(retention_data.avg_rating_of_driver >4.5,4.5,retention_data['avg_rating_of_driver'])
#retention_data['avg_rating_by_driver']=np.where(retention_data.avg_rating_by_driver >4,4.5,retention_data['avg_rating_by_driver'])
del retention_data['city']
del retention_data['phone']


from sklearn.cross_validation import train_test_split

y = retention_data["retention_ind"]
X = retention_data.drop('retention_ind', axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
X.dtypes




avg_dist                   float64
avg_rating_by_driver       float64
avg_rating_of_driver       float64
avg_surge                  float64
surge_pct                  float64
trips_in_first_30_days       int64
uber_black_user               bool
weekday_pct                float64
rating_of_driver_miss        int32
weekday                      int64
city_num                     int32
phone_num                     bool
Highest_rating_driver         bool
Highest_rating_customer       bool
dtype: object

In [266]:


# Using Logistic Regression

from sklearn.linear_model import LogisticRegression
from numpy import arange
C_array=arange(0.1, 1.1, 0.1)
# Initialize logistic regression model




In [267]:
# Cross validation for Logistic Regression
from sklearn.metrics import roc_auc_score
AUC_score_max=0
C_max=1
for C in C_array:
    Cross_validation_scores = cross_val_score(LogisticRegression(C=C), X_train, y_train, scoring='roc_auc', cv=10)
    AUC_score=Cross_validation_scores.mean()
    if AUC_score_max<AUC_score:
        C_max=C
        AUC_score_max=AUC_score
print(C_max)


#Since we had a retention rate of 38% in our data, we can always say retention_ind=0 and still get 62% accuracy.

0.2


In [268]:
# Calculating performance of logistic on test set
from sklearn.metrics import roc_auc_score
model_logistic=LogisticRegression(C=C_max)
model_logistic.fit(X_train, y_train)
AUC_max_train=roc_auc_score(y_train, model_logistic.predict(X_train))
AUC_max_test=roc_auc_score(y_test, model_logistic.predict(X_test))
print(AUC_max_test)
print(AUC_max_train)

0.673072032223
0.681680322576


In [274]:
#Finding Coefficients of Logistic Model
pd.DataFrame(list(zip(X_train.columns, np.transpose(model_logistic.coef_)))).sort([1], ascending=False)



  from ipykernel import kernelapp as app


Unnamed: 0,0,1
6,uber_black_user,[0.896555178036]
10,city_num,[0.843899111004]
13,Highest_rating_customer,[0.219593697025]
5,trips_in_first_30_days,[0.0974686620316]
2,avg_rating_of_driver,[0.086829094346]
9,weekday,[0.010264078414]
4,surge_pct,[0.00446053862299]
7,weekday_pct,[-1.84688806154e-06]
0,avg_dist,[-0.0340873200731]
12,Highest_rating_driver,[-0.186800083669]


In [93]:
# Checking out knn, SVC and Random Forests
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


# Initialize the three models
model_KNN = KNeighborsClassifier()
model_SVC = SVC(random_state=2)
model_RF = RandomForestClassifier(random_state=2)
model_KNN.fit(X_train, y_train)
model_SVC.fit(X_train, y_train)
model_RF.fit(X_train, y_train)

#Comparing AUC scores
print("KNN AUC score on test is:",roc_auc_score(y_test, model_KNN.predict(X_test)) ) 
print("SVC AUC score on test is:",roc_auc_score(y_test, model_SVC.predict(X_test)) ) 
print("RF AUC score on test is:",roc_auc_score(y_test, model_RF.predict(X_test)) ) 

KNN AUC score on test is: 0.694348523764
SVC AUC score on test is: 0.70808615125
RF AUC score on test is: 0.714526601561


In [271]:
#Carrying out Cross validation for finding out optimal parameters for Random forests
max_features_array=range(1, 13, 1)
max_depth_array=range(1,10,1)
AUC_score_max=0
max_features_max=10
max_depth_max=10
for max_depth in max_depth_array:
    for max_features in max_features_array:
        Cross_validation_scores = cross_val_score(RandomForestClassifier(max_features=max_features,max_depth=max_depth), X_train, y_train, scoring='roc_auc', cv=10)
        AUC_score=Cross_validation_scores.mean()
        if AUC_score_max<AUC_score:
            max_features_max=max_features
            max_depth_max=max_depth
            AUC_score_max=AUC_score
print("AUC score mean on crossvalidation",AUC_score_max)
print("Optimum depth of tree",max_depth)
print("Optimum max features",max_features)


AUC score mean on crossvalidation 0.849070210668
Optimum depth of tree 9
Optimum max features 12


In [272]:
#Finding out Random forest performance on test set

model_RF = RandomForestClassifier(max_features=max_features_max,max_depth=max_depth_max)
model_RF.fit(X_train, y_train)
AUC_max_train=roc_auc_score(y_train, model_RF.predict(X_train))
AUC_max_test=roc_auc_score(y_test, model_RF.predict(X_test))
print(AUC_max_train)
print(AUC_max_test)

0.778728575446
0.750939767099


In [239]:
#Finding out feature importaance 
pd.DataFrame(list(zip(X_train.columns, np.transpose(model_RF.feature_importances_)))).sort([1], ascending=False)


  if __name__ == '__main__':


Unnamed: 0,0,1
4,surge_pct,0.269352
7,weekday_pct,0.166123
11,city_num,0.155667
3,avg_surge,0.109735
12,phone_num,0.078822
6,uber_black_user,0.058704
0,avg_dist,0.054858
5,trips_in_first_30_days,0.048754
10,monthday,0.025955
9,weekday,0.010613
