In [1]:
import pandas as pd
import numpy as np
import sklearn
import datetime

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb

from sklearn import metrics
from sklearn.metrics import accuracy_score, roc_auc_score, auc

In [10]:
equity_backup = equity_data

In [2]:
equity_data = pd.read_csv('equity_value_data.csv', header=0, engine='python')

In [4]:
equity_data.shape

(1119158, 3)

In [3]:
equity_data['timestamp'] = pd.to_datetime(equity_data['timestamp'], format = "%Y-%m-%d")

In [4]:
equity_data.head()

Unnamed: 0,timestamp,close_equity,user_id
0,2016-11-16,48.16,bcef4fa9b0bdf22bcf7deae708decf03
1,2016-11-17,48.16,bcef4fa9b0bdf22bcf7deae708decf03
2,2016-11-18,48.16,bcef4fa9b0bdf22bcf7deae708decf03
3,2016-11-21,48.16,bcef4fa9b0bdf22bcf7deae708decf03
4,2016-11-22,48.16,bcef4fa9b0bdf22bcf7deae708decf03


In [5]:
equity_data = equity_data.sort_values(['user_id', 'timestamp'], axis=0)

In [6]:
equity_data['date_diff'] = equity_data.groupby('user_id')['timestamp'].diff()
equity_data.head()

Unnamed: 0,timestamp,close_equity,user_id,date_diff
505994,2016-08-18,1211.6055,0012db34aa7b083f5714e7831195e54d,NaT
505995,2016-08-19,1173.564,0012db34aa7b083f5714e7831195e54d,1 days
505996,2016-08-22,1253.0597,0012db34aa7b083f5714e7831195e54d,3 days
505997,2016-08-23,1252.905,0012db34aa7b083f5714e7831195e54d,1 days
505998,2016-08-24,1262.136,0012db34aa7b083f5714e7831195e54d,1 days


In [7]:
equity_data1 = equity_data

In [8]:
equity_data['churn'] = (equity_data['date_diff'] > datetime.timedelta(days = 28))*1

In [9]:
# remove duplicate user_id in churned users
churned = list(equity_data[equity_data['churn'] == 1]['user_id'].unique())

In [10]:
total_users = list(equity_data['user_id'].unique())
percent_churned = (len(churned)/len(total_users))*100
print('Percentage churned = {}%'.format(percent_churned))

Percentage churned = 4.924785100286533%


In [11]:
len(total_users)

5584

In [12]:
user_df = equity_data[['user_id', 'churn']]

In [13]:
print(user_df.shape)
user_df = user_df.drop_duplicates()
print(user_df.shape)

(1119158, 2)
(5859, 2)


In [14]:
user_df = user_df.groupby(['user_id'], sort=False)['churn'].max().reset_index()
user_df.head()

Unnamed: 0,user_id,churn
0,0012db34aa7b083f5714e7831195e54d,0
1,001d6c77dbdb3213cead7673f250bfdc,0
2,002e4653171ddc61c3cd30603cd7bd3e,0
3,00384fa9be6fdca1b786bae70d78f88f,0
4,0042aac295a0d4df88f4b83012778bd4,0


In [15]:
user_df.shape

(5584, 2)

In [42]:
features = pd.read_csv('features_data.csv', header = 0, engine = 'python')

In [43]:
features.head()

Unnamed: 0,risk_tolerance,investment_experience,liquidity_needs,platform,time_spent,instrument_type_first_traded,first_deposit_amount,time_horizon,user_id
0,high_risk_tolerance,limited_investment_exp,very_important_liq_need,Android,33.129417,stock,40.0,med_time_horizon,895044c23edc821881e87da749c01034
1,med_risk_tolerance,limited_investment_exp,very_important_liq_need,Android,16.573517,stock,200.0,short_time_horizon,458b1d95441ced242949deefe8e4b638
2,med_risk_tolerance,limited_investment_exp,very_important_liq_need,iOS,10.008367,stock,25.0,long_time_horizon,c7936f653d293479e034865db9bb932f
3,med_risk_tolerance,limited_investment_exp,very_important_liq_need,Android,1.031633,stock,100.0,short_time_horizon,b255d4bd6c9ba194d3a350b3e76c6393
4,high_risk_tolerance,limited_investment_exp,very_important_liq_need,Android,8.18725,stock,20.0,long_time_horizon,4a168225e89375b8de605cbc0977ae91


In [44]:
features.shape

(5584, 9)

In [45]:
features.dtypes

risk_tolerance                   object
investment_experience            object
liquidity_needs                  object
platform                         object
time_spent                      float64
instrument_type_first_traded     object
first_deposit_amount            float64
time_horizon                     object
user_id                          object
dtype: object

In [46]:
features.describe()

Unnamed: 0,time_spent,first_deposit_amount
count,5584.0,5584.0
mean,34.509706,633.566805
std,155.080551,2118.323263
min,0.0,0.0
25%,2.848908,50.0
50%,13.474708,100.0
75%,33.823829,500.0
max,8788.32945,50000.0


In [21]:
# check if any missing values in the dataset
features.isnull().sum().sum()

0

In [22]:
features.groupby('platform').count()

Unnamed: 0_level_0,risk_tolerance,investment_experience,liquidity_needs,time_spent,instrument_type_first_traded,first_deposit_amount,time_horizon,user_id
platform,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Android,1529,1529,1529,1529,1529,1529,1529,1529
both,505,505,505,505,505,505,505,505
iOS,3550,3550,3550,3550,3550,3550,3550,3550


In [23]:
categorical_cols = [ 'risk_tolerance', 'time_horizon', 'investment_experience', 'liquidity_needs', 'platform', 'instrument_type_first_traded']
features = pd.get_dummies(features,  columns = categorical_cols)

In [24]:
features.head()

Unnamed: 0,time_spent,first_deposit_amount,user_id,risk_tolerance_high_risk_tolerance,risk_tolerance_low_risk_tolerance,risk_tolerance_med_risk_tolerance,time_horizon_long_time_horizon,time_horizon_med_time_horizon,time_horizon_short_time_horizon,investment_experience_extensive_investment_exp,...,instrument_type_first_traded_adr,instrument_type_first_traded_cef,instrument_type_first_traded_etp,instrument_type_first_traded_lp,instrument_type_first_traded_mlp,instrument_type_first_traded_reit,instrument_type_first_traded_rlt,instrument_type_first_traded_stock,instrument_type_first_traded_tracking,instrument_type_first_traded_wrt
0,33.129417,40.0,895044c23edc821881e87da749c01034,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,16.573517,200.0,458b1d95441ced242949deefe8e4b638,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,10.008367,25.0,c7936f653d293479e034865db9bb932f,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1.031633,100.0,b255d4bd6c9ba194d3a350b3e76c6393,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,8.18725,20.0,4a168225e89375b8de605cbc0977ae91,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [25]:
print(features.shape)

(5584, 30)


In [26]:
features = pd.merge(features, user_df, on = 'user_id', how = 'inner')
print(features.shape)

(5584, 31)


In [27]:
features["churn"] = features["churn"].astype(int)
Y = features["churn"].values
X = features.drop(labels = ["churn", 'user_id'],axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=747)

In [28]:
model = LogisticRegression(solver='liblinear')
result = model.fit(X_train, y_train)

In [29]:
lr_pred = model.predict(X_test)
# Print the prediction accuracy
print (metrics.accuracy_score(y_test, lr_pred))

0.9534883720930233


In [30]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, lr_pred)
metrics.auc(fpr, tpr)

0.5

In [116]:
sum(y_test)

62

In [31]:
# f1 score
print(metrics.f1_score(y_test, lr_pred))

# recall score
print(metrics.recall_score(y_test, lr_pred))

0.0
0.0


  'precision', 'predicted', average, warn_for)


In [32]:
clf = LogisticRegression(class_weight='balanced') #class-weight = 'balanced' - trade-off with precision
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print("accuracy_score",accuracy_score(y_test, predictions))
print("auc",roc_auc_score(y_test, predictions))
lr = [clf.__class__,accuracy_score(y_test, predictions),roc_auc_score(y_test, predictions)]

accuracy_score 0.6386404293381037
auc 0.5544090056285179




In [38]:
clf_RF = RandomForestClassifier(n_estimators=20)
clf_RF.fit(X_train, y_train)
predictions_RF = clf_RF.predict(X_test)
print("accuracy_score",accuracy_score(y_test, predictions_RF))
print("auc",roc_auc_score(y_test, predictions_RF))
rf = [clf_RF.__class__,accuracy_score(y_test, predictions_RF),roc_auc_score(y_test, predictions_RF)]

accuracy_score 0.9445438282647585
auc 0.49530956848030017


In [39]:
clf_AdBoost = AdaBoostClassifier(n_estimators=5)
clf_AdBoost.fit(X_train, y_train)
predictions_AdBoost = clf_AdBoost.predict(X_test)
print("accuracy_score",accuracy_score(y_test, predictions_AdBoost))
print("auc",roc_auc_score(y_test, predictions_AdBoost))
rf = [clf_RF.__class__,accuracy_score(y_test, predictions_AdBoost),roc_auc_score(y_test, predictions_AdBoost)]

accuracy_score 0.9284436493738819
auc 0.5417448405253283


In [103]:
# To get the weights of all the variables
weights = pd.Series(model.coef_[0], index=X.columns.values)
weights.sort_values(ascending = False)

risk_tolerance_low_risk_tolerance                 0.349263
instrument_type_first_traded_lp                   0.309690
instrument_type_first_traded_wrt                  0.303845
instrument_type_first_traded_0                    0.135006
instrument_type_first_traded_cef                  0.084164
platform_both                                     0.067579
investment_experience_limited_investment_exp      0.053049
first_deposit_amount                             -0.000128
time_spent                                       -0.000171
instrument_type_first_traded_tracking            -0.047216
investment_experience_no_investment_exp          -0.054104
liquidity_needs_not_important_liq_need           -0.073200
instrument_type_first_traded_mlp                 -0.157848
instrument_type_first_traded_etp                 -0.159124
instrument_type_first_traded_rlt                 -0.173902
investment_experience_good_investment_exp        -0.220510
time_horizon_short_time_horizon                  -0.2267

In [132]:
X_train.shape

(4467, 29)

In [138]:
sum(y_train)

213

In [140]:
(4467-213)/4467

0.9523169912693082

In [151]:
X.columns

Index(['time_spent', 'first_deposit_amount',
       'risk_tolerance_high_risk_tolerance',
       'risk_tolerance_low_risk_tolerance',
       'risk_tolerance_med_risk_tolerance', 'time_horizon_long_time_horizon',
       'time_horizon_med_time_horizon', 'time_horizon_short_time_horizon',
       'investment_experience_extensive_investment_exp',
       'investment_experience_good_investment_exp',
       'investment_experience_limited_investment_exp',
       'investment_experience_no_investment_exp',
       'liquidity_needs_not_important_liq_need',
       'liquidity_needs_somewhat_important_liq_need',
       'liquidity_needs_very_important_liq_need', 'platform_Android',
       'platform_both', 'platform_iOS', 'instrument_type_first_traded_0',
       'instrument_type_first_traded_adr', 'instrument_type_first_traded_cef',
       'instrument_type_first_traded_etp', 'instrument_type_first_traded_lp',
       'instrument_type_first_traded_mlp', 'instrument_type_first_traded_reit',
       'instrum

In [None]:
Class_weight - tradeoff with precision
To Do: upsampling/SMOTE

In [34]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=747, ratio=1.0)
X_train, y_train = sm.fit_sample(X_train, y_train)

In [40]:
smote = LogisticRegression().fit(X_train, y_train)

smote_pred = smote.predict(X_test)

print(accuracy_score(y_test, smote_pred))

print(metrics.f1_score(y_test, smote_pred))

print(metrics.recall_score(y_test, smote_pred))

0.6189624329159212
0.08583690987124463
0.38461538461538464




In [41]:
features.to_csv('features.csv')

In [47]:
features.to_csv('features_beforemoedifying.csv')