In [1]:
import pandas as pd
import numpy as np
from datetime import date

from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix, accuracy_score 
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.metrics import auc, roc_auc_score, roc_curve


import random
random.seed(20)

In [2]:
features_data = pd.read_csv("features_data.csv")
equity_value_data = pd.read_csv("equity_value_data.csv")

In [3]:
'''Function to extract date in %%y-%%m-%%d'''
def todate(timestamp):
    
    s = timestamp.split("-")
    year = s[0]
    month = s[1]
    date = s[2] 
    date = year+'-'+month+'-'+date[0]+date[1]
    return date


'''Function to get the difference between the dates'''
def date_diff(date_,date1_):
    
    d = date_.split("-")
    d_ = date1_.split("-")
    
    d0 = date(int(d[0]), int(d[1]), int(d[2]))
    d1 = date(int(d_[0]), int(d_[1]), int(d_[2]))
    delta = d1 - d0
    return delta.days


In [4]:
'''To create date column in equity_value_data'''
equity_value_data['date'] = equity_value_data['timestamp'].apply(todate)
equity_value_data.head()

Unnamed: 0,timestamp,close_equity,user_id,date
0,2016-11-16T00:00:00Z,48.16,bcef4fa9b0bdf22bcf7deae708decf03,2016-11-16
1,2016-11-17T00:00:00Z,48.16,bcef4fa9b0bdf22bcf7deae708decf03,2016-11-17
2,2016-11-18T00:00:00Z,48.16,bcef4fa9b0bdf22bcf7deae708decf03,2016-11-18
3,2016-11-21T00:00:00Z,48.16,bcef4fa9b0bdf22bcf7deae708decf03,2016-11-21
4,2016-11-22T00:00:00Z,48.16,bcef4fa9b0bdf22bcf7deae708decf03,2016-11-22


In [5]:
%%time

##############################################################################################################
#####################  Calculating Churn user by applying 28 consecutive days condition  #####################
##############################################################################################################

cnt = 0

#To save Churn users
churn_user = []

for i in range(len(equity_value_data['user_id'])-1):
    
    if equity_value_data['user_id'][i] == equity_value_data['user_id'][i+1]:
        
        days = date_diff(equity_value_data['date'][i],equity_value_data['date'][i+1])
        
        if days >=28 and (equity_value_data['user_id'][i] not in churn_user):
            
            churn_user.append(equity_value_data['user_id'][i])
            cnt += 1

            
Total_User = len(features_data['user_id'])

print("Total user: ", Total_User)
print("Total churn user: ", cnt)
print("Percentage: ",round(cnt / Total_User * 100, 2),"%")

Total user:  5584
Total churn user:  279
Percentage:  5.0 %
Wall time: 30.6 s


In [6]:
'''Adding Churn feature in the features_data'''

features_data['Churn'] = 0
for i in range(len(features_data['user_id'])):
    if features_data['user_id'][i] not in churn_user:
        features_data['Churn'][i] = 0
    else:
        features_data['Churn'][i] = 1     

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_data['Churn'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_data['Churn'][i] = 1


In [7]:
features_data.head()

Unnamed: 0,risk_tolerance,investment_experience,liquidity_needs,platform,time_spent,instrument_type_first_traded,first_deposit_amount,time_horizon,user_id,Churn
0,high_risk_tolerance,limited_investment_exp,very_important_liq_need,Android,33.129417,stock,40.0,med_time_horizon,895044c23edc821881e87da749c01034,0
1,med_risk_tolerance,limited_investment_exp,very_important_liq_need,Android,16.573517,stock,200.0,short_time_horizon,458b1d95441ced242949deefe8e4b638,0
2,med_risk_tolerance,limited_investment_exp,very_important_liq_need,iOS,10.008367,stock,25.0,long_time_horizon,c7936f653d293479e034865db9bb932f,0
3,med_risk_tolerance,limited_investment_exp,very_important_liq_need,Android,1.031633,stock,100.0,short_time_horizon,b255d4bd6c9ba194d3a350b3e76c6393,0
4,high_risk_tolerance,limited_investment_exp,very_important_liq_need,Android,8.18725,stock,20.0,long_time_horizon,4a168225e89375b8de605cbc0977ae91,0


In [8]:
data = features_data.copy()
data.columns

Index(['risk_tolerance', 'investment_experience', 'liquidity_needs',
       'platform', 'time_spent', 'instrument_type_first_traded',
       'first_deposit_amount', 'time_horizon', 'user_id', 'Churn'],
      dtype='object')

In [9]:
data.drop('user_id',axis = 1, inplace= True)

In [10]:
print(data["Churn"].value_counts())

0    5305
1     279
Name: Churn, dtype: int64


In [11]:
df = pd.get_dummies(data, prefix=['Tolerance_','Exp_',"liq_",'pltform_','first_traded','time_'])

In [12]:
### Train-Test Split:

Y = df["Churn"]
X = df.drop(columns="Churn")

X_train, X_test, y_train, y_test = train_test_split(X, Y,stratify=Y, test_size = 0.2,random_state=20)


### Scaling:

sc_X = StandardScaler()
X_train2 = pd.DataFrame(sc_X.fit_transform(X_train))
X_train2.columns = X_train.columns.values
X_train2.index = X_train.index.values
X_train = X_train2

X_test2 = pd.DataFrame(sc_X.transform(X_test))
X_test2.columns = X_test.columns.values
X_test2.index = X_test.index.values
X_test = X_test2

In [13]:
def evaluate(y_test, y_pred):
    #Evaluate results
    AUC = roc_auc_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred )
    prec = precision_score(y_test, y_pred )
    rec = recall_score(y_test, y_pred )
    f1 = f1_score(y_test, y_pred )
    
    model_results = pd.DataFrame([['Random Forest', AUC, acc, prec, rec, f1]], columns = ['Model', 'AUC','Accuracy', 
                                                                                      'Precision', 'Recall', 'F1 Score'])
    results = model_results.sort_values(["AUC","Precision", "Recall"], ascending = False)
    
    return results


In [14]:
RF_cl = RandomForestClassifier(n_estimators = 72, criterion = 'entropy', class_weight='balanced' , max_depth=5)

In [15]:
RF_cl.fit(X_train, y_train)

# Predicting the Test set results 
y_pred = RF_cl.predict(X_test)
print (evaluate(y_test, y_pred))

print(classification_report(y_test, y_pred))

           Model       AUC  Accuracy  Precision    Recall  F1 Score
0  Random Forest  0.552797  0.760967   0.072874  0.321429  0.118812
              precision    recall  f1-score   support

           0       0.96      0.78      0.86      1061
           1       0.07      0.32      0.12        56

    accuracy                           0.76      1117
   macro avg       0.51      0.55      0.49      1117
weighted avg       0.91      0.76      0.82      1117



In [16]:
RF_cl = RandomForestClassifier(n_estimators = 150, criterion = 'entropy', class_weight= 'balanced' , max_depth=10)

In [17]:
RF_cl.fit(X_train, y_train)

# Predicting the Test set results 
y_pred = RF_cl.predict(X_test)

print (evaluate(y_test, y_pred))

print(classification_report(y_test, y_pred))

           Model       AUC  Accuracy  Precision    Recall  F1 Score
0  Random Forest  0.558654  0.900627   0.133333  0.178571  0.152672
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      1061
           1       0.13      0.18      0.15        56

    accuracy                           0.90      1117
   macro avg       0.54      0.56      0.55      1117
weighted avg       0.91      0.90      0.91      1117



# Recursive feature elimination with cross-validation to select the number of features:

In [18]:
from sklearn.feature_selection import RFECV

In [19]:
selector = RFECV(RF_cl, step=1, min_features_to_select=15 ,cv=5, scoring='recall')
selector = selector.fit(X_train, y_train)

y_pred = selector.predict(X_test)
print (evaluate(y_test, y_pred))
print(classification_report(y_test, y_pred))

           Model       AUC  Accuracy  Precision    Recall  F1 Score
0  Random Forest  0.541268  0.899731   0.111111  0.142857     0.125
              precision    recall  f1-score   support

           0       0.95      0.94      0.95      1061
           1       0.11      0.14      0.12        56

    accuracy                           0.90      1117
   macro avg       0.53      0.54      0.54      1117
weighted avg       0.91      0.90      0.91      1117



# Undersampling:

In [33]:
rus = RandomUnderSampler() 

# resampling X, y
X_rus, y_rus = rus.fit_resample(X_train, y_train)


print(f'''Shape of X before SMOTE: {X_train.shape} Shape of X after SMOTE: {X_rus.shape}''')
print(f'''Shape of X before SMOTE: {y_train.shape} Shape of X after SMOTE: {y_rus.shape}''')

print('\nBalance of positive and negative classes (%):')
y_rus.value_counts(normalize=True) * 100

Shape of X before SMOTE: (4467, 29) Shape of X after SMOTE: (446, 29)
Shape of X before SMOTE: (4467,) Shape of X after SMOTE: (446,)

Balance of positive and negative classes (%):


1    50.0
0    50.0
Name: Churn, dtype: float64

In [34]:
# Recursive feature elimination with cross-validation 

selector = RFECV(RF_cl, step=1, min_features_to_select=15 ,cv=5, scoring='recall')
selector = selector.fit(X_rus, y_rus)

y_pred = selector.predict(X_test)
#print (evaluate(y_test, y_pred))
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.56      0.70      1061
           1       0.05      0.48      0.10        56

    accuracy                           0.55      1117
   macro avg       0.50      0.52      0.40      1117
weighted avg       0.91      0.55      0.67      1117



In [35]:
RF_cl = RandomForestClassifier(n_estimators = 150, criterion = 'entropy' , max_depth=25)

In [36]:
selector = RFECV(RF_cl, step=2, min_features_to_select=20 ,cv=5, scoring='recall')
selector = selector.fit(X_rus, y_rus)

y_pred = selector.predict(X_test)
#print (evaluate(y_test, y_pred))
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.55      0.70      1061
           1       0.06      0.50      0.10        56

    accuracy                           0.55      1117
   macro avg       0.51      0.53      0.40      1117
weighted avg       0.91      0.55      0.67      1117



# Principal Component Analysis:

In [58]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_trn = pca.fit_transform(X_rus)
X_tst = pca.fit_transform(X_test)

In [59]:
selector = RFECV(RF_cl, step=2, min_features_to_select=25 ,cv=5, scoring='recall')
selector = selector.fit(X_trn, y_rus)

In [60]:
y_pred = selector.predict(X_tst)
#print (evaluate(y_test, y_pred))
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.35      0.52      1061
           1       0.06      0.79      0.11        56

    accuracy                           0.38      1117
   macro avg       0.51      0.57      0.31      1117
weighted avg       0.92      0.38      0.50      1117

