## Decision Tree

### Model 1 - RFE20

In [6]:
import pandas as pd 

# Load the train and test tables in one pass; each CSV becomes its own DataFrame.
# NOTE(review): the "StdSc" suffix suggests the features were standard-scaled
# upstream -- confirm against the preprocessing notebook.
df_train_data, df_test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('final_train_data_StdSc.csv', 'final_test_data_StdSc.csv')
)

In [7]:
# RFE20
# A single column list is used for both splits so the train and test matrices
# always contain the same features in the same order.
feature_columns = [
    'registered_via_9', 'registered_via_4', 'registered_via_3',
    'avg_actual_amount_paid', 'avg_plan_list_price', 'active_days',
    'total_cancel', 'is_auto_renew', 'most_fq_payment_method_id',
    'number_of_days_lastthree_listened', 'number_of_days_lasttwo_listened',
    'num_totalsec_lasttwo_mean', 'num_100_lasttwo_mean', 'num_unq_lasttwo_sum',
    'number_of_days_201701_listened', 'num_unq_201701_sum',
    'number_of_days_201702_listened', 'num_totalsec_201702_mean',
    'num_totalsec_201702_sum', 'number_of_days_listened',
]

target_train = df_train_data[['is_churn']]

# This was previously assigned to the misspelled name `data_traintes`, which
# left `data_train` undefined for the grid search and model fit further down.
data_train = df_train_data[feature_columns]

target_test = df_test_data[['is_churn']]

data_test = df_test_data[feature_columns]


In [3]:
# GridSearch: tuning parameters
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter values; every combination is scored with
# 5-fold cross-validation.
parameters = [
    {
        "criterion": ["gini", "entropy"],
        "min_samples_split": [2, 10, 20],
        "max_depth": [None, 5, 10, 15],
        "min_samples_leaf": [1, 5, 10],
        "max_leaf_nodes": [None, 5, 10, 20],
    }
]

tree = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=parameters,
    cv=5,
)
tree.fit(data_train, target_train)

# Best combination found by the search
print(tree.best_params_)

{'criterion': 'entropy', 'max_depth': 10, 'max_leaf_nodes': None, 'min_samples_leaf': 10, 'min_samples_split': 10}


In [8]:
# apply decision tree model
from sklearn.tree import DecisionTreeClassifier
# Model 1 tree configuration
decision_tree = DecisionTreeClassifier(
    criterion='entropy',
    min_impurity_decrease=0.00015,
    max_depth=15,
    max_leaf_nodes=None,
    min_samples_leaf=10,
    min_samples_split=10,
)

# fit() returns the estimator itself, so training and test-set prediction
# can be chained in one expression.
prediction = decision_tree.fit(data_train, target_train).predict(data_test)

# Performance evaluation

from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

def confusion_matrix_report(y_true, y_pred):
    """Render the confusion matrix of `y_true` vs `y_pred` as an aligned text table.

    Rows are the true labels and columns are the predicted labels (the layout
    returned by sklearn's `confusion_matrix`); a "Prediction" banner is
    centred above the columns.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    str
        Multi-line table ending with a newline.
    """
    cm = confusion_matrix(y_true, y_pred)
    labels = unique_labels(y_true, y_pred)
    n_labels = len(labels)

    # Size the columns from the labels AND the counts: a count wider than the
    # old fixed minimum of 5 used to overflow its column and break alignment.
    rendered = [str(label) for label in labels] + [str(count) for count in cm.ravel()]
    column_width = max([len(text) for text in rendered] + [5])

    # Width of the data area: n columns plus the single spaces between them.
    body_width = max(column_width * n_labels + (n_labels - 1), 0)
    pad = " " * column_width

    lines = [pad + " " + "{:_^{}}".format("Prediction", body_width)]
    # Every cell, including the leading pad / row label, is joined with one
    # space so a row label can no longer fuse with the first count.
    lines.append(" ".join([pad] + ["{:>{}}".format(label, column_width) for label in labels]))
    for i, row_label in enumerate(labels):
        cells = ["{:>{}}".format(row_label, column_width)]
        cells += ["{:{}d}".format(cm[i, j], column_width) for j in range(n_labels)]
        lines.append(" ".join(cells))
    return "\n".join(lines) + "\n"

def loggloss(target_test, model, data_test):
    """Return the log loss of `model`'s predicted probabilities on the test data.

    Parameters
    ----------
    target_test : array-like
        True labels for `data_test`.
    model : fitted estimator exposing `predict_proba`
    data_test : array-like
        Feature matrix to score.
    """
    return log_loss(target_test, model.predict_proba(data_test))

def AUC(target_test, model, data_test):
    """Return the ROC AUC of `model` on the test data.

    The score passed to `roc_auc_score` is column 1 of `predict_proba`.
    NOTE(review): presumably the churn (label 1) probability -- column order
    follows `model.classes_`; confirm.
    """
    class_probabilities = model.predict_proba(data_test)
    second_class_scores = class_probabilities[:, 1]
    return roc_auc_score(target_test, second_class_scores)
    
def analytics(target_test, model, data_test):
    """Print an evaluation summary of a fitted classifier on the test data.

    Prints, in order: confusion matrix, accuracy, classification report,
    log loss and ROC AUC.

    Parameters
    ----------
    target_test : array-like
        True labels for `data_test`.
    model : fitted classifier exposing `predict` and `predict_proba`
    data_test : array-like
        Feature matrix the model is evaluated on.

    Returns
    -------
    None
    """
    # Use the predictions computed here instead of a module-level `prediction`
    # variable, so the label-based metrics always describe the `model` and
    # `data_test` that were passed in (the same ones used for log loss / AUC).
    y_pred = model.predict(data_test)
    print("Confusion Matrix:")
    print(confusion_matrix_report(target_test, y_pred))
    print("Accuracy Score:")
    print(accuracy_score(target_test, y_pred))
    print()
    print("Classification Report:")
    print(classification_report(target_test, y_pred))
    print("Log Loss:")
    print(loggloss(target_test, model, data_test))
    print()
    print("AUC Score:")
    print(AUC(target_test, model, data_test))
    
# Print the evaluation summary for the Model 1 tree on the test split.
analytics(target_test, decision_tree, data_test)


Confusion Matrix:
      Prediction
         0     1
    0220373 25177
    1 1435 15858

Accuracy Score:
0.898753248137

Classification Report:
             precision    recall  f1-score   support

          0       0.99      0.90      0.94    245550
          1       0.39      0.92      0.54     17293

avg / total       0.95      0.90      0.92    262843

Log Loss:
0.19883516365

AUC Score:
0.969856739738


### Model 2 - F-classification20

In [10]:
# F-classification: 20
# One column list drives both splits so train and test stay aligned.
feature_columns = [
    'gender_2', 'registered_via_3', 'registered_via_4', 'registered_via_7',
    'avg_actual_amount_paid', 'avg_plan_list_price', 'active_days',
    'total_cancel', 'is_auto_renew', 'total_churn',
    'number_of_days_lastthree_listened', 'number_of_days_lasttwo_listened',
    'number_of_days_201702_listened', 'num_totalsec_201702_sum',
    'num_unq_201702_sum', 'num_100_201702_sum', 'num_985_201702_sum',
    'num_75_201702_sum', 'num_50_201702_sum', 'num_25_201702_sum',
]

target_train = df_train_data[['is_churn']]
data_train = df_train_data[feature_columns]

target_test = df_test_data[['is_churn']]
data_test = df_test_data[feature_columns]

# Apply the decision tree model
decision_tree = DecisionTreeClassifier(
    criterion='entropy',
    min_impurity_decrease=0.0002,
    max_depth=10,
    max_leaf_nodes=None,
    min_samples_leaf=10,
    min_samples_split=10,
)
decision_tree.fit(data_train, target_train)

prediction = decision_tree.predict(data_test)

# Performance evaluation
analytics(target_test, decision_tree, data_test)


Confusion Matrix:
      Prediction
         0     1
    0220019 25531
    1 1464 15829

Accuracy Score:
0.897296104519

Classification Report:
             precision    recall  f1-score   support

          0       0.99      0.90      0.94    245550
          1       0.38      0.92      0.54     17293

avg / total       0.95      0.90      0.92    262843

Log Loss:
0.202091551281

AUC Score:
0.968650367568


### Model 3 - Mutual Information20

In [11]:
# Mutual Information20
# One column list drives both splits so train and test stay aligned.
feature_columns = [
    'registered_via_7', 'avg_actual_amount_paid', 'avg_plan_list_price',
    'active_days', 'total_cancel', 'is_auto_renew', 'total_churn',
    'most_fq_payment_method_id', 'number_of_days_lasttwo_listened',
    'number_of_days_201702_listened', 'num_unq_201702_mean',
    'num_100_201702_mean', 'num_75_201702_mean', 'num_50_201702_mean',
    'num_25_201702_mean', 'num_totalsec_201702_sum', 'num_unq_201702_sum',
    'num_100_201702_sum', 'num_75_201702_sum', 'num_50_201702_sum',
]

target_train = df_train_data[['is_churn']]
data_train = df_train_data[feature_columns]

target_test = df_test_data[['is_churn']]
data_test = df_test_data[feature_columns]

# Apply the decision tree model
decision_tree = DecisionTreeClassifier(
    criterion='entropy',
    min_impurity_decrease=0.0002,
    max_depth=10,
    max_leaf_nodes=None,
    min_samples_leaf=10,
    min_samples_split=10,
)
decision_tree.fit(data_train, target_train)

prediction = decision_tree.predict(data_test)

# Performance evaluation
analytics(target_test, decision_tree, data_test)


Confusion Matrix:
      Prediction
         0     1
    0219970 25580
    1 1436 15857

Accuracy Score:
0.897216208916

Classification Report:
             precision    recall  f1-score   support

          0       0.99      0.90      0.94    245550
          1       0.38      0.92      0.54     17293

avg / total       0.95      0.90      0.92    262843

Log Loss:
0.203054885746

AUC Score:
0.969177671581


### Model 4 - PCA21

In [12]:
# PCA21
# NOTE(review): the label says 21 but the list below holds 20 columns -- confirm.
# One column list drives both splits so train and test stay aligned.
feature_columns = [
    'num_75_lasttwo_mean', 'num_50_mean', 'num_100_201612_mean',
    'num_100_201701_mean', 'num_above_50_sum', 'num_totalsec_sum',
    'num_75_mean', 'num_totalsec_lastthree_sum', 'num_100_sum',
    'num_100_lastthree_sum', 'num_50_lastthree_mean', 'num_75_lastthree_mean',
    'num_totalsec_lasttwo_mean', 'num_totalsec_med', 'num_100_lasttwo_mean',
    'num_100_med', 'num_totalsec_lastthree_mean', 'num_totalsec_mean',
    'num_100_lastthree_mean', 'num_100_mean',
]

target_train = df_train_data[['is_churn']]
data_train = df_train_data[feature_columns]

target_test = df_test_data[['is_churn']]
data_test = df_test_data[feature_columns]

# Apply the decision tree model
decision_tree = DecisionTreeClassifier(
    criterion='entropy',
    min_impurity_decrease=0.0002,
    max_depth=10,
    max_leaf_nodes=None,
    min_samples_leaf=10,
    min_samples_split=10,
)
decision_tree.fit(data_train, target_train)

prediction = decision_tree.predict(data_test)

# Performance evaluation
analytics(target_test, decision_tree, data_test)


Confusion Matrix:
      Prediction
         0     1
    0175590 69960
    110460  6833

Accuracy Score:
0.694037885734

Classification Report:
             precision    recall  f1-score   support

          0       0.94      0.72      0.81    245550
          1       0.09      0.40      0.15     17293

avg / total       0.89      0.69      0.77    262843

Log Loss:
0.622831925644

AUC Score:
0.587216265403


### Model 5 - Random Forest20

In [13]:
# Random Forest : 20
# One column list drives both splits so train and test stay aligned.
feature_columns = [
    'num_50_201702_mean', 'num_25_201702_mean', 'num_unq_201702_sum',
    'num_75_mean', 'proportion_songs_above_50', 'num_totalsec_max',
    'num_25_201612_mean', 'num_unq_max', 'num_25_201702_sum',
    'bd', 'num_25_max', 'number_of_days_listened',
    'num_totalsec_min', 'most_fq_payment_method_id', 'avg_plan_list_price',
    'number_of_days_201702_listened', 'total_cancel', 'avg_actual_amount_paid',
    'is_auto_renew', 'active_days',
]

target_train = df_train_data[['is_churn']]
data_train = df_train_data[feature_columns]

target_test = df_test_data[['is_churn']]
data_test = df_test_data[feature_columns]

# Apply the decision tree model (this variant uses min_samples_split=20)
decision_tree = DecisionTreeClassifier(
    criterion='entropy',
    min_impurity_decrease=0.0002,
    max_depth=10,
    max_leaf_nodes=None,
    min_samples_leaf=10,
    min_samples_split=20,
)
decision_tree.fit(data_train, target_train)

prediction = decision_tree.predict(data_test)

# Performance evaluation
analytics(target_test, decision_tree, data_test)

Confusion Matrix:
      Prediction
         0     1
    0219942 25608
    1 1388 15905

Accuracy Score:
0.897292299966

Classification Report:
             precision    recall  f1-score   support

          0       0.99      0.90      0.94    245550
          1       0.38      0.92      0.54     17293

avg / total       0.95      0.90      0.92    262843

Log Loss:
0.203455069433

AUC Score:
0.968875625644


### Model 6 - Correlation35

In [16]:
# Correlation : 35
# One column list drives both splits so train and test stay aligned.
feature_columns = [
    'num_100_201612_mean', 'num_25_201612_mean', 'num_unq_201701_sum',
    'num_unq_201702_mean', 'num_75_201702_mean', 'num_985_201702_sum',
    'num_totalsec_min', 'num_totalsec_max', 'num_unq_max',
    'num_25_max', 'num_100_med', 'num_totalsec_sum',
    'bd', 'registered_via_7', 'registered_via_4',
    'registered_via_3', 'total_churn', 'num_totalsec_lasttwo_mean',
    'num_50_201702_mean', 'num_25_201702_mean', 'num_100_201702_sum',
    'num_50_201702_sum', 'num_25_201702_sum', 'number_of_days_listened',
    'num_75_mean', 'most_fq_payment_method_id', 'avg_actual_amount_paid',
    'active_days', 'total_cancel', 'is_auto_renew',
    'number_of_days_201702_listened', 'num_75_lasttwo_mean', 'num_50_lastthree_mean',
    'proportion_songs_above_50', 'registered_via_9',
]

target_train = df_train_data[['is_churn']]
data_train = df_train_data[feature_columns]

target_test = df_test_data[['is_churn']]
data_test = df_test_data[feature_columns]

# Apply the decision tree model
decision_tree = DecisionTreeClassifier(
    criterion='entropy',
    min_impurity_decrease=0.0002,
    max_depth=10,
    max_leaf_nodes=None,
    min_samples_leaf=10,
    min_samples_split=10,
)
decision_tree.fit(data_train, target_train)

prediction = decision_tree.predict(data_test)

# Performance evaluation
analytics(target_test, decision_tree, data_test)

Confusion Matrix:
      Prediction
         0     1
    0219554 25996
    1 1398 15895

Accuracy Score:
0.89577808806

Classification Report:
             precision    recall  f1-score   support

          0       0.99      0.89      0.94    245550
          1       0.38      0.92      0.54     17293

avg / total       0.95      0.90      0.91    262843

Log Loss:
0.208819520793

AUC Score:
0.969742408334


### Model 7 - Random Forest_cor18

In [18]:
# Random Forest_cor : 18
# One column list drives both splits so train and test stay aligned.
feature_columns = [
    'num_50_201702_mean', 'num_25_201702_mean', 'num_75_mean',
    'proportion_songs_above_50', 'num_totalsec_max', 'num_25_201612_mean',
    'num_unq_max', 'num_25_201702_sum', 'bd',
    'num_25_max', 'number_of_days_listened', 'num_totalsec_min',
    'most_fq_payment_method_id', 'number_of_days_201702_listened', 'total_cancel',
    'avg_actual_amount_paid', 'is_auto_renew', 'active_days',
]

target_train = df_train_data[['is_churn']]
data_train = df_train_data[feature_columns]

target_test = df_test_data[['is_churn']]
data_test = df_test_data[feature_columns]

# Apply the decision tree model
decision_tree = DecisionTreeClassifier(
    criterion='entropy',
    min_impurity_decrease=0.0002,
    max_depth=10,
    max_leaf_nodes=None,
    min_samples_leaf=10,
    min_samples_split=10,
)
decision_tree.fit(data_train, target_train)

prediction = decision_tree.predict(data_test)

# Performance evaluation
analytics(target_test, decision_tree, data_test)


Confusion Matrix:
      Prediction
         0     1
    0219924 25626
    1 1388 15905

Accuracy Score:
0.897223818021

Classification Report:
             precision    recall  f1-score   support

          0       0.99      0.90      0.94    245550
          1       0.38      0.92      0.54     17293

avg / total       0.95      0.90      0.92    262843

Log Loss:
0.20746925407

AUC Score:
0.96982329671


### Model 8 - Mutual Information_cor14

In [19]:
# Mutual Information_cor : 14
# One column list drives both splits so train and test stay aligned.
feature_columns = [
    'registered_via_7', 'avg_actual_amount_paid', 'active_days',
    'total_cancel', 'is_auto_renew', 'total_churn',
    'most_fq_payment_method_id', 'number_of_days_201702_listened',
    'num_unq_201702_mean', 'num_75_201702_mean', 'num_50_201702_mean',
    'num_25_201702_mean', 'num_100_201702_sum', 'num_50_201702_sum',
]

target_train = df_train_data[['is_churn']]
data_train = df_train_data[feature_columns]

target_test = df_test_data[['is_churn']]
data_test = df_test_data[feature_columns]

# Apply the decision tree model
decision_tree = DecisionTreeClassifier(
    criterion='entropy',
    min_impurity_decrease=0.0002,
    max_depth=10,
    max_leaf_nodes=None,
    min_samples_leaf=10,
    min_samples_split=10,
)
decision_tree.fit(data_train, target_train)

prediction = decision_tree.predict(data_test)

# Performance evaluation
analytics(target_test, decision_tree, data_test)


Confusion Matrix:
      Prediction
         0     1
    0220003 25547
    1 1458 15835

Accuracy Score:
0.897258058993

Classification Report:
             precision    recall  f1-score   support

          0       0.99      0.90      0.94    245550
          1       0.38      0.92      0.54     17293

avg / total       0.95      0.90      0.92    262843

Log Loss:
0.208377466049

AUC Score:
0.969849000758


### Model 9 - F-Classification_cor14

In [20]:
# F-Classification_cor : 14
# One column list drives both splits so train and test stay aligned.
feature_columns = [
    'bd', 'registered_via_3', 'registered_via_4', 'registered_via_7',
    'avg_actual_amount_paid', 'active_days', 'total_cancel',
    'is_auto_renew', 'total_churn', 'number_of_days_201702_listened',
    'num_100_201702_sum', 'num_985_201702_sum', 'num_50_201702_sum',
    'num_25_201702_sum',
]

target_train = df_train_data[['is_churn']]
data_train = df_train_data[feature_columns]

target_test = df_test_data[['is_churn']]
data_test = df_test_data[feature_columns]

# Apply the decision tree model (this variant uses min_samples_split=20)
decision_tree = DecisionTreeClassifier(
    criterion='entropy',
    min_impurity_decrease=0.0002,
    max_depth=10,
    max_leaf_nodes=None,
    min_samples_leaf=10,
    min_samples_split=20,
)
decision_tree.fit(data_train, target_train)

prediction = decision_tree.predict(data_test)

# Performance evaluation
analytics(target_test, decision_tree, data_test)

Confusion Matrix:
      Prediction
         0     1
    0219982 25568
    1 1441 15852

Accuracy Score:
0.897242840783

Classification Report:
             precision    recall  f1-score   support

          0       0.99      0.90      0.94    245550
          1       0.38      0.92      0.54     17293

avg / total       0.95      0.90      0.92    262843

Log Loss:
0.207994797444

AUC Score:
0.969048124211


### Model 10 - RFE_cor12

In [21]:
# RFE_cor : 12
# One column list drives both splits so train and test stay aligned.
feature_columns = [
    'registered_via_9', 'registered_via_4', 'registered_via_3',
    'avg_actual_amount_paid', 'active_days', 'total_cancel',
    'is_auto_renew', 'most_fq_payment_method_id', 'num_totalsec_lasttwo_mean',
    'num_unq_201701_sum', 'number_of_days_201702_listened', 'number_of_days_listened',
]

target_train = df_train_data[['is_churn']]
data_train = df_train_data[feature_columns]

target_test = df_test_data[['is_churn']]
data_test = df_test_data[feature_columns]

# Apply the decision tree model (this variant switches to the gini criterion,
# a deeper tree and a smaller impurity-decrease threshold)
decision_tree = DecisionTreeClassifier(
    criterion='gini',
    min_impurity_decrease=0.00004,
    max_depth=15,
    max_leaf_nodes=None,
    min_samples_leaf=10,
    min_samples_split=20,
)
decision_tree.fit(data_train, target_train)

prediction = decision_tree.predict(data_test)

# Performance evaluation
analytics(target_test, decision_tree, data_test)


Confusion Matrix:
      Prediction
         0     1
    0223201 22349
    1 1570 15723

Accuracy Score:
0.908998908093

Classification Report:
             precision    recall  f1-score   support

          0       0.99      0.91      0.95    245550
          1       0.41      0.91      0.57     17293

avg / total       0.95      0.91      0.92    262843

Log Loss:
0.213646010136

AUC Score:
0.968697165764
