In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    # Print the full path of every file found beneath the input directory.
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the raw customer data from the Kaggle input CSV.
bank_churners = pd.read_csv('/kaggle/input/credit-card-customers/BankChurners.csv')
# Report the row count (message spelling corrected: "records").
print(f'Total records: {len(bank_churners)}')
bank_churners.head() # peek into the data, check columns and kind of information

In [None]:
# Print every column name of the loaded frame, one per line.
for column_name in bank_churners.columns:
    print(column_name)


In [None]:
# Data cleanup: keep only the rows in which none of the three categorical
# columns below carries the placeholder value 'Unknown'.
bank_churners = bank_churners[
    ~bank_churners[['Education_Level', 'Income_Category', 'Marital_Status']]
    .eq('Unknown')
    .any(axis=1)
]
# Show the distinct values that remain in each column of interest.
print(f"Education_Level: {bank_churners['Education_Level'].unique()}")
print(f"Card_Category: {bank_churners['Card_Category'].unique()}")
print(f"Income_Category: {bank_churners['Income_Category'].unique()}")
print(f"Attrition_Flag: {bank_churners['Attrition_Flag'].unique()}")
print(f"Gender: {bank_churners['Gender'].unique()}")
print(f"Marital_Status: {bank_churners['Marital_Status'].unique()}")
print(f"Dependent_count: {bank_churners['Dependent_count'].unique()}")
print(f"Months_on_book: {np.sort(bank_churners['Months_on_book'].unique())}")

In [None]:
# Discard every row that has a missing value in any of the columns the
# model will rely on.
df = bank_churners.dropna(
    subset=[
        "Attrition_Flag",
        "Gender",
        "Dependent_count",
        "Card_Category",
        "Marital_Status",
        "Income_Category",
        "Education_Level",
    ]
)
print(f'Now the total records are: {len(df)}')

In [None]:
# Boolean masks on the number of dependents (i1 is not used further below).
i1 = df['Dependent_count'] > 3
i2 = df['Dependent_count'] < 4
# Family_Size: 'Small Family' where fewer than 4 dependents, otherwise 'Big Family'.
df['Family_Size'] = np.where(i2, 'Small Family', 'Big Family')
# Association: bucket Months_on_book into three bands; the stricter
# condition is listed first so it wins where both hold.
df['Association'] = np.select(
    [df['Months_on_book'] > 40, df['Months_on_book'] > 20],
    ['Long Duration', 'Medium Duration'],
    default='Short Duration',
)

# Keep only the target plus the categorical attributes the tree will use.
columns = [
    "Attrition_Flag",
    "Income_Category",
    "Family_Size",
    "Card_Category",
    "Marital_Status",
    "Gender",
    "Education_Level",
    "Association",
]
df = pd.DataFrame(df, columns=columns)

df.keys()

In [None]:
from sklearn.model_selection import train_test_split
# 70/30 split with a fixed seed. The full frame (target column included) is
# passed as X because the tree code below reads the target from the samples.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df['Attrition_Flag'],
    test_size=0.30,
    random_state=42,
)


In [None]:
def compute_entropy(y):
    """Return the Shannon entropy (in bits) of the values in ``y``.

    Parameters
    ----------
    y : pandas.Series
        Class labels; relative frequencies are taken from
        ``y.value_counts(normalize=True)``.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed class frequencies. The result
        is exactly 0 when ``y`` holds fewer than two items or a single class.
    """
    if len(y) < 2: #only one object
        return 0
    freq = y.value_counts(normalize=True).to_numpy()
    # Drop zero frequencies (e.g. unused categories of a categorical Series)
    # instead of adding an epsilon inside the log: the epsilon biased every
    # result and made the entropy of a pure set slightly negative.
    freq = freq[freq > 0]
    # "+ 0.0" normalises a possible negative zero to plain 0.0.
    return -(freq * np.log2(freq)).sum() + 0.0

def compute_info_gain(samples, attribute, target):
    """Return the information gain of splitting ``samples`` on ``attribute``.

    Parameters
    ----------
    samples : pandas.DataFrame
        Rows to evaluate; must contain both ``attribute`` and ``target``.
    attribute : str
        Column whose distinct values define the candidate split.
    target : str
        Column holding the class labels.

    Returns
    -------
    float
        Entropy of ``samples[target]`` minus the frequency-weighted entropy
        of the target within each attribute value.

    Notes
    -----
    Prints the attribute under test and its value distribution as a trace.
    """
    split_ent = 0
    values = samples[attribute].value_counts(normalize=True)
    print(f"IG Test Attribute: {attribute}, Target: {target}, Size: {len(samples)}")
    print(f"{values}")
    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
    for v, freq in values.items():
        subset = samples[samples[attribute] == v]
        # freq is already normalised, so it acts as the subgroup weight.
        split_ent += freq * compute_entropy(subset[target])
    # The weighted subgroup entropy cannot exceed the parent's entropy.
    ent = compute_entropy(samples[target])
    return ent - split_ent



In [None]:
#Create the ID3 algorithm
#Evaluate entropy, create branches and check info gain
class TreeNode:
    """A single node of an ID3 decision tree.

    Attributes
    ----------
    decision :
        Class label this node answers with. For a leaf it is the prediction;
        for an internal node it is the majority class of its samples and is
        used as a fallback when a value has no matching child.
    samples : pandas.DataFrame
        Rows that reached this node (includes the target column).
    target : str
        Name of the class-label column.
    parent_node : TreeNode or None
        Node that created this one (None for the root).
    split_attribute : str or None
        Column this node branches on; None for a leaf.
    parent_split_attribute_value :
        Value of the parent's split attribute that leads to this node.
    children : dict
        Maps each value of ``split_attribute`` to the child TreeNode.
    """

    # Legacy fallback label, used only when a node has no samples and there
    # is no parent decision to inherit.
    DEFAULT_DECISION = "Existing Customer"

    def __init__(self,p_node,val, samples, target):
        #This information is needed till we create the tree
        self.decision = None
        self.samples = samples
        self.target = target
        self.parent_node = p_node
        #attribute on the values of which we will split the data
        self.split_attribute = None
        self.parent_split_attribute_value = val
        # Always present so leaves can be inspected without an AttributeError.
        self.children = {}

    def getSplitAttribute(self):
        """Return the column this node splits on (None for a leaf)."""
        return self.split_attribute

    def make(self):
        """Build the subtree rooted at this node from ``self.samples``.

        Picks the attribute with the highest information gain (via
        ``compute_info_gain``), creates one child per distinct value and
        recurses. Prints a trace of the construction.
        """
        target = self.target
        samples = self.samples
        if self.parent_node is not None:
            print(f'Node: Parent:SplitAttr:{self.parent_node.getSplitAttribute()}, Attribute Val: {self.parent_split_attribute_value}')

        if len(samples) < 1: # there are no elements in the given sample
            # Can happen when the split value is NaN: the equality mask used
            # to select the child's rows then matches nothing.
            print("None on this leg")
            inherited = self.parent_node.decision if self.parent_node is not None else None
            self.decision = inherited if inherited is not None else self.DEFAULT_DECISION
            return

        labels = samples[target]
        distinct_labels = labels.unique()
        if len(distinct_labels) == 1: # all belong to same class target
            self.decision = distinct_labels[0]
            if self.parent_node is not None:
                print(f'Clean: {self.decision}, {self.parent_node.getSplitAttribute()}')
            return

        # Mixed node: remember the majority class before splitting. It is the
        # answer when no attribute is informative and the fallback for values
        # never seen during training.
        self.decision = labels.value_counts().idxmax()

        # Evaluate the information gain of every attribute except the target.
        ig_max = 0
        for a in samples.keys():
            if a == target: # in our case this would be attrition customer state
                continue #skip it
            aig = compute_info_gain(samples, a, target)
            print(f"Split attribute: {a}, IG:: {aig}")
            if aig > ig_max:
                # remember the best attribute found so far
                ig_max = aig
                self.split_attribute = a
                print(f"Better attribute: {a}, IG_Max updated: {ig_max}")
        print(f"split by {self.split_attribute}, IG: {ig_max:.6f}")
        if self.split_attribute is None:
            # No attribute improved on zero gain: stay a leaf that answers
            # with the majority class computed above.
            print(f"Wasted Leg")
            return

        self.children = {}
        for v in samples[self.split_attribute].unique(): #unique values/keys to create class
            index = samples[self.split_attribute] == v
            self.children[v] = TreeNode(self, v, samples[index], target) # recurse
            # stop condition already captured
            self.children[v].make()

    def pretty_print(self, prefix=''):
        """Print one line per leaf: the path of conditions and the decision."""
        if self.split_attribute is not None:
            for k, v in self.children.items():
                v.pretty_print(f"{prefix}:when {self.split_attribute} is {k}")
        else:
            print(f"{prefix}:{self.decision}")

    def predict(self, samples, i):
        """Return the predicted class for row position ``i`` of ``samples``.

        Follows the child matching the row's value of the split attribute.
        If there is no such child, this node's own ``decision`` is returned.
        """
        if self.split_attribute is not None:
            value = samples[self.split_attribute].iloc[i]
            print(f"Attribute: {self.split_attribute}, Value: {value}")
            child = self.children.get(value)
            if child is not None:
                return child.predict(samples, i)
        return self.decision
class TreeID3:
    """Thin wrapper that builds an ID3 tree and scores it on a test frame."""

    def __init__(self):
        # Root TreeNode; set by fit().
        self.root = None

    def fit(self, samples, target):
        """Build the tree from ``samples``, using column ``target`` as label."""
        self.root = TreeNode(None,None, samples, target)
        self.root.make()
        return

    def test_model(self, test, target):
        """Predict every row of ``test`` and return the accuracy.

        Parameters
        ----------
        test : pandas.DataFrame
            Rows to classify. A ``'prediction'`` column is added to (or
            overwritten on) this frame.
        target : str
            Column of ``test`` holding the true labels.

        Returns
        -------
        float
            Fraction of rows whose prediction equals the true label, or 0.0
            for an empty frame.
        """
        actual = test[target]
        predictions = []
        correct = 0
        for i in range(len(test)):
            predicted = self.root.predict(test, i)
            print(f"Prediction:{predicted}")
            predictions.append(predicted)
            if predicted == actual.iloc[i]:
                correct += 1
        # Write the whole column in one assignment. The previous per-row
        # test['prediction'].iloc[i] = ... was chained assignment, which
        # warns on recent pandas and has no effect under copy-on-write.
        test['prediction'] = predictions
        # return accuracy from testing
        print(f"Predicted Correctly: {correct}")
        if len(test) == 0:
            return 0.0  # avoid ZeroDivisionError on an empty test set
        return correct / len(test)

In [None]:
# Build the ID3 tree on the training rows, then print one line per leaf.
node = TreeID3()
node.fit(samples=X_train, target="Attrition_Flag")
node.root.pretty_print()

In [None]:
# Score the tree on the held-out rows; this also adds a 'prediction'
# column to X_test.
accuracy = node.test_model(test=X_test, target='Attrition_Flag')
print(accuracy)

In [None]:
# Summarise the predictions written to X_test['prediction'].
pred_existing_customers = int((X_test['prediction'] == 'Existing Customer').sum())
pred_attrited_customers = int((X_test['prediction'] == 'Attrited Customer').sum())
predicted_total = pred_existing_customers + pred_attrited_customers
correct_predictions = int((X_test['prediction'] == X_test['Attrition_Flag']).sum())
# Accuracy is measured over the rows that received one of the two labels.
# Both ratios are guarded so an empty test set, or a model that produced no
# recognised prediction, reports 0 instead of raising ZeroDivisionError.
accuracy = correct_predictions / predicted_total if predicted_total else 0.0
coverage = predicted_total / len(X_test) if len(X_test) else 0.0
print(f"Test data size: {len(X_test)}, Predicted: {predicted_total}")
print(f"Model Capability to predict: {coverage:.2%}")
print(f"Accuracy: {accuracy:.2%}")