In [17]:
training_data = [
    ['Green', 3, 'Mango'],
    ['Yellow', 3, 'Mango'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon']
]

header=['color','diameter','label']

def unique_value(rows, col):#find the unique values in col for the data set 
    return set(row[col] for row in rows)

def class_counts(rows): # counts the no of each type of example in dataset
    counts = {}# a dictionary for label-> count
    for row in rows:
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label]+=1
    return counts

def is_numeric(value):
    return isinstance(value, int) or isinstance(value, float)

class Question:
    """A Question is used to partition a dataset.
    
    This class is just recorf a 'column number' and a 'column value'. The 'match' method is used to compare the feature 
    value in an example to the feature value stored in the question."""
    
    def __init__(self, column, value):
        self.column=column
        self.value=value
    
    def match(self, example):
        '''Compare the feature value in the example to the feature value in this question'''
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value
        
    def __repr__(self):
        '''this is just a helper method to print the question in a readable format'''
        condition = '=='
        if is_numeric(self.value):
            condition = '>='
        return 'Is %s %s %s' % (header[self.column], condition, str(self.value))
    
def partition(rows, question):
        true_rows, false_rows = [], []
        for row in rows:
            if question.match(row):
                true_rows.append(row)
            else:
                false_rows.append(row)
        return true_rows, false_rows
    
def gini(rows):
        ## calculate the Gini Impurityfor a list of rows
        counts = class_counts(rows)
        impurity = 1
        for lbl in counts:
            prob_of_lbl = counts[lbl]/float(len(rows))
            impurity -= prob_of_lbl**2
        return impurity
    
def info_gain(left, right, current_uncertainty):
        p = float(len(left))/(len(left)+len(right))
        return current_uncertainty - p * gini(left) - (1-p) * gini(right)
    
def find_best_split(rows):
        best_gain = 0
        best_question = None
        current_uncertainty = gini(rows)
        n_features = len(rows[0])
        
        for col in range(n_features):
            values = set([row[col] for row in rows])
            for val in values:
                question = Question(col, val)
                true_rows, false_rows = partition(rows, question)
                
                if len(true_rows) == 0 or len(false_rows) == 0:
                    continue
                    
                gain = info_gain(true_rows, false_rows, current_uncertainty)
                
                if gain >= best_gain:
                    best_gain, best_question = gain, question
        return best_gain, best_question

class Leaf:
    def __init__(self, rows):
        self.predictions = class_counts(rows)
        
class Decision_Node:
    def __init__(self, question, true_branch, false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

        
def build_tree(rows):
    gain, question = find_best_split(rows)
    if gain == 0:
        return Leaf(rows)
    true_rows, false_rows = partition(rows, question)
    true_branch = build_tree(true_rows)
    false_branch = build_tree(false_rows)
    return Decision_Node(question, true_branch, false_branch)

def print_tree(node, spacing=''):
    if isinstance(node, Leaf):
        print(spacing+'Predict', node.predictions)
        return
    print(spacing + str(node.question))
    print(spacing + '--> True:')
    print_tree(node.false_branch, spacing + " ")
    
def classify(row, node):
    if isinstance(node, Leaf):
        return node.predictions
    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)
    
def print_leaf(counts):
    total = sum(counts.values())*1.0
    probs = {}
    for lbl in counts.keys():
        probs[lbl] = str(int(counts[lbl]/total*100))+ '%'
    return probs

if __name__ == '__main__':
    my_tree = build_tree(training_data)
    print_tree(my_tree)
    testing_data = [
        ['Green', 3, 'Mango'],
        ['Yellow', 3, 'Mango'],
        ['Red', 1, 'Grape'],
        ['Red', 1, 'Grape'],
        ['Yellow', 3, 'Lemon']
    ]
    
    for row in testing_data:
        print("Actual is: %s. Predicted is: %s "% (row[-1], print_leaf(classify(row, my_tree))))

        

Is label == Grape
--> True:
 Is label == Lemon
 --> True:
  Predict {'Mango': 2}
Actual is: Mango. Predicted is: {'Mango': '100%'} 
Actual is: Mango. Predicted is: {'Mango': '100%'} 
Actual is: Grape. Predicted is: {'Grape': '100%'} 
Actual is: Grape. Predicted is: {'Grape': '100%'} 
Actual is: Lemon. Predicted is: {'Lemon': '100%'} 


In [49]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [60]:
loan = pd.read_csv('loan_data_final.csv')

In [61]:
loan.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,all_other,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,all_other,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [62]:
X = loan.drop('not.fully.paid', axis = 'columns')
y = loan['not.fully.paid']

In [63]:
for i in X:
    X[i] = pd.factorize(X[i])[0]

In [64]:
X.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec
0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,1,1,1,1,1,1,1,0,0,0
2,0,0,2,2,2,2,2,2,2,2,1,0,0
3,0,0,3,3,0,3,3,3,3,3,1,0,0
4,0,1,4,4,3,4,4,4,4,4,0,1,0


In [65]:
clf = DecisionTreeClassifier()

In [68]:
from sklearn.model_selection import train_test_split

In [69]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.30, random_state=40)

In [70]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [71]:
pred = clf.predict(X_test)

In [74]:
confusion_matrix(y_test, pred)

array([[1979,  409],
       [ 378,  108]], dtype=int64)

In [75]:
accuracy_score(y_test, pred)

0.7261656228253306