# Decision Tree

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO 
from IPython.display import Image 
import pydotplus
import warnings
warnings.filterwarnings('ignore')



In [None]:
# Read the spreadsheet into a DataFrame.
data = pd.read_excel('duplicate_data.xlsx')
# Add a column holding, for each row, the number of missing cells in that row.
data['missing_factor'] = data.isna().sum(axis=1)
# Keep only the rows whose Y_actual value is present.
data1 = data.loc[data['Y_actual'].notna()]

In [7]:
# Feature columns handed to the classifier, in this order.
feature_columns = [
    'LD_company', 'LD_customer_name', 'LD_another_personal_info', 'LD_address',
    'personal_info_missing', 'customer_name_missing',
    'another_personal_info_missing', 'address_info',
]
# Columns in which the text placeholder below is swapped for '1' before the
# values are converted to numbers.
placeholder_columns = [
    'LD_company', 'LD_customer_name', 'LD_another_personal_info', 'LD_address',
    'address_info',
]
# Placeholder text to replace; the spelling is reproduced exactly as matched.
ld_placeholder = "Levenshtine Distance couldn't be calculated"

# .copy() makes X an independent frame, so the column assignments below write
# to X itself instead of to a selection taken from data1 (which is what
# triggers pandas' SettingWithCopyWarning).
X = data1[feature_columns].copy()
y = data1.Y_actual

for column in placeholder_columns:
    # regex=False: the placeholder is matched as literal text.
    as_text = X[column].astype(str).str.replace(ld_placeholder, '1', regex=False)
    X[column] = pd.to_numeric(as_text)

In [None]:
# Hold out 33% of the samples for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, random_state=42, test_size=0.33)
X_train, X_test, y_train, y_test = split_parts
# Display the shape of each of the four partitions as the cell output.
tuple(part.shape for part in split_parts)

In [10]:
# Fit a decision tree limited to at most five leaf nodes.
# random_state is fixed because scikit-learn permutes the features at every
# split; without a seed, ties between equally good splits can be resolved
# differently from one run to the next.
dt = DecisionTreeClassifier(max_leaf_nodes=5, random_state=42)
dt.fit(X_train, y_train)
# score() reports mean accuracy on the given data.
print("Train:", dt.score(X_train, y_train))
print("Test:", dt.score(X_test, y_test))

Train: 0.9829376854599406
Test: 0.9774436090225563


In [None]:
# Export the fitted tree as Graphviz DOT text. With out_file=None,
# export_graphviz returns the DOT source as a string, so no StringIO buffer
# is required.
dot_source = export_graphviz(dt, out_file=None, filled=True, rounded=True,
                             special_characters=True,
                             feature_names=list(X.columns),
                             class_names=['0', '1'])

graph = pydotplus.graph_from_dot_data(dot_source)
# Render the PNG once and reuse the bytes for both the saved file and the
# inline image.
png_bytes = graph.create_png()
with open('Duplicate_DT.png', 'wb') as png_file:
    png_file.write(png_bytes)
Image(png_bytes)

In [14]:
# Predict a class label for every row of the held-out test features.
preds = dt.predict(X_test)

In [16]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def accuracy_metrics(y_test, preds):
    """Print accuracy, precision, recall and F1 for binary predictions.

    Label 0 is treated as the negative class; any other label counts as
    positive. A metric whose denominator is zero (for example precision when
    nothing was predicted positive, or every metric on empty input) is
    reported as 0.0 instead of raising ZeroDivisionError.

    Parameters
    ----------
    y_test : iterable
        True labels.
    preds : iterable
        Predicted labels, aligned element-by-element with ``y_test``.

    Returns
    -------
    None
        The metrics are printed on a single line.

    Raises
    ------
    ValueError
        If ``y_test`` and ``preds`` do not have the same length.
    """
    preds = list(preds)
    y_test = list(y_test)
    if len(y_test) != len(preds):
        raise ValueError(
            'y_test and preds must have the same length, got '
            '%d and %d' % (len(y_test), len(preds))
        )

    true_pos = false_pos = true_neg = false_neg = 0
    for actual, predicted in zip(y_test, preds):
        if actual == predicted:
            if actual == 0:
                true_neg += 1
            else:
                true_pos += 1
        elif actual == 0:
            # Truth is negative but the prediction disagrees: false positive.
            false_pos += 1
        else:
            # Truth is positive but the prediction disagrees: false negative.
            false_neg += 1

    precision = _safe_ratio(true_pos, true_pos + false_pos)
    recall = _safe_ratio(true_pos, true_pos + false_neg)
    accuracy = _safe_ratio(true_pos + true_neg,
                           true_pos + true_neg + false_pos + false_neg)
    f1 = _safe_ratio(2 * (precision * recall), precision + recall)
    print('Accuracy: ', accuracy, ' Precision: ', precision, ' Recall: ', recall, 'F1: ', f1)

In [17]:
# Print the metrics for the test-set predictions against the true test labels.
accuracy_metrics(y_test=y_test, preds=preds)

Accuracy:  0.9774436090225563  Precision:  0.9696969696969697  Recall:  0.9846153846153847 F1:  0.9770992366412214
