In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt  
import warnings
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import random
import time
%matplotlib inline


import os
# Walk /kaggle/input recursively and print the full path of every file found,
# so the available input files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Silences every warning for the rest of the session (deprecation notices included).
warnings.filterwarnings('ignore')
data = '/kaggle/input/adult-dataset/adult.csv'
# header=None: the first line of the file is read as data and the names below
# become the column labels.
# skipinitialspace=True: drop the blank that follows each delimiter in
# ", "-separated files, so string values load as '<=50K' instead of ' <=50K'.
# The label comparisons further down test against the unpadded '<=50K'.
df = pd.read_csv(
    data,
    header=None,
    sep=',',
    skipinitialspace=True,
    names=[
        'Age', 'WorkClass', 'fnlwgt', 'Education', 'YearsOfEd',
        'Marital', 'Occupation', 'Relation', 'Race', 'Gender',
        'Gain', 'Loss', 'HPR', 'Country', 'income',
    ],
)
df

In [None]:
# One-hot encode the categorical feature columns; 'income' is left as-is
# because it is the target the tree has to predict.
df = pd.get_dummies(
    df,
    columns=[
        'WorkClass',
        'Education',
        'Marital',
        'Occupation',
        'Relation',
        'Race',
        'Gender',
        'Country',
    ],
)
df

In [None]:
def missing_value_funder(data):
    """Print one line per column saying whether it holds any missing value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. A value counts as missing when ``pd.isna`` flags it.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    for columnName in data.columns:
        if pd.isna(data[columnName]).any():
            template = 'column {} has missing values'
        else:
            template = 'column {} has not any missing values'
        print(template.format(columnName))


# Report, column by column, whether the one-hot-encoded frame contains NaN values.
# NOTE(review): only values flagged by pd.isna are detected. If the CSV marks
# missing entries with a placeholder string (e.g. '?'), they are not reported
# here and have already become dummy columns — confirm against the raw file.
missing_value_funder(df)

In [None]:
# Put the target column last: the tree helpers below read the class label
# from the final column of the data array (data[:, -1]).
col = list(df.columns)
column = [name for name in col if name != 'income'] + ['income']
df = df[column]
df

In [None]:
def trainTestSplit(dataFrame, testSize):
    """Randomly split ``dataFrame`` into a training part and a test part.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to split; rows are selected by index label.
    testSize : float or int
        A float is treated as a fraction of the rows (rounded to a row count);
        any other value is used directly as the number of test rows.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``. The test rows are drawn with ``random.sample`` and
        the training frame is the input with those index labels dropped.
    """
    testRowCount = round(testSize * len(dataFrame)) if isinstance(testSize, float) else testSize
    heldOutLabels = random.sample(population = dataFrame.index.tolist(), k = testRowCount)
    testPart = dataFrame.loc[heldOutLabels]
    trainPart = dataFrame.drop(heldOutLabels)
    return trainPart, testPart

def checkPurity(data):
    """Return True when every row of ``data`` carries the same class label.

    ``data`` is a 2-D array whose last column holds the label.
    """
    distinctLabels = np.unique(data[:, -1])
    return len(distinctLabels) == 1

def classifyData(data):
    """Return the most frequent class label in the last column of ``data``.

    On a tie, the label that sorts first wins, because ``np.unique`` returns
    the labels sorted and ``argmax`` picks the first maximum.
    """
    labels, frequencies = np.unique(data[:, -1], return_counts = True)
    majorityPosition = frequencies.argmax()
    return labels[majorityPosition]

def getPotentialSplits(data, randomAttributes):
    """Collect candidate split thresholds for each feature column.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; every column except the last is treated as a feature.
    randomAttributes : list of int or None
        Column indices to restrict the search to. It is only honoured when it
        is not longer than the number of feature columns; otherwise all
        feature columns are used.

    Returns
    -------
    dict
        Maps column index to its candidate thresholds: the midpoints between
        consecutive sorted unique values. A column with a single unique value
        maps to the array holding just that value.
    """
    featureCount = data.shape[1] - 1
    candidateColumns = list(range(featureCount))
    if randomAttributes is not None and len(randomAttributes) <= featureCount:
        candidateColumns = randomAttributes

    potentialSplits = {}
    for columnIndex in candidateColumns:
        distinct = np.unique(data[:, columnIndex])
        if len(distinct) == 1:
            # Constant column: keep its only value as the sole candidate.
            potentialSplits[columnIndex] = distinct
            continue
        potentialSplits[columnIndex] = [
            (upper + lower) / 2
            for lower, upper in zip(distinct[:-1], distinct[1:])
        ]
    return potentialSplits

def splitData(data, splitColumn, splitValue):
    """Partition the rows of ``data`` on a threshold in one column.

    Returns ``(below, above)``: rows whose value in ``splitColumn`` is
    ``<= splitValue`` and rows whose value is ``> splitValue``.
    """
    columnValues = data[:, splitColumn]
    goesBelow = columnValues <= splitValue
    goesAbove = columnValues > splitValue
    return data[goesBelow], data[goesAbove]

def calculateEntropy(data):
    """Return the Shannon entropy (base 2) of the class labels in ``data``.

    The labels are read from the last column. An empty array yields 0.
    """
    labelCounts = np.unique(data[:, -1], return_counts = True)[1]
    shares = labelCounts / labelCounts.sum()
    perClassTerms = -shares * np.log2(shares)
    return sum(perClassTerms)

def calculateOverallEntropy(dataBelow, dataAbove):
    """Return the size-weighted entropy of the two halves of a split.

    Each half's entropy is weighted by its share of the combined row count.
    """
    totalRows = len(dataBelow) + len(dataAbove)
    weightBelow = len(dataBelow) / totalRows
    weightAbove = len(dataAbove) / totalRows
    return weightBelow * calculateEntropy(dataBelow) + weightAbove * calculateEntropy(dataAbove)

def determineBestSplit(data, potentialSplits, randomSplits = None):
    """Pick the (column, threshold) pair with the lowest weighted entropy.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with the class label in the last column.
    potentialSplits : dict
        Column index -> candidate thresholds, as built by ``getPotentialSplits``.
    randomSplits : int or None
        ``None`` evaluates every candidate. An integer evaluates that many
        candidates drawn at random (random column, then random threshold).

    Returns
    -------
    tuple
        ``(bestSplitColumn, bestSplitValue)``; ``(0, 0)`` when no candidate
        was evaluated. Ties go to the candidate evaluated last (``<=``).
    """
    if randomSplits is None:
        # Exhaustive search over every candidate of every column.
        candidates = (
            (column, value)
            for column in potentialSplits
            for value in potentialSplits[column]
        )
    else:
        columnPool = list(potentialSplits)

        def drawCandidates():
            # Lazily draw one random (column, threshold) pair per requested split.
            for _ in range(randomSplits):
                column = random.choice(columnPool)
                yield column, random.choice(potentialSplits[column])

        candidates = drawCandidates()

    lowestEntropy = 9999
    chosenColumn, chosenValue = 0, 0
    for column, value in candidates:
        below, above = splitData(data, column, value)
        candidateEntropy = calculateOverallEntropy(below, above)
        if candidateEntropy <= lowestEntropy:
            lowestEntropy = candidateEntropy
            chosenColumn, chosenValue = column, value
    return chosenColumn, chosenValue

def buildDecisionTree(dataFrame, currentDepth = 0, minSampleSize = 2, maxDepth = 1000, randomAttributes = None, randomSplits = None):
    """Recursively grow a decision tree on entropy-minimising threshold splits.

    Parameters
    ----------
    dataFrame : pandas.DataFrame or numpy.ndarray
        A DataFrame on the root call (``currentDepth == 0``), with the class
        label in the last column. Recursive calls receive the raw array.
    currentDepth : int
        Depth of the node being built; callers leave this at 0.
    minSampleSize : int
        Nodes with fewer rows than this become leaves.
    maxDepth : int
        Nodes at this depth become leaves.
    randomAttributes : int or None
        On the root call, the number of feature columns to sample. It is
        converted to a list of column indices and passed down the recursion.
        Values larger than the number of feature columns disable sampling.
    randomSplits : int or None
        Forwarded to ``determineBestSplit``.

    Returns
    -------
    dict or label
        A leaf is a class label. An inner node is
        ``{"<column> <= <threshold>": [yesSubtree, noSubtree]}``.

    Notes
    -----
    The root call stores the frame's column labels in the module-level
    ``COLUMN_HEADERS``, which deeper calls read to word their questions.
    """
    global COLUMN_HEADERS

    if currentDepth != 0:
        data = dataFrame
    else:
        COLUMN_HEADERS = dataFrame.columns
        data = dataFrame.values
        featureCount = len(COLUMN_HEADERS) - 1
        if randomAttributes is not None and randomAttributes <= featureCount:
            randomAttributes = random.sample(population = list(range(featureCount)), k = randomAttributes)
        else:
            randomAttributes = None

    # Stop criteria: pure node, too few rows, or depth limit reached.
    if checkPurity(data) or len(data) < minSampleSize or currentDepth == maxDepth:
        return classifyData(data)

    childDepth = currentDepth + 1
    potentialSplits = getPotentialSplits(data, randomAttributes)
    splitColumn, splitValue = determineBestSplit(data, potentialSplits, randomSplits)
    dataBelow, dataAbove = splitData(data, splitColumn, splitValue)

    # A split that leaves one side empty separates nothing: make a leaf.
    if len(dataBelow) == 0 or len(dataAbove) == 0:
        return classifyData(data)

    question = " <= ".join([str(COLUMN_HEADERS[splitColumn]), str(splitValue)])
    yesAnswer = buildDecisionTree(dataBelow, childDepth, minSampleSize, maxDepth, randomAttributes, randomSplits)
    noAnswer = buildDecisionTree(dataAbove, childDepth, minSampleSize, maxDepth, randomAttributes, randomSplits)

    # Identical branches make the question redundant; collapse to the branch.
    if yesAnswer == noAnswer:
        return yesAnswer
    return {question: [yesAnswer, noAnswer]}

def classifySample(sample, decisionTree):
    """Classify one sample by walking the decision tree down to a leaf.

    Parameters
    ----------
    sample : mapping
        Anything indexable by column name (e.g. a DataFrame row or a dict).
    decisionTree : dict or label
        An inner node is ``{"<column> <= <threshold>": [yesSubtree, noSubtree]}``;
        anything that is not a dict is treated as a leaf label.

    Returns
    -------
    label
        The leaf reached by following the questions.
    """
    node = decisionTree
    # Iterative walk: a deep tree cannot hit Python's recursion limit.
    while isinstance(node, dict):
        question = next(iter(node))
        # Split on the LAST " <= " only, so a column name that itself
        # contains " <= " is still parsed correctly.
        attribute, value = question.rsplit(" <= ", 1)
        branches = node[question]
        node = branches[0] if sample[attribute] <= float(value) else branches[1]
    return node

def decisionTreePredictions(dataFrame, decisionTree):
    """Classify every row of ``dataFrame`` with ``decisionTree``.

    Returns the result of the row-wise ``DataFrame.apply`` of ``classifySample``.
    """
    return dataFrame.apply(lambda row: classifySample(row, decisionTree), axis = 1)

def calculateAccuracy(predictedResults, category):
    """Return the fraction of predictions that equal the true category.

    The two arguments are compared element-wise with ``==`` and the mean of
    the resulting booleans is returned.
    """
    return (predictedResults == category).mean()

In [None]:
# Seed Python's RNG so the random train/test split (and therefore every number
# printed below) is the same on each Restart & Run All.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

dataFrame = df
dataFrameTrain, dataFrameTest = trainTestSplit(dataFrame, testSize = 0.3)

print("Decision Tree - Adult Dataset")

# Train one tree per depth limit and report test/train accuracy, one line each.
# The tree and predictions of the last iteration (maxDepth = 4) stay in scope
# and are the ones the following cells evaluate.
for depth in range(1, 5):
    decisionTree = buildDecisionTree(dataFrameTrain, maxDepth = depth)
    decisionTreeTestResults = decisionTreePredictions(dataFrameTest, decisionTree)
    accuracyTest = calculateAccuracy(decisionTreeTestResults, dataFrameTest.iloc[:, -1]) * 100
    decisionTreeTrainResults = decisionTreePredictions(dataFrameTrain, decisionTree)
    accuracyTrain = calculateAccuracy(decisionTreeTrainResults, dataFrameTrain.iloc[:, -1]) * 100
    print("maxDepth = {}: accTest = {:.2f}%, accTrain = {:.2f}%".format(depth, accuracyTest, accuracyTrain))

In [None]:
def _encodeIncomeLabels(labels):
    """Encode income labels as integers: '<=50K' -> 0, anything else -> 1.

    Surrounding whitespace is stripped before comparing, so a padded label
    such as ' <=50K' is still recognised as the 0 class.

    Parameters
    ----------
    labels : iterable
        Raw labels (true values or tree predictions).

    Returns
    -------
    list of int
        One 0/1 entry per input label, in the same order.
    """
    return [0 if str(label).strip() == '<=50K' else 1 for label in labels]


# True labels of the train and test partitions.
dataFrameTrain_main = dataFrameTrain['income'].to_numpy()
dataFrameTrain_main_new = _encodeIncomeLabels(dataFrameTrain_main)

dataFrameTest_main = dataFrameTest['income'].to_numpy()
dataFrameTest_main_new = _encodeIncomeLabels(dataFrameTest_main)

# Predictions of the most recently built tree on the same partitions.
decisionTreeTrainResults_np = decisionTreeTrainResults.to_numpy()
decisionTreeTrainResults_new = _encodeIncomeLabels(decisionTreeTrainResults_np)

decisionTreeTestResults_np = decisionTreeTestResults.to_numpy()
decisionTreeTestResults_new = _encodeIncomeLabels(decisionTreeTestResults_np)

In [None]:
# Precision and recall on each partition; true labels are passed first and
# predictions second, the argument order scikit-learn expects.
precision_train, recall_train = (
    metric(dataFrameTrain_main_new, decisionTreeTrainResults_new)
    for metric in (precision_score, recall_score)
)
precision_test, recall_test = (
    metric(dataFrameTest_main_new, decisionTreeTestResults_new)
    for metric in (precision_score, recall_score)
)

print('for train : precision = {} \n recall = {}'.format(precision_train, recall_train))
print('for test : precision = {} \n recall = {}'.format(precision_test, recall_test))