**Name:** Akash Verma

**Roll no.:** 2018IMT-012

**Course:** DM Lab

**Program:** 3

In [None]:
import random as rnd

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Module-level placeholder. None of the functions visible in this notebook read or
# update it (getInformationGain computes its own local ``target_entropy``).
targetEntropy = 0

In [None]:
class treeNode():
    """One node of the binary decision tree.

    An internal node describes a split (column position, column name and
    threshold value) and links to two children. A leaf node only carries
    ``results``; ``classify`` and ``printTree`` treat any node whose
    ``results`` is not ``None`` as a leaf.
    """

    def __init__(self, col=-1, colName='', value=None, results=None, rb=None, lb=None):
        # Split description: position of the column, its name, and the threshold.
        self.col, self.colName, self.value = col, colName, value
        # Payload stored on the node (the predicted class for a leaf).
        self.results = results
        # Children: rb is the right branch, lb the left branch.
        self.rb, self.lb = rb, lb

In [None]:
def partitionData(dataFrame, test_percentage):
    """Randomly split ``dataFrame`` into a training frame and a test frame.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to split. Its index may be arbitrary (gaps, non-integer labels);
        rows are selected by position.
    test_percentage : float
        Fraction of the rows to hold out; the test set receives
        ``int(test_percentage * len(dataFrame))`` rows.

    Returns
    -------
    tuple
        ``(train_df, test_df)``. Training rows keep their original order;
        test rows appear in the order they were sampled.

    Raises
    ------
    ValueError
        Propagated from ``random.sample`` when the requested sample size is
        negative or larger than the number of rows.
    """
    row_count = len(dataFrame)
    test_positions = rnd.sample(range(row_count), int(test_percentage * row_count))
    held_out = set(test_positions)
    train_positions = [pos for pos in range(row_count) if pos not in held_out]

    # Positions, not labels: the sampled numbers come from range(len(df)), so
    # they must be resolved with .iloc. Label-based .loc breaks as soon as the
    # index is not exactly 0..n-1 (for example after dropna()).
    test_df = dataFrame.iloc[test_positions]
    train_df = dataFrame.iloc[train_positions]

    return train_df, test_df

In [None]:
def getUniqueClasses(dataFrameColumn):
    """Count the occurrences of every distinct value in ``dataFrameColumn``.

    Returns a dict mapping each value to the number of times it was seen;
    keys appear in order of first occurrence.
    """
    tally = {}
    for label in dataFrameColumn:
        # Missing labels start from zero, then every sighting adds one.
        tally[label] = tally.get(label, 0) + 1
    return tally

In [None]:
def getEntropy(data, column):
    """Return the base-2 Shannon entropy of the values in ``data[column]``.

    Each distinct value contributes ``-p * log2(p)``, where ``p`` is its
    relative frequency in the column. An empty column yields ``0.0``.
    """
    class_counts = getUniqueClasses(data[column])  # value -> number of occurrences
    row_total = len(data[column])

    entropy = 0.0
    for occurrences in class_counts.values():
        share = float(occurrences) / row_total  # relative frequency of this class
        entropy -= share * np.log2(share)

    return entropy

In [None]:
def findSplitPoints(data, column):
    """Find candidate thresholds for ``column``.

    The rows are ordered by ``column``; wherever the 'Category' value of a row
    differs from that of the row before it, the earlier row becomes a split
    point.

    Returns a list of ``[index_label, column_value]`` pairs describing those
    earlier rows. Raises IndexError when ``data`` has no rows.
    """
    ordered = data.sort_values([column], ascending=True)
    pairs = ordered[[column, 'Category']].to_numpy()  # rows of (feature value, target)
    row_labels = ordered.index.values  # index labels in sorted order

    boundaries = []
    last_target = pairs[0][1]  # target of the first row after sorting
    for pos, (_, target) in enumerate(pairs):
        if target != last_target:
            # The class changed between row pos-1 and row pos: split after pos-1.
            boundaries.append([row_labels[pos - 1], pairs[pos - 1][0]])
        last_target = target

    return boundaries

In [None]:
def splitSets(data, column, splitPoints):
    """Partition ``data`` once per split point.

    Each split point is a pair whose first element is an index label of
    ``data``; the value of ``column`` at that label is the threshold.

    Returns ``(sets_below, sets_above)``: two lists, parallel to
    ``splitPoints``, holding the rows with ``column <= threshold`` and the rows
    with ``column > threshold`` respectively.
    """
    sets_below, sets_above = [], []
    for point in splitPoints:
        threshold = data[column][point[0]]  # feature value at the split row
        at_or_under = data[column] <= threshold
        strictly_over = data[column] > threshold
        sets_below.append(data[at_or_under])
        sets_above.append(data[strictly_over])

    return sets_below, sets_above

In [None]:
def getInformationGain(data, column):
    """Evaluate every candidate threshold of ``column`` and return the best one.

    For each split point the information gain is the entropy of 'Category' on
    the whole of ``data`` minus the size-weighted entropies of the two subsets
    the split produces.

    Returns
    -------
    tuple
        ``(best_gain, set_below, set_above, split_point)`` for the split with
        the highest gain. If no split has a gain above zero, the first split is
        returned with ``best_gain == 0``.

    Raises
    ------
    IndexError
        When ``column`` yields no split points at all. ``train`` catches this
        to recognise data that should become a leaf.
    """
    splitpoints = findSplitPoints(data, column)
    sets_below, sets_above = splitSets(data, column, splitpoints)
    target_entropy = getEntropy(data, 'Category')  # entropy before splitting

    # Information gain of every threshold, in split point order.
    infoGains = []
    for lower, upper in zip(sets_below, sets_above):
        n_lower, n_upper = len(lower), len(upper)
        n_total = float(n_lower + n_upper)
        weighted_entropy = (
            (getEntropy(lower, 'Category') * (n_lower / n_total))
            + (getEntropy(upper, 'Category') * (n_upper / n_total))
        )
        infoGains.append(target_entropy - weighted_entropy)

    # Keep the first threshold whose gain is strictly the largest seen so far.
    best_gain = best_pos = 0
    for pos, gain in enumerate(infoGains):
        if gain > best_gain:
            best_gain, best_pos = gain, pos

    # With no split points this indexing raises the IndexError described above.
    return best_gain, sets_below[best_pos], sets_above[best_pos], splitpoints[best_pos]

In [None]:
def train(data):
    """Recursively build a binary decision tree from ``data``.

    Every column except 'Category' is scored with ``getInformationGain``; the
    column with the highest gain (the first one on ties) becomes the split of
    the current node, and the two resulting subsets are trained recursively.
    Recursion stops, and a leaf is produced, when the best candidate cannot
    separate the data into two non-empty subsets.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows; must contain a 'Category' column holding the target.

    Returns
    -------
    treeNode
        Root of the (sub)tree. Leaves store the most frequent 'Category' value
        of the rows that reached them in ``results``.

    Raises
    ------
    ValueError
        If ``data`` has no rows, since no leaf label can be derived.
    """
    if len(data) == 0:
        raise ValueError("train() needs at least one row of data")

    candidates = []
    # position is the column's place in the frame, stored as treeNode.col.
    for position, column in enumerate(data):
        if column == 'Category':
            continue
        try:
            ig, below, above, split = getInformationGain(data, column)
            candidates.append({"ig": ig, "left": below, "right": above,
                               'col': position, 'split': split, 'colName': column})
        except IndexError:
            # getInformationGain raises IndexError when the column offers no
            # split point. Record a zero-gain candidate with empty branches
            # (same keys as a real one) so the node can fall through to a leaf.
            candidates.append({"ig": 0, "left": [], "right": [],
                               'col': position, 'split': None, 'colName': column})

    # Pick the candidate with the strictly highest gain; earlier columns win ties.
    best = None
    optimal_gain = -1
    for candidate in candidates:
        if candidate['ig'] > optimal_gain:
            best = candidate
            optimal_gain = candidate['ig']

    # Internal node: both sides of the best split contain data.
    if best is not None and len(best['left']) != 0 and len(best['right']) != 0:
        return treeNode(col=best['col'], colName=best['colName'], value=best['split'][1],
                        results=None, rb=train(best['right']), lb=train(best['left']))

    # Leaf: label with the majority class. max() returns the first key among
    # equals, so pure data and ties resolve to the first class encountered.
    class_counts = getUniqueClasses(data['Category'])
    majority = max(class_counts, key=class_counts.get)
    return treeNode(results=majority)

In [None]:
def classify(target_row, tree):
    """Walk ``tree`` from the root and return the class predicted for ``target_row``.

    Parameters
    ----------
    target_row : pandas.Series or sequence
        Feature values of one sample. A Series is read by column name
        (``tree.colName``) when that name is in its index, otherwise by
        position; any other sequence is indexed with ``tree.col``.
    tree : treeNode
        Root of the tree. A node whose ``results`` is not ``None`` is a leaf.

    Returns
    -------
    The ``results`` value of the leaf that the row reaches.

    Raises
    ------
    TypeError
        If the value read from the row is not numeric and therefore cannot be
        compared with the node's threshold.
    """
    node = tree
    while node.results is None:
        # Fetch the attribute this node splits on.
        if isinstance(target_row, pd.Series):
            if node.colName in target_row.index:
                val = target_row[node.colName]
            else:
                val = target_row.iloc[node.col]
        else:
            val = target_row[node.col]

        # numpy integers are not instances of int, so they are listed explicitly.
        if not isinstance(val, (int, float, np.integer, np.floating)):
            raise TypeError(
                "cannot classify on non-numeric value %r for column %r" % (val, node.colName)
            )

        # Mirror the training split: rows with value <= threshold were sent to
        # the left subset and value > threshold to the right one.
        node = node.rb if val > node.value else node.lb

    return node.results


In [None]:
def printTree(tree, space=''):
    """Print ``tree`` to stdout, one node per line.

    A leaf prints its ``results``. An internal node prints
    ``<colName> : <value>`` followed by its left child prefixed with 'L ' and
    its right child prefixed with 'R '. ``space`` is the indentation placed
    before those prefixes and grows by one blank per level.
    """
    # Leaf node: nothing below it to descend into.
    if tree.results != None:
        print(str(tree.results))
        return

    # Name and split point of the current node.
    print(' : '.join((str(tree.colName), str(tree.value))))
    for tag, child in (('L ', tree.lb), ('R ', tree.rb)):
        print(space + tag, end="")  # branch marker, child follows on the same line
        printTree(child, space + ' ')

In [None]:
# Predictions of the most recent test_tree() call as [row label, predicted class]
# pairs. Kept at module level because later cells read it.
values = []
def test_tree(data, labels, tree):
    """Classify every row of ``data`` with ``tree`` and score the predictions.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to classify.
    labels : pandas.Series
        True classes, in the same row order as ``data``.
    tree : treeNode
        Trained decision tree.

    Returns
    -------
    tuple
        ``(incorrect, correct, accuracy)``, where ``accuracy`` is the
        percentage of correct predictions rounded to a whole number.

    Side effects
    ------------
    Replaces the contents of the module-level ``values`` list with the
    predictions of this call.
    """
    # Reset the shared list: without this, a second call would append to the
    # previous run's predictions and compare them against the wrong labels.
    values.clear()

    # Classify each row, remembering which row label the prediction belongs to.
    for index, row in data.iterrows():
        values.append([index, classify(row, tree)])

    indexes = labels.index.values
    correct = incorrect = 0
    # A prediction counts as correct only if it is aligned with the label at
    # the same position and matches that label's class.
    for position, (row_label, predicted) in enumerate(values):
        if row_label == indexes[position] and predicted == labels.iloc[position]:
            correct += 1
        else:
            incorrect += 1

    return incorrect, correct, np.round(100 - (incorrect / (incorrect + correct)) * 100)

In [None]:
# Load the raw data; the file name is resolved relative to the current working directory.
dataSet = pd.read_csv('dmml dataset.csv')

In [None]:
dataSet.isnull().sum()

Unnamed: 0     0
Category       0
Age            0
Sex            0
ALB            1
ALP           18
ALT            1
AST            0
BIL            0
CHE            0
CHOL          10
CREA           0
GGT            0
PROT           1
dtype: int64

In [None]:
# Remove the leftover CSV row-number column. errors='ignore' keeps the cell
# re-runnable: a second execution no longer fails with KeyError.
dataSet = dataSet.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Discard every row that has at least one missing value.
dataSet = dataSet.dropna()

In [None]:
dataSet.head()

Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


In [None]:
# Numeric encoding of the Sex column: 'm' -> 0, 'f' -> 1.
df = dataSet['Sex'].map({'m': 0, 'f': 1})

In [None]:
df.count()

589

In [None]:
dataSet.isnull().sum()

Category    0
Age         0
Sex         0
ALB         0
ALP         0
ALT         0
AST         0
BIL         0
CHE         0
CHOL        0
CREA        0
GGT         0
PROT        0
dtype: int64

In [None]:
dataSet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 589 entries, 0 to 612
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Category  589 non-null    object 
 1   Age       589 non-null    int64  
 2   Sex       589 non-null    object 
 3   ALB       589 non-null    float64
 4   ALP       589 non-null    float64
 5   ALT       589 non-null    float64
 6   AST       589 non-null    float64
 7   BIL       589 non-null    float64
 8   CHE       589 non-null    float64
 9   CHOL      589 non-null    float64
 10  CREA      589 non-null    float64
 11  GGT       589 non-null    float64
 12  PROT      589 non-null    float64
dtypes: float64(10), int64(1), object(2)
memory usage: 64.4+ KB


In [None]:
dataSet.Category.unique()

array(['0=Blood Donor', '0s=suspect Blood Donor', '1=Hepatitis',
       '2=Fibrosis', '3=Cirrhosis'], dtype=object)

In [None]:
# Numeric encoding of the target column, one integer per diagnosis label.
df1 = dataSet['Category'].map({
    '0=Blood Donor': 1,
    '0s=suspect Blood Donor': 2,
    '1=Hepatitis': 3,
    '2=Fibrosis': 4,
    '3=Cirrhosis': 5,
})

In [None]:
df1.count()

589

In [None]:
dataSet.head()

Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


In [None]:
# Remove the Sex column from the frame.
dataSet = dataSet.drop(columns='Sex')

In [None]:
# Remove the Category column from the frame.
dataSet = dataSet.drop(columns='Category')

In [None]:
# Assemble the final frame: encoded Sex first, the remaining features, encoded
# Category last. Any Sex/Category column still present in dataSet is dropped
# first, so re-running this cell does not produce duplicated columns.
dataSet = pd.concat(
    [df, dataSet.drop(columns=['Sex', 'Category'], errors='ignore'), df1],
    axis=1,
)

In [None]:
dataSet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 589 entries, 0 to 612
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Sex       589 non-null    int64  
 1   Age       589 non-null    int64  
 2   ALB       589 non-null    float64
 3   ALP       589 non-null    float64
 4   ALT       589 non-null    float64
 5   AST       589 non-null    float64
 6   BIL       589 non-null    float64
 7   CHE       589 non-null    float64
 8   CHOL      589 non-null    float64
 9   CREA      589 non-null    float64
 10  GGT       589 non-null    float64
 11  PROT      589 non-null    float64
 12  Category  589 non-null    int64  
dtypes: float64(10), int64(3)
memory usage: 64.4 KB


In [None]:
dataSet.isnull().sum()

Sex         0
Age         0
ALB         0
ALP         0
ALT         0
AST         0
BIL         0
CHE         0
CHOL        0
CREA        0
GGT         0
PROT        0
Category    0
dtype: int64

In [None]:
# Train the hand-written decision tree on a 70/30 split and report its quality.
SEED = 42  # fixed seed so the split, and therefore the tree, is reproducible

results = []
train_data, test_data = train_test_split(dataSet, test_size=0.3, random_state=SEED)
tree = train(train_data)  # make tree
types = test_data['Category']  # true classes of the held-out rows

incorrect, correct, accuracy = test_tree(test_data, types, tree)  # test the tree
results.append(accuracy)
tests = len(results)  # number of train/test runs recorded in results

# Print information to console.
print("Tree Generated:" + "\n")
printTree(tree)
print()
print("Correctly Classified: " + str(correct) + " / " + str(correct+incorrect))
print("Accuracy: " + str(accuracy))
print()

# Mean accuracy over all recorded runs (computed without shadowing the builtin sum).
average = np.sum(results) / tests

print("Average Accuracy after " + str(tests) + " runs")
print(average)

y_test = types
print(y_test)

# ``values`` is the module-level list filled by test_tree(): one
# [row label, predicted class] pair per test row, in test_data order.
# A dedicated name is used so the encoded Sex series ``df`` is not overwritten.
pred_df = pd.DataFrame(values, columns=['index', 'pred_Target'])
y_pred = pred_df['pred_Target']
print(y_pred)

# Cross-check the hand-computed accuracy with scikit-learn's metrics.
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred)*100)
print(classification_report(y_test, y_pred))


Tree Generated:

AST : 52.6
L ALT : 9.5
 L ALP : 52.1
  L Age : 50
   L ALB : 21.6
    L 2
    R 3
   R 4
  R ALB : 35.0
   L 5
   R 1
 R ALP : 37.9
  L ALB : 40.5
   L 1
   R 3
  R 1
R CHE : 5.77
 L 5
 R ALP : 39.6
  L Age : 41
   L 3
   R ALB : 43.0
    L 3
    R 4
  R CHE : 7.7
   L Age : 51
    L 1
    R 3
   R Age : 54
    L 4
    R 2

Correctly Classified: 160 / 177
Accuracy: 90.0

Average Accuracy after 1 runs
90.0
303    1
45     1
137    1
199    1
202    1
      ..
376    1
182    1
309    1
349    1
183    1
Name: Category, Length: 177, dtype: int64
0      1
1      1
2      1
3      1
4      1
      ..
172    1
173    1
174    1
175    1
176    1
Name: pred_Target, Length: 177, dtype: int64
[[150   0   2   3   1]
 [  3   1   0   0   1]
 [  2   0   3   1   1]
 [  0   0   1   1   0]
 [  0   0   1   1   5]]
Accuracy: 90.3954802259887
              precision    recall  f1-score   support

           1       0.97      0.96      0.96       156
           2       1.00      0.20    