In [1]:
import numpy as np
import pandas as pd
import random

In [2]:
def bootstrapdf(df):
    df = df.sample(frac=1, replace=True)
    return df

In [3]:
def check_for_leaf(df,counter, min_samples, max_depth):
    unique_classes = np.unique(df)
    if len(unique_classes) == 1 or len(df)<=min_samples or counter==max_depth:
        labelcol = df
        uniq_cls, cnt = np.unique(labelcol, return_counts=True)
        classification = unique_classes[cnt.argmax()]
        return classification
    else:
        return False 

In [4]:
def gini_imp_test(df, col_index):
    df.reset_index(inplace = True, drop = True)
    classes = df.iloc[:,-1]
    feature = df.iloc[:,col_index]
    if len(feature.unique()) == 2:
        gini_imp = 0
        for i in np.unique(feature):
            idx = np.where(feature == i)
            label = classes.loc[idx].values
            a, b = np.unique(label, return_counts = True)
            list1 = [(i/sum(b))**2 for i in b]
            prob = 1 - sum(list1)
            wt = len(idx[0]) / df.shape[0]
            gini_imp += wt * prob
        return gini_imp, i
    else:
        label = np.sort(feature.unique())[1:-1]
        best_gini_imp = float('inf')
        split_val = 0
        for i in label:
            idx1 = np.where(feature > i)
            idx2 = np.where(feature <= i)
            if len(idx1[0]) > 2 and len(idx2[0]) > 2:
        
                b1, b1cnt = np.unique(classes.loc[idx1].values, return_counts = True)
                b2, b2cnt = np.unique(classes.loc[idx2].values, return_counts = True)
                list1 = [(i/sum(b1cnt))**2 for i in b1cnt]
                list2 = [(i/sum(b2cnt))**2 for i in b2cnt]
                prob1 = 1 - sum(list1)
                prob2 = 1 - sum(list2)
                gini = ((sum(b1cnt)/df.shape[0])*prob1) + ((sum(b2cnt)/df.shape[0])*prob2) 
                if gini < best_gini_imp:
                    best_gini_imp = gini
                    split_val = i
                else:
                    continue 
        return best_gini_imp, split_val

In [5]:
def best_node(df, col_list):
    best_gini_imp = float('inf')
    value = 0
    col = 0
    for i in col_list:
        gini, val = gini_imp_test(df, i) 
        if gini < best_gini_imp:
            best_gini_imp = gini
            value = val
            col = i
    return col, value

In [6]:
def split_df(df, col_index, split_val):
    feature = df.iloc[:,col_index]
    if feature.dtypes == object:
        temp1 = df[df.iloc[:,col_index] == split_val]
        temp2 = df[df.iloc[:,col_index] != split_val]
        return temp1, temp2
    elif feature.dtypes != object:
        temp1 = df[df.iloc[:,col_index] <= split_val]
        temp2 = df[df.iloc[:,col_index] >= split_val]
        temp1.reset_index(inplace = True, drop = True)
        temp2.reset_index(inplace = True, drop = True)
        return temp1, temp2

In [7]:
def check_purity(data):
    
    label_column = data[:, -1]
    unique_classes = np.unique(label_column)

    if len(unique_classes) == 1:
        return True
    else:
        return False

In [8]:
def classify_data(data):
    
    label_column = data[:,-1]
    unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)
    index = counts_unique_classes.argmax()
    classification = unique_classes[index]
    return classification
    

In [9]:
def metrics(ts_lb,answer):
    TN = 0
    TP = 0
    FN = 0
    FP = 0
    for i,j in zip(ts_lb,answer):
        if j==1 and i==1:
            TP += 1
        elif(j==1 and i==0):
            FN += 1
        elif(j==0 and i==1):
            FP += 1
        elif(j==0 and i==0):
            TN += 1
    Accuracy = (TP + TN)/(TP + FP + TN + FN)
    Precision = TP/(TP + FP)
    Recall = TP/(TP + FN)
    f1_score = (2*Precision*Recall)/(Precision + Recall)
    return Accuracy, Precision, Recall, f1_score

In [10]:
def decision_tree(df, columns, num_features, counter = 0, min_samples = 10, max_depth = 5):
    if (check_purity(df.values)) or (counter == max_depth) or (len(df) < min_samples):
        classification = classify_data(df.values)
        
        return classification
      
    else:
        counter += 1
        col_list = random.sample(columns, num_features)
        column, value = best_node(df, col_list)
        if df.iloc[:,column].dtype == object:
            columns.remove(column)
        branch1, branch2 = split_df(df, column, value)
        if len(branch1) == 0 or len(branch2) == 0:
            classification = classify_data(df.values)
            return classification
    
        query = "{} <= {}".format(column, value)
        branch = {query: []}

        left_branch = decision_tree(branch1, columns, num_features, counter)
        right_branch = decision_tree(branch2, columns, num_features, counter)

        if left_branch == right_branch:
            branch = left_branch
        else:
            branch[query].append(left_branch)
            branch[query].append(right_branch)
        return branch

In [11]:
def random_forest(df, num_trees, num_features):
    trees = []
    for i in range(num_trees):
        df = bootstrapdf(df)
        columns = list(df.iloc[:,:-1].columns)
        tree = decision_tree(df, columns, num_features)
        trees.append(tree)
    return trees

In [12]:
def predict(model, test_data):
    classes = []
    for tree in model:
        cls = []
        for i in range(len(test_data)):
            t = tree
            col,_,val = list(t.keys())[0].split()
            col = int(col)
            try:
                val = float(val)
            except:
                val = str(val)
            key = list(t.keys())[0]
            key_val = t[key]
            while True: 
                if test_data.iloc[i,col] <= val:
                    t = t[key][0]
                    if type(t) != dict:
                        cls.append(t)
                        break
                    else:
                        col,_,val = list(t.keys())[0].split()
                        col = int(col)
                        try:
                            val = float(val)
                        except:
                            val = str(val)
                        key = list(t.keys())[0]
                        key_val = t[key]
                else:
                    t = t[key][1]
                    if type(t) != dict:
                        cls.append(t)
                        break
                    else:
                        col,_,val = list(t.keys())[0].split()
                        col = int(col)
                        try:
                            val = float(val)
                        except:
                            val = str(val)
                        key = list(t.keys())[0]
                        key_val = t[key]
        cls = [int(i) for i in cls]
        classes.append(cls)
    classes = np.array(classes)
    final_class = []
    for i in range(len(test_data)):
        unique_classes, counts_unique_classes = np.unique(classes[:,i], return_counts=True)
        index = counts_unique_classes.argmax()
        classification = unique_classes[index]
        final_class.append(classification)
    final_class
    test_data["Class"] = final_class
    return test_data

In [13]:
def k_fold(df):
    num_trees = int(input("Enter number of trees: "))
    num_features = int(input("Enter number of features for each split: "))
    k = int(input("Enter k value: "))
    metrics_list = []
    for i in range(k):
        splitdfs = np.array_split(df, k)
        test = splitdfs[i]
        del(splitdfs[i])
        train = pd.concat(splitdfs)
        test.reset_index(inplace = True, drop = True)
        train.reset_index(inplace = True, drop = True) 
        actual = test.iloc[:,-1]
        test = test.iloc[:,:-1]
        model = random_forest(train, num_trees, num_features)
        results = predict(model, test)
        Accuracy, Precision, Recall, f1_score = metrics(actual, results["Class"])
        metrics_list.append([Accuracy, Precision, Recall, f1_score])
    metrics_list = np.array(metrics_list)
    metrics_list = np.mean(metrics_list, axis = 0)
    print("Accuracy: ",metrics_list[0])
    print("Precision: ",metrics_list[1])
    print("Recall: ",metrics_list[2])
    print("f1_score: ",metrics_list[3])
    return metrics_list
    
    

In [14]:
df1 = pd.read_csv("project3_dataset1.txt", sep = '\t', header=None)
k_fold(df1)

Enter number of trees: 5
Enter number of features for each split: 3
Enter k value: 10
Accuracy:  0.9348997493734335
Precision:  0.8793594721240956
Recall:  0.9506969696969698
f1_score:  0.9120824299912176


array([0.93489975, 0.87935947, 0.95069697, 0.91208243])

In [15]:
df2 = pd.read_csv("project3_dataset2.txt", sep = '\t', header=None)
k_fold(df2)

Enter number of trees: 5
Enter number of features for each split: 3
Enter k value: 10
Accuracy:  0.6406567992599445
Precision:  0.36184248500037974
Recall:  0.4885966810966812
f1_score:  0.39605076241748566


array([0.6406568 , 0.36184249, 0.48859668, 0.39605076])