In [97]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random

In [98]:
# Load the catalogue, expose the target column under the name "label", and
# discard the identifier / prediction / redshift columns so they are not
# offered to the tree as features.
# NOTE(review): the absolute local path only resolves on the author's machine.
df = (
    pd.read_csv('C:/Users/prash/Desktop/cat1.csv')
    .rename(columns={"class": "label"})
    .drop(columns=["Unnamed: 0", "galex_objid", "sdss_objid", "pred", "spectrometric_redshift"])
)
df.columns

Index(['u', 'g', 'r', 'i', 'z', 'extinction_u', 'extinction_g', 'extinction_r',
       'extinction_i', 'extinction_z', 'nuv_mag', 'fuv_mag', 'nuv-u', 'nuv-g',
       'nuv-r', 'nuv-i', 'nuv-z', 'u-g', 'u-r', 'u-i', 'u-z', 'g-r', 'g-i',
       'g-z', 'r-i', 'r-z', 'i-z', 'fuv-nuv', 'fuv-u', 'fuv-g', 'fuv-r',
       'fuv-i', 'fuv-z', 'label'],
      dtype='object')

In [99]:
def train_test_split(df, test_size):
    """Randomly partition a DataFrame into training and test subsets.

    Rows are chosen by position rather than by index label, so the split
    contains exactly `test_size` test rows even when `df.index` holds
    duplicate labels. For a unique index, the rows selected under a given
    `random` seed are the same ones that sampling the labels would select.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is not modified.
    test_size : int or float
        An int is an absolute number of test rows. A float is treated as a
        proportion of `len(df)` and rounded to a whole number of rows.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        `test_df` holds the sampled rows in sampling order; `train_df` holds
        the remaining rows in their original order.

    Raises
    ------
    ValueError
        Propagated from `random.sample` when the requested size is negative
        or larger than the number of rows.
    """
    n_rows = len(df)
    if isinstance(test_size, float):
        test_size = round(test_size * n_rows)

    # Draws from the module-level `random` state: call random.seed(...) first
    # for a reproducible split.
    test_positions = random.sample(population=range(n_rows), k=test_size)

    # Boolean mask of the held-out rows; its complement selects the training
    # rows without relying on index labels being unique.
    is_test_row = np.zeros(n_rows, dtype=bool)
    is_test_row[np.asarray(test_positions, dtype=int)] = True

    test_df = df.iloc[test_positions]
    train_df = df.iloc[~is_test_row]

    return train_df, test_df

# Seed Python's `random` module first: train_test_split samples from that
# global state, so the seed fixes which rows end up in the test set.
random.seed(0)
# An int test_size is an absolute row count, so 20 rows are held out.
train_df, test_df = train_test_split(df, test_size=20)

In [100]:
def check_purity(data):
    """Report whether every row of `data` carries the same label.

    `data` is a 2-D array whose last column holds the labels. Returns True
    when exactly one distinct label is present, otherwise False.
    """
    distinct_labels = np.unique(data[:, -1])
    return len(distinct_labels) == 1

In [101]:
def classify_data(data):
    """Return the most frequent label in `data`.

    `data` is a 2-D array whose last column holds the labels. When several
    labels share the highest count, the first one in np.unique's sorted
    order is returned, because argmax reports the first maximum.
    """
    labels, label_counts = np.unique(data[:, -1], return_counts=True)
    return labels[np.argmax(label_counts)]

In [102]:
def get_potential_splits(data):
    """Collect candidate thresholds for every feature column of `data`.

    `data` is a 2-D array whose last column holds the labels and is skipped.
    For each remaining column, the candidates are the midpoints between
    consecutive distinct values of that column.

    Returns a dict mapping column index -> list of midpoints; a column with
    fewer than two distinct values maps to an empty list.
    """
    _, n_columns = data.shape
    splits_by_column = {}

    for feature in range(n_columns - 1):  # the last column is the label
        levels = np.unique(data[:, feature])
        # Pair each distinct value with its successor and take the midpoint.
        splits_by_column[feature] = [
            (upper + lower) / 2
            for lower, upper in zip(levels[:-1], levels[1:])
        ]

    return splits_by_column

In [103]:
def split_data(data, split_column, split_value):
    """Partition the rows of `data` by a threshold on one column.

    Returns (data_below, data_above): the rows whose value in `split_column`
    is <= `split_value`, and the rows whose value is > `split_value`.
    Both comparisons are evaluated separately, so a row whose value compares
    False both ways (NaN) appears in neither part.
    """
    column = data[:, split_column]
    at_or_below = column <= split_value
    strictly_above = column > split_value
    return data[at_or_below], data[strictly_above]

In [104]:
def calculate_entropy(data):
    """Compute the base-2 entropy of the labels in `data`.

    `data` is a 2-D array whose last column holds the labels. The result is
    the sum over distinct labels of p * -log2(p), where p is the label's
    relative frequency.
    """
    _, class_counts = np.unique(data[:, -1], return_counts=True)
    class_probs = class_counts / class_counts.sum()

    surprisal = -np.log2(class_probs)
    # Built-in sum adds the terms one by one, left to right.
    return sum(class_probs * surprisal)

In [105]:
def calculate_overall_entropy(data_below, data_above):
    """Compute the size-weighted entropy of a two-way split.

    Each partition's entropy (from calculate_entropy) is weighted by that
    partition's share of the total row count, and the two terms are added.
    """
    n_below, n_above = len(data_below), len(data_above)
    total = n_below + n_above

    weight_below = n_below / total
    weight_above = n_above / total

    weighted_below = weight_below * calculate_entropy(data_below)
    weighted_above = weight_above * calculate_entropy(data_above)
    return weighted_below + weighted_above

In [106]:
def determine_best_split(data, potential_splits):
    """Pick the (column, threshold) pair with the lowest weighted entropy.

    Parameters
    ----------
    data : 2-D array
        Rows to split; the last column holds the labels.
    potential_splits : dict
        Maps column index -> list of candidate thresholds, as produced by
        get_potential_splits.

    Returns
    -------
    (best_split_column, best_split_value)
        The candidate whose split yields the lowest overall entropy. When
        several candidates tie, the one examined last is returned.

    Raises
    ------
    ValueError
        If `potential_splits` offers no usable candidate (for example every
        feature column is constant). Previously this case surfaced as an
        UnboundLocalError on the return statement.
    """
    lowest_entropy = float("inf")
    best_split_column = None
    best_split_value = None

    for column_index, candidate_values in potential_splits.items():
        for value in candidate_values:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            current_overall_entropy = calculate_overall_entropy(data_below, data_above)

            # `<=` rather than `<`: on equal entropy the later candidate wins.
            if current_overall_entropy <= lowest_entropy:
                lowest_entropy = current_overall_entropy
                best_split_column = column_index
                best_split_value = value

    if best_split_column is None:
        raise ValueError("no candidate split available: potential_splits contains no usable thresholds")

    return best_split_column, best_split_value

In [107]:
def decision_tree_algorithm(df, counter=0, min_samples=2, max_depth=5):
    """Recursively build a decision tree.

    Parameters
    ----------
    df : pandas.DataFrame or 2-D array
        On the initial call (counter == 0) a DataFrame whose last column is
        the label; its column names are stored in the module-level
        COLUMN_HEADERS. Recursive calls pass the raw array instead.
    counter : int
        Current depth; callers normally leave this at 0.
    min_samples : int
        A node with fewer rows than this becomes a leaf.
    max_depth : int
        A node reached when `counter == max_depth` becomes a leaf.

    Returns
    -------
    A leaf label, or a dict of the form
    {"<feature> <= <value>": [yes_subtree, no_subtree]}.
    """
    # data preparations
    if counter == 0:
        global COLUMN_HEADERS
        COLUMN_HEADERS = df.columns
        data = df.values
    else:
        data = df

    # base cases
    if (check_purity(data)) or (len(data) < min_samples) or (counter == max_depth):
        return classify_data(data)

    potential_splits = get_potential_splits(data)

    # Impure data can still be unsplittable: if every feature column is
    # constant there are no candidate thresholds, so fall back to a leaf
    # instead of failing inside determine_best_split.
    if not any(potential_splits.values()):
        return classify_data(data)

    # recursive part
    counter += 1

    split_column, split_value = determine_best_split(data, potential_splits)
    data_below, data_above = split_data(data, split_column, split_value)

    feature_name = COLUMN_HEADERS[split_column]
    question = "{} <= {}".format(feature_name, split_value)
    sub_tree = {question: []}

    # find answers (recursion)
    yes_answer = decision_tree_algorithm(data_below, counter, min_samples, max_depth)
    no_answer = decision_tree_algorithm(data_above, counter, min_samples, max_depth)

    # If the answers are the same, then there is no point in asking the question.
    # This could happen when the data is classified even though it is not pure
    # yet (min_samples or max_depth base case).
    if yes_answer == no_answer:
        sub_tree = yes_answer
    else:
        sub_tree[question].append(yes_answer)
        sub_tree[question].append(no_answer)

    return sub_tree

In [108]:
# Grow the tree on the training frame; max_depth=3 stops the recursion once
# three nested questions have been asked.
tree = decision_tree_algorithm(train_df, max_depth=3)
print(tree)

{'g-z <= -0.049106121': [{'fuv_mag <= 20.9985981': [0.0, {'u-z <= 0.6195421219999999': [1.0, 0.0]}]}, {'u-z <= 1.626973152': [1.0, {'fuv-g <= -3.8145627975': [0.0, 1.0]}]}]}


In [110]:
def classify_example(example, tree):
    """Classify one example by walking the decision tree.

    Parameters
    ----------
    example : mapping (e.g. a pandas Series or dict)
        Supports `example[feature_name]` lookup for every feature the tree
        asks about.
    tree : dict or leaf label
        Either a nested dict of the form
        {"<feature> <= <value>": [yes_subtree, no_subtree]} or a bare leaf
        label. The tree builder returns a bare label when no question was
        worth asking; that label is returned as-is.

    Returns
    -------
    The label stored at the leaf the example reaches.
    """
    node = tree
    while isinstance(node, dict):
        question = next(iter(node))
        # Split from the right so feature names containing spaces stay whole;
        # the operator and the numeric value never contain spaces.
        feature_name, _comparison_operator, value = question.rsplit(" ", 2)

        branches = node[question]
        if example[feature_name] <= float(value):
            node = branches[0]
        else:
            node = branches[1]

    return node

In [111]:
# Take the first held-out row, show it, and classify it with the trained tree.
example = test_df.iloc[0]
print(example)
classify_example(example, tree)

u               20.005760
g               19.861366
r               19.535606
i               19.525618
z               19.561289
extinction_u     0.071131
extinction_g     0.055425
extinction_r     0.038343
extinction_i     0.028493
extinction_z     0.021193
nuv_mag         20.169531
fuv_mag         21.320335
nuv-u            0.163771
nuv-g            0.308165
nuv-r            0.633924
nuv-i            0.643913
nuv-z            0.608242
u-g              0.144394
u-r              0.470154
u-i              0.480143
u-z              0.444471
g-r              0.325760
g-i              0.335749
g-z              0.300077
r-i              0.009989
r-z             -0.025682
i-z             -0.035671
fuv-nuv         -1.150805
fuv-u           -1.314575
fuv-g           -1.458969
fuv-r           -1.784729
fuv-i           -1.794718
fuv-z           -1.759047
label            1.000000
Name: 394, dtype: float64


1.0

In [112]:
def calculate_accuracy(df, tree):
    """Return the fraction of rows in `df` that `tree` classifies correctly.

    Each row is classified with classify_example and compared against the
    "label" column. Side effect: the columns "classification" and
    "classification_correct" are written onto `df` itself.
    """
    predictions = df.apply(classify_example, axis=1, args=(tree,))
    df["classification"] = predictions
    df["classification_correct"] = df["classification"] == df["label"]

    return df["classification_correct"].mean()

In [113]:
# Share of held-out rows the tree labels correctly. Note that this call also
# adds the "classification" and "classification_correct" columns to test_df.
accuracy = calculate_accuracy(test_df, tree)
accuracy

1.0