In [1]:
from pandas import read_csv, cut
from numpy import array, unique, log2, inf, append, where, square, random, concatenate
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import time
from pprint import pprint

In [2]:
file_path = '../cat4.csv'
df = read_csv(file_path)
df.head()
y = array(df['class'])
# df = df.rename(columns={"class": "label"})
if 'cat2.csv' in file_path:
    df.drop("Unnamed: 0.1", axis=1, inplace=True)
df.drop(["Unnamed: 0", "galex_objid", "sdss_objid", "spectrometric_redshift", "pred"], axis=1, inplace=True)

In [3]:
df['class'].value_counts()

1    23389
0    10074
Name: class, dtype: int64

In [4]:
def under_sampling(df):
    min_class_len = len(df[df['class'] == 0])
    major_class_ind = df[df['class'] == 1].index
    rand_maj_ind = random.choice(major_class_ind, min_class_len, replace = False)

    min_class_ind = df[df['class'] == 0].index
    under_sample_ind = concatenate([min_class_ind,rand_maj_ind])
    under_sample = df.loc[under_sample_ind]
    
    return under_sample

under_sample = under_sampling(df)

In [5]:
def bucketize(dataframe, col_headers, bucket_size):
    assert len(col_headers) == len(bucket_size)
    no_of_columns = len(col_headers)
    for col in range(no_of_columns):
        labels = array([(x + 1) for x in range(bucket_size[col])])
        temp = cut(dataframe[col_headers[col]], bucket_size[col], labels=labels)
        dataframe.drop(col_headers[col], inplace=True, axis=1)
        dataframe[col_headers[col]] = temp
    return dataframe

# y = array(df['class'])
# df.drop('class', inplace=True, axis=1)
# temp = bucketize(df, df.columns, [7 for x in range(len(df.columns))])

y = array(under_sample['class'])
under_sample.drop('class', inplace=True, axis=1)
temp = bucketize(under_sample, under_sample.columns, [7 for x in range(len(under_sample.columns))])

In [6]:
# X = array(df)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X = array(under_sample)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
class DecisionTree:
    
    def __init__(self, max_depth=5, min_samples=2):
        self.counter = 0
        self.max_depth = max_depth
        self.min_samples = min_samples
        
    def check_purity(self, y):
        if len(unique(y)) == 1:
            return True
        else:
            return False
        
    def classify_data(self, y):
        unique_classes, counts_unique_classes = unique(y, return_counts=True)

        index = where(counts_unique_classes == max(counts_unique_classes))[0][0]
        classification = unique_classes[index]

        return classification
    
    def get_potential_splits(self, X):
    
        potential_splits = {}
        n_columns = len(X[0])
        for column_index in range(n_columns):  
            potential_splits[column_index] = []
            values = X[:, column_index]
            unique_values = unique(values)

            for index in range(1, len(unique_values)):
                current_value = unique_values[index]
                previous_value = unique_values[index - 1]
                potential_split = (current_value + previous_value) / 2

                potential_splits[column_index].append(potential_split)

        return potential_splits
    
    def split_data(self, X, y, split_column, split_value):
    
        no_of_columns = len(X[0]) + 1
        split_column_values = X[:, split_column]
        data = append(X, y.reshape(len(X), 1), axis=1)
        data_below = data[data[:, split_column] < split_value]
        data_above = data[data[:, split_column] >= split_value]

        return data_below, data_above
    
    def calculate_entropy(self, label_column):

        _, counts = unique(label_column, return_counts=True)
        probabilities = counts / counts.sum()
        return 1 - sum(square(probabilities))
    
    def calculate_overall_entropy(self, data_below, data_above):
    
        n = len(data_below) + len(data_above)
        p_data_below = len(data_below) / n
        p_data_above = len(data_above) / n

        overall_entropy =  (p_data_below * self.calculate_entropy(data_below[:, -1]) 
                          + p_data_above * self.calculate_entropy(data_above[:, -1]))

        return overall_entropy
    
    def determine_best_split(self, X, y, potential_splits):    
        overall_entropy = inf
        for column_index in potential_splits:
            for value in potential_splits[column_index]:
                data_below, data_above = self.split_data(X, y, split_column=column_index, split_value=value)
                current_overall_entropy = self.calculate_overall_entropy(data_below, data_above)

                if current_overall_entropy <= overall_entropy:
                    overall_entropy = current_overall_entropy
                    best_split_column = column_index
                    best_split_value = value

        return best_split_column, best_split_value
    
    def build_tree(self, X, y):
        if (self.check_purity(y)) or (len(X) < self.min_samples) or (self.counter == self.max_depth):
            classification = self.classify_data(y)

            return classification


        # recursive part
        else:    
            self.counter += 1

            # helper functions 
            potential_splits = self.get_potential_splits(X)
            split_column, split_value = self.determine_best_split(X, y, potential_splits)
            data_below, data_above = self.split_data(X, y, split_column, split_value)

            question = "column_{} <= {}".format(split_column, split_value)
            sub_tree = {question: []}

            # find answers (recursion)
            yes_answer = self.build_tree(data_below[:, :-1], data_below[:, -1])
            no_answer = self.build_tree(data_above[:, :-1], data_above[:, -1])

            # If the answers are the same, then there is no point in asking the qestion.
            # This could happen when the data is classified even though it is not pure
            # yet (min_samples or max_depth base case).
            if yes_answer == no_answer:
                sub_tree = yes_answer
            else:
                sub_tree[question].append(yes_answer)
                sub_tree[question].append(no_answer)

            return sub_tree
    
    def fit(self, X, y):
        
        start_time = time.time()
        tree = self.build_tree(X, y)
        end_time = time.time()
        print("Time taken to construct the decision tree =", end_time - start_time)
        return tree
    
    def classify_example(self, example, tree):
        question = list(tree.keys())[0]
        feature_name, comparison_operator, value = question.split(" ")
        x = int(feature_name.split("_")[1])

        # ask question
        if example[x] <= float(value):
            answer = tree[question][0]
        else:
            answer = tree[question][1]

        # when at leaf node
        if type(answer) != dict:
            return answer

        # when at an internal node
        else:
            residual_tree = answer
            return self.classify_example(example, residual_tree)
    
    def predict(self, X_test, tree):
        predictions = array([])
        for example in X_test:
            predictions = append(predictions, self.classify_example(example, tree))

        return predictions

In [8]:
tree = DecisionTree(max_depth=5)

decision_tree = tree.fit(X_train, y_train)
predictions = tree.predict(X_test, decision_tree)

Time taken to construct the decision tree = 2.763608932495117


In [9]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

          0       0.69      0.79      0.74      3024
          1       0.76      0.64      0.69      3021

avg / total       0.72      0.72      0.72      6045

