In [1]:
from pandas import read_csv, cut, DataFrame, concat
from numpy import array, unique, log2, inf, append, where, square
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import random
from pprint import pprint
import time

In [2]:
file_path = '../catalog2/cat2.csv'
df = read_csv(file_path)
if 'cat2.csv' in file_path:
    df.drop("Unnamed: 0.1", axis=1, inplace=True)
df.drop(["Unnamed: 0", "galex_objid", "sdss_objid", "spectrometric_redshift", "pred"], axis=1, inplace=True)

In [None]:
def resample(df, n_samples, replace=True):
    temp = DataFrame()
    length_df = len(df)
    base_size = 10
    if replace == False:
        assert n_samples <= length_df
        return df.sample(n=n_samples)
    else:
        while n_samples > base_size:
            temp = temp.append(df.sample(n=base_size), ignore_index=True)
            n_samples -= 10
        temp = temp.append(df.sample(n=n_samples))
    return temp

df_0 = df[df['class'] == 0]
df_1 = df[df['class'] == 1]
new_df_1 = resample(df=df_1, n_samples=2500, replace=False)
df = shuffle(df_0.append(new_df_1))

In [3]:
y = array(df['class'])
df.drop('class', inplace=True, axis=1)

In [4]:
def bucketize(dataframe, col_headers, bucket_size):
    assert len(col_headers) == len(bucket_size)
    no_of_columns = len(col_headers)
    for col in range(no_of_columns):
        labels = array([(x + 1) for x in range(bucket_size[col])])
        temp = cut(dataframe[col_headers[col]], bucket_size[col], labels=labels)
        dataframe.drop(col_headers[col], inplace=True, axis=1)
        dataframe[col_headers[col]] = temp
    return dataframe

In [5]:
temp = bucketize(df, df.columns, [7 for x in range(len(df.columns))])
temp.head(10)

Unnamed: 0,u,g,r,i,z,extinction_u,extinction_g,extinction_r,extinction_i,extinction_z,...,g-z,r-i,r-z,i-z,fuv-nuv,fuv-u,fuv-g,fuv-r,fuv-i,fuv-z
0,3,4,5,5,5,1,1,1,1,1,...,3,2,3,5,3,3,5,5,5,5
1,4,5,5,5,4,1,1,1,1,1,...,3,3,4,5,3,4,5,5,4,4
2,3,4,5,5,4,1,1,1,1,1,...,3,3,4,6,2,3,4,4,4,4
3,6,6,5,5,3,1,1,1,1,1,...,6,5,5,7,4,5,5,4,3,2
4,5,5,4,4,3,1,1,1,1,1,...,5,4,4,6,4,5,4,3,3,3
5,3,3,3,2,2,1,1,1,1,1,...,4,3,4,6,3,3,4,3,3,3
6,5,6,6,4,3,1,1,1,1,1,...,6,6,6,7,3,4,5,4,2,1
7,6,6,7,7,5,1,1,1,1,1,...,3,3,4,6,4,5,5,5,5,4
8,6,6,7,7,5,1,1,1,1,1,...,3,3,4,6,4,5,5,5,5,4
9,3,4,5,5,4,1,1,1,1,1,...,3,3,4,5,3,4,5,5,5,4


In [6]:
X = array(df)
rows = append(X, y.reshape(len(X), 1), axis=1)
train, test = train_test_split(rows, test_size=0.2, random_state=42)

In [7]:
def check_purity(y):
    if len(unique(y)) == 1:
        return True
    else:
        return False

In [8]:
def classify_data(y):
    unique_classes, counts_unique_classes = unique(y, return_counts=True)

    index = where(counts_unique_classes == max(counts_unique_classes))[0][0]
    classification = unique_classes[index]
    
    return classification

In [None]:
def get_potential_splits(X, y):
    
    potential_splits = {}
    #n_columns = len(X[0])
    for column_index in range(len(X[0])):  
        potential_splits[column_index] = set()
        values = X[:, column_index]
        unique_values = unique(values)

        for index in range(1, len(unique_values)):
            current_value = unique_values[index]
            previous_value = unique_values[index - 1]
            potential_split = (current_value + previous_value) / 2

            potential_splits[column_index].add(potential_split)
    
    return potential_splits

In [29]:
def split_data(data, split_column, split_value):
    
    # calculate the number of columns in the data
    no_of_columns = len(data[0])
    split_column_values = data[:, split_column]
    
    data_below = data[data[:, split_column] < split_value]
    data_above = data[data[:, split_column] >= split_value]
    
#     data_below = data_above = array([]).reshape(0, no_of_columns)
#     for index in range(len(data)):
#         temp = data[index].reshape(1, no_of_columns)
#         if split_column_values[index] <= split_value:
#             data_below = append(data_below, temp, axis=0)
#         else:
#             data_above = append(data_above, temp, axis=0)
    
    return data_below, data_above

In [10]:
def calculate_gini(data):
    
    label_column = data[:, -1]
    _, counts = unique(label_column, return_counts=True)

    probabilities = counts / counts.sum()

    return 1 - sum(square(probabilities))

In [27]:
def calculate_overall_gini(data_below, data_above, current_uncertainty):
    
    p = float(len(data_below)) / (len(data_below) + len(data_above))
    return current_uncertainty - p * calculate_gini(data_below) - (1 - p) * calculate_gini(data_above)

In [None]:
def determine_best_split(X, y, potential_splits):    
    overall_entropy = inf
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below, data_above = split_data(X, y, split_column=column_index, split_value=value)
            current_overall_entropy = calculate_overall_entropy(data_below, data_above)

            if current_overall_entropy <= overall_entropy:
                overall_entropy = current_overall_entropy
                best_split_column = column_index
                best_split_value = value
        print("Scanning through column {} over".format(column_index))
    
    return best_split_column, best_split_value

In [21]:
def decision_tree_algorithm(data, max_depth, counter=0, min_samples=2):

    if (check_purity(data[:, -1])) or (len(data) < min_samples) or (counter == max_depth):
        classification = classify_data(data[:, -1])
        
        return classification

    
    # recursive part
    else:    
        # counter indicates the depth of the decision tree
        counter += 1
        print(counter)
        
        # calculate the number of independent columns in the dataset
        no_of_columns = len(data[0]) - 1
        
        # initializations required for looping
        split_col = split_val = None
        best_gain = 0
        current_uncertainty = calculate_gini(data)
        
        # iterate through all the columns in the data
        for col in range(no_of_columns):
            
            # find the unique values of a column
            values = unique(data[:, col])
            # iterate through all the unique values
            for val in values:
                
                # find the split of the data
                data_below, data_above = split_data(data, col, val)
                
                # calculate the gini index of the current split
                gain = calculate_overall_gini(data_below, data_above, current_uncertainty)
                
                # if gain > best_gain, then replace the best params
                if gain > best_gain:
                    best_gain = gain
                    split_col = col
                    split_val = val
                print(gain)
        
        # once best params are found, then perform the final split at this "counter"
        data_below, data_above = split_data(data, split_col, split_val)
        
        # formulate the question for the current level and store it in the tree
        question = "column_{} <= {}".format(split_col, split_val)
        sub_tree = {question: []}
        
        # go another level down and split the data_below and data_above
        yes_answer = decision_tree_algorithm(data_below, max_depth, counter, min_samples)
        no_answer = decision_tree_algorithm(data_above, max_depth, counter, min_samples)
        
        # If the answers are the same, then there is no point in asking the qestion.
        # This could happen when the data is classified even though it is not pure
        # yet (min_samples or max_depth base case).
        if yes_answer == no_answer:
            sub_tree = yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)
        
        return sub_tree

In [None]:
def find_best_split(rows):
    best_gain = 0  # keep track of the best information gain
    best_col = best_val = None  # keep train of the feature / value that produced it
    current_uncertainty = calculate_gini(rows)
    n_features = len(rows[0]) - 1  # number of columns

    for col in range(n_features):  # for each feature

        values = unique(label_column)  # unique values in the column

        for val in values:  # for each value

            # try splitting the dataset
            data_below, data_above = split_data(rows, col, val)

            # Skip this split if it doesn't divide the
            # dataset.
            if len(data_below) == 0 or len(data_above) == 0:
                continue

            # Calculate the information gain from this split
            gain = calculate_overall_gini(data_below, data_above, current_uncertainty)

            if gain >= best_gain:
                best_gain = gain
                best_col = col
                best_val = val

    return best_gain, best_question

In [None]:
def build_tree(rows):
    """Builds the tree.

    Rules of recursion: 1) Believe that it works. 2) Start by checking
    for the base case (no further information gain). 3) Prepare for
    giant stack traces.
    """

    # Try partitioing the dataset on each of the unique attribute,
    # calculate the information gain,
    # and return the question that produces the highest gain.
    split_col, split_val = find_best_split(rows)

#     # Base case: no further info gain
#     # Since we can ask no further questions,
#     # we'll return a leaf.
#     if gain == 0:
        
    data_below, data_above = split_data(rows, split_col, split_val)
    sub_tree = {question: []}
        
    # find answers (recursion)
    yes_answer = build_tree(data_below)
    no_answer = build_tree(data_above)

    # If the answers are the same, then there is no point in asking the qestion.
    # This could happen when the data is classified even though it is not pure
    # yet (min_samples or max_depth base case).
    if yes_answer == no_answer:
        sub_tree = yes_answer
    else:
        sub_tree[question].append(yes_answer)
        sub_tree[question].append(no_answer)

    return sub_tree

In [30]:
start_time = time.time()
tree = decision_tree_algorithm(rows, 3)
end_time = time.time()
pprint(tree)
print("Time taken to construct the decision tree =", end_time - start_time)

1
0.0
0.003026753517271691
0.00468388810466841
5.2810494277910625e-06
0.011063176736241152
0.0029364025540601166
0.002668286603210751
0.0
0.00277453300752023
0.00421130124741001
0.0038878344640284057
9.405205510626402e-06
0.00391703798387931
7.896366121789509e-05
0.0
0.0009821179111691447
0.0034086197688130704
0.0030712853467912415
4.418324870969492e-07
0.0021954541927015347
0.005835226746122786
0.0
0.0009821179111691447
0.00214065019516102
0.0026763271463139116
0.000541109339366963
0.004362657636639483
0.01088967628373989
0.0
0.0010254331516587056
0.0022693186011082667
0.0026518307611012037
0.007439675700163989
0.012065512233011404
0.0021781501543890604
0.0
1.1881461487096845e-05
0.0006540253782643113
0.000860962794682868
0.00043036329544912655
0.0
1.1881461487096845e-05
0.0006540253782643113
0.000860962794682868
0.00043036329544912655
0.0
1.1881461487096845e-05
0.0006540253782643113
0.000860962794682868
0.00043036329544912655
0.0
1.1881461487096845e-05
0.0006540253782643113
0.0008609

In [31]:
def classify_example(example, tree):
    question = list(tree.keys())[0]
    feature_name, comparison_operator, value = question.split(" ")
    x = int(feature_name.split("_")[1])
    
    # ask question
    if example[x] <= float(value):
        answer = tree[question][0]
    else:
        answer = tree[question][1]

    # base case
    if not isinstance(answer, dict):
        return int(answer)
    
    # recursive part
    else:
        residual_tree = answer
        return classify_example(example, residual_tree)

In [32]:
def predict(X_test, tree):
    predictions = array([])
    for example in X_test:
        predictions = append(predictions, classify_example(example, tree))
    
    return predictions

In [33]:
X_test, y_test = test[:, :-1], test[:, -1]
predictions = predict(X_test, tree)

from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

ValueError: Classification metrics can't handle a mix of unknown and binary targets

In [34]:
print(y_test, predictions)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 0
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1
 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 1 0 1 1 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 0 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 0 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 0 1 1 1 

In [None]:
len(y_test), len(predictions)

In [None]:
X

In [None]:
y

In [None]:
append(X, y.reshape(len(X), 1), axis=1)

In [None]:
len(X[0])

In [None]:
random_array = array([random.randrange(1, 7) for x in range(len(X[0]))])

In [None]:
append(X, random_array.reshape(1, 33), axis=0)

In [None]:
data_below = X[X[:, 3] > 3]

In [None]:
for i in range(10):
    print(data_below[i])