In [35]:
from pandas import read_csv
from numpy import array, unique, log2, inf, append
from sklearn.model_selection import train_test_split
import random
from pprint import pprint

In [36]:
file_path = '../catalog1/cat1.csv'
df = read_csv(file_path)
df.head()
y = array(df['class'])
# df = df.rename(columns={"class": "label"})
if 'cat2.csv' in file_path:
    df.drop("Unnamed: 0.1", axis=1, inplace=True)
df.drop(["Unnamed: 0", "galex_objid", "sdss_objid", "class", "spectrometric_redshift", "pred"], axis=1, inplace=True)

In [37]:
X = array(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [38]:
def check_purity(y):
    if len(unique(y)) == 1:
        return True
    else:
        return False

In [39]:
def classify_data(y):
    unique_classes, counts_unique_classes = unique(y, return_counts=True)

    index = counts_unique_classes.index(max(counts_unique_classes))
    classification = unique_classes[index]
    
    return classification

In [40]:
def get_potential_splits(X, y):
    
    potential_splits = {}
    #n_columns = len(X[0])
    for column_index in range(len(X[0])):  
        potential_splits[column_index] = []
        values = X[:, column_index]
        unique_values = unique(values)

        for index in range(len(unique_values)):
            if index != 0:
                current_value = unique_values[index]
                previous_value = unique_values[index - 1]
                potential_split = (current_value + previous_value) / 2
                
                potential_splits[column_index].append(potential_split)
    
    return potential_splits

In [41]:
def split_data(X, y, split_column, split_value):
    
    split_column_values = X[:, split_column]
    data_below = data_above = array([])
#     print("len(X) in function split_data:", len(X))
    for index in range(len(X) - 1):
        if split_column_values[index] <= split_value:
            data_below = append(data_below, append(X[index], y[index]))
        else:
            data_above = append(data_above, append(X[index], y[index]))
    
    return data_below, data_above

In [47]:
def calculate_entropy(data):
    
    print(data)
    label_column = data[:, -1]
    _, counts = unique(label_column, return_counts=True)

    probabilities = counts / counts.sum()
    entropy = sum(probabilities * -log2(probabilities))
     
    return entropy

In [43]:
def calculate_overall_entropy(data_below, data_above):
    
    n = len(data_below) + len(data_above)
    p_data_below = len(data_below) / n
    p_data_above = len(data_above) / n

    overall_entropy =  (p_data_below * calculate_entropy(data_below) 
                      + p_data_above * calculate_entropy(data_above))
    
    return overall_entropy

In [44]:
def determine_best_split(X, y, potential_splits):    
    overall_entropy = inf
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below, data_above = split_data(X, y, split_column=column_index, split_value=value)
            current_overall_entropy = calculate_overall_entropy(data_below, data_above)

            if current_overall_entropy <= overall_entropy:
                overall_entropy = current_overall_entropy
                best_split_column = column_index
                best_split_value = value
    
    return best_split_column, best_split_value

In [45]:
def decision_tree_algorithm(X, y, column_headers, max_depth, counter=0, min_samples=2):
    
#     # data preparations
#     if counter == 0:
#         global COLUMN_HEADERS
#         COLUMN_HEADERS = df.columns
#         data = df.values
#     else:
#         data = df   
        
    if (check_purity(y)) or (len(X) < min_samples) or (counter == max_depth):
        classification = classify_data(y)
        
        return classification

    
    # recursive part
    else:    
        counter += 1

        # helper functions 
        potential_splits = get_potential_splits(X, y)
        split_column, split_value = determine_best_split(X, y, potential_splits)
        data_below, data_above = split_data(X, y, split_column, split_value)
    
        feature_name = column_headers[split_column]
        question = "{} <= {}".format(feature_name, split_value)
        sub_tree = {question: []}
        
        # find answers (recursion)
        yes_answer = decision_tree_algorithm(data_below, counter, min_samples, max_depth)
        no_answer = decision_tree_algorithm(data_above, counter, min_samples, max_depth)
        
        # If the answers are the same, then there is no point in asking the qestion.
        # This could happen when the data is classified even though it is not pure
        # yet (min_samples or max_depth base case).
        if yes_answer == no_answer:
            sub_tree = yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)
        
        return sub_tree

In [48]:
tree = decision_tree_algorithm(X_train, y_train, column_headers=df.columns, max_depth=3)
pprint(tree)

len(X) in function split_data: 454
[15.84910011 15.63408089 16.076437   16.29909706 16.52137756  0.09988927
  0.07783304  0.05384454  0.04001226  0.02976177 16.00978088 16.12462044
  0.16068077  0.3757     -0.06665611 -0.28931618 -0.51159668  0.21501923
 -0.22733688 -0.44999695 -0.67227745 -0.44235611 -0.66501617 -0.88729668
 -0.22266006 -0.44494057 -0.2222805  -0.11483955 -0.27552032 -0.49053955
 -0.04818344  0.17447662  0.39675713  0.        ]


IndexError: too many indices for array

In [34]:
X_train.shape, y_train.shape

((454, 33), (195, 33))

In [15]:
def classify_example(example, tree):
    question = list(tree.keys())[0]
    feature_name, comparison_operator, value = question.split(" ")

    # ask question
    if example[feature_name] <= float(value):
        answer = tree[question][0]
    else:
        answer = tree[question][1]

    # base case
    if not isinstance(answer, dict):
        return answer
    
    # recursive part
    else:
        residual_tree = answer
        return classify_example(example, residual_tree)

In [16]:
example = test.iloc[0]
print(example)
classify_example(example, tree)

u               18.536404
g               18.440216
r               18.271149
i               18.369190
z               18.420809
extinction_u     0.075586
extinction_g     0.058896
extinction_r     0.040744
extinction_i     0.030277
extinction_z     0.022521
nuv_mag         19.023506
fuv_mag         19.976627
label            1.000000
nuv-u            0.487103
nuv-g            0.583290
nuv-r            0.752357
nuv-i            0.654316
nuv-z            0.602697
u-g              0.096188
u-r              0.265255
u-i              0.167213
u-z              0.115595
g-r              0.169067
g-i              0.071026
g-z              0.019407
r-i             -0.098042
r-z             -0.149660
i-z             -0.051619
fuv-nuv         -0.953121
fuv-u           -1.440224
fuv-g           -1.536411
fuv-r           -1.705479
fuv-i           -1.607437
fuv-z           -1.555819
Name: 636, dtype: float64


-1.9835529327393

In [17]:
def calculate_accuracy(df, tree):

    df["classification"] = df.apply(classify_example, axis=1, args=(tree,))
    df["classification_correct"] = df["classification"] == df["label"]
    
    accuracy = df["classification_correct"].mean()
    
    return accuracy

In [19]:
accuracy = calculate_accuracy(test, tree)
accuracy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


0.0

In [7]:
454 * 33

14982