In [1]:
include("ml_lib.jl")
using DataFrames
using CSV

In [2]:
# Read the feature-engineered Titanic table, then keep only the three columns used
# below and discard every row that has a missing value in any of them.
df = CSV.File("data/titanic_feature_engineered.csv") |> DataFrame
df_min_cols = select(df, [:Sex, :Pclass, :Survived])
dropmissing!(df_min_cols);

In [3]:
"""
    average_entropy_after_split(data_frame, dependent_feature, feature_to_split;
                                weighted=false)

Return the entropy of `dependent_feature` that remains after partitioning
`data_frame` by the distinct values of `feature_to_split`.

# Arguments
- `data_frame`: table indexed as `data_frame[!, column]` and `data_frame[mask, column]`.
- `dependent_feature`: name of the column whose entropy is measured.
- `feature_to_split`: name of the column whose distinct values define the groups.

# Keywords
- `weighted`: when `false` (the default) every group counts equally and the plain
  mean of the group entropies is returned. When `true` each group entropy is
  weighted by the group's share of the rows, which is the conditional entropy used
  by information-gain based tree learners.

# Returns
The averaged entropy. With `weighted=false` an input without any rows yields `NaN`
(zero divided by zero groups).

Rows are matched with `isequal`, the same equality `Set` uses to collect the distinct
values, so a `missing` or `NaN` entry forms its own group instead of breaking the mask.
"""
function average_entropy_after_split(
    data_frame, dependent_feature, feature_to_split; weighted::Bool=false
)
    split_column = data_frame[!, feature_to_split]
    feature_values = Set(split_column)
    total_rows = length(split_column)
    total_entropy = 0.0

    for value in feature_values
        row_mask = isequal.(split_column, value)
        # Only the dependent column is needed, so the other columns are not copied.
        subset_entropy = entropy(data_frame[row_mask, dependent_feature])

        if weighted
            total_entropy += subset_entropy * count(row_mask) / total_rows
        else
            total_entropy += subset_entropy
        end
    end

    # The weighted sum is already an average; the plain sum still has to be divided.
    return weighted ? total_entropy : total_entropy / length(feature_values)
end

average_entropy_after_split (generic function with 1 method)

In [4]:
# Mean entropy of "Survived" over the groups formed by each distinct "Pclass" value.
average_entropy_after_split(df_min_cols, "Survived", "Pclass")

0.9159238054641065

In [5]:
# Same measurement for a split on Sex; the column names are passed as Symbols here.
average_entropy_after_split(df_min_cols, :Survived, :Sex)

0.76141843152518

In [6]:
"""
    column_to_split(data_frame, dependent_feature)

Return the name of the column whose split gives the lowest average entropy of
`dependent_feature`, or `nothing` when no column does better than the entropy of
the unsplit data.

# Arguments
- `data_frame`: table whose columns are the candidate features.
- `dependent_feature`: name of the target column, as a `String` or a `Symbol`.

# Returns
A column name taken from `names(data_frame)`, or `nothing`.

Candidates are visited in column order and only a strictly lower entropy replaces
the current best, so the first column wins when two columns tie.
"""
function column_to_split(data_frame, dependent_feature)
    # `names` yields strings; normalising the target lets a Symbol argument be excluded
    # too, instead of being scored as a candidate split on the target itself.
    target_name = string(dependent_feature)
    feature_to_split = nothing
    least_entropy = entropy(data_frame[!, dependent_feature])

    for feature in names(data_frame)
        feature == target_name && continue
        split_entropy = average_entropy_after_split(data_frame, dependent_feature, feature)

        if split_entropy < least_entropy
            least_entropy = split_entropy
            feature_to_split = feature
        end
    end

    return feature_to_split
end

column_to_split (generic function with 1 method)

In [69]:
# Pick the column whose split gives the lowest average entropy for "Survived".
column_to_split(df_min_cols, "Survived")

column_to_split


"Sex"

In [8]:
"""
    split_by(data_frame, feature)

Partition `data_frame` by the distinct values found in column `feature`.

# Arguments
- `data_frame`: table indexed as `data_frame[!, column]` and `data_frame[mask, :]`.
- `feature`: name of the column whose values define the partition.

# Returns
A `Dict{Any,Any}` mapping each distinct value to the sub-table of rows holding it.

Rows are matched with `isequal`, the same equality `Set` uses to collect the distinct
values, so a `missing` or `NaN` entry gets its own sub-table instead of breaking the
row mask or matching nothing.
"""
function split_by(data_frame, feature)
    column = data_frame[!, feature]
    splits = Dict{Any,Any}()

    for value in Set(column)
        row_mask = isequal.(column, value)
        splits[value] = data_frame[row_mask, :]
    end

    return splits
end

split_by (generic function with 1 method)

In [9]:
# Partition the rows by the distinct values of "Sex"; the result maps value => sub-table.
split_by_sex = split_by(df_min_cols, "Sex")

Dict{Any, Any} with 2 entries:
  "male"   => [1m577×3 DataFrame[0m…
  "female" => [1m314×3 DataFrame[0m…

In [10]:
# Sub-table stored under the "male" key of the split.
df_male = split_by_sex["male"]

Unnamed: 0_level_0,Sex,Pclass,Survived
Unnamed: 0_level_1,String,Int64,Int64
1,male,3,0
2,male,3,0
3,male,3,0
4,male,1,0
5,male,3,0
6,male,3,0
7,male,3,0
8,male,3,0
9,male,2,1
10,male,2,0


In [11]:
# Look for a further split inside the male subset; `nothing` means no column lowers the entropy.
column_to_split(df_male, "Survived")

column_to_split


In [12]:
# Repeat the search on the sub-table stored under the "female" key.
df_female = split_by_sex["female"]
column_to_split(df_female, "Survived")

column_to_split


"Pclass"

In [17]:
# `column_to_split` signals "no useful split" by returning `nothing`; test for it with
# `isnothing` rather than `!=`, which is the identity-based check Julia recommends.
if !isnothing(column_to_split(df_male, "Survived"))
    println("can be split")
else
    println("cannot be split")
end

column_to_split
cannot be split


In [65]:
"""
    decision_tree(data_frame, dependent_feature)

Recursively build a decision tree for `dependent_feature` from `data_frame`.

# Arguments
- `data_frame`: table holding the feature columns and the target column.
- `dependent_feature`: name of the target column.

# Returns
- A leaf, `highest_vote(counter(target_column))`, when `column_to_split` finds no
  column worth splitting on (presumably the most frequent target value; both helpers
  are defined outside this file section).
- Otherwise a one-entry `Dict{Any,Any}` mapping the chosen column name to a `Dict`
  from each value of that column to the subtree built from the matching rows.
"""
function decision_tree(data_frame, dependent_feature)
    split_column = column_to_split(data_frame, dependent_feature)

    if isnothing(split_column)
        return highest_vote(counter(data_frame[!, dependent_feature]))
    end

    branches = Dict{Any,Any}()
    # `split_by` already returns the rows for each value, so they are reused directly
    # instead of being filtered out of `data_frame` a second time.
    for (value, subset) in split_by(data_frame, split_column)
        branches[value] = decision_tree(subset, dependent_feature)
    end

    return Dict{Any,Any}(split_column => branches)
end

decision_tree (generic function with 1 method)

In [66]:
# Build the nested-Dict decision tree for "Survived" from the three-column table.
dtree = decision_tree(df_min_cols, "Survived")

column_to_split
column_to_split
column_to_split
column_to_split
column_to_split
column_to_split


Dict{Any, Any} with 1 entry:
  "Sex" => Dict{Any, Any}("male"=>0, "female"=>Dict{Any, Any}("Pclass"=>Dict{An…

In [67]:
# Print the tree in full on a single line.
println(dtree)

Dict{Any, Any}("Sex" => Dict{Any, Any}("male" => 0, "female" => Dict{Any, Any}("Pclass" => Dict{Any, Any}(2 => 1, 3 => 0, 1 => 1))))


In [68]:
# Build a tree from the male subset only.
decision_tree(df_male, "Survived")

column_to_split


0