In [1]:
using DataFrames
using Queryverse
using Statistics
using StatsBase
using HypothesisTests

In [2]:
data = load("/home/chigball/Data/houseprice.csv") |>
@mutate(WhiteMarble = string(_.WhiteMarble), Floors = string(_.Floors)) |> DataFrame;
describe(data) |> @select(:variable, :eltype) |> DataFrame


Unnamed: 0_level_0,variable,eltype
Unnamed: 0_level_1,Symbol,DataType
1,Area,Int64
2,Garage,Int64
3,FirePlace,Int64
4,Baths,Int64
5,WhiteMarble,String
6,BlackMarble,Int64
7,IndianMarble,Int64
8,Floors,String
9,City,Int64
10,Solar,Int64


In [3]:
first(data, 5)

Unnamed: 0_level_0,Area,Garage,FirePlace,Baths,WhiteMarble,BlackMarble,IndianMarble,Floors
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,String,Int64,Int64,String
1,164,2,0,2,0,1,0,0
2,84,2,0,4,0,0,1,1
3,190,2,4,4,1,0,0,0
4,75,2,4,4,0,0,1,1
5,148,1,4,2,1,0,0,1


In [4]:
# Function for cat/con
# The test for such combo is "KruskalWallis"
function gamma(cat_var::Vector{String}, con_var::Vector{<: Number})
  data = DataFrame(levels = cat_var, values = con_var)
  levels = data.levels |> Set |> collect
  vec_list = Vector{Float64}[]
  for i in levels
    filtered_df = data[data.levels .== i, :]
    filtered_vec = filtered_df.values
    push!(vec_list, filtered_vec)  
  end
  p = KruskalWallisTest(vec_list...) |> pvalue  
  p = round(p, digits  = 3)
  return (p, "KruskalWallis")
end

# Function for con/cat
# The test for such combo is "KruskalWallis"
function gamma(con_var::Vector{<: Number}, cat_var::Vector{String})
  gamma(cat_var, con_var)
end

# Function for cat/cat
# The test for such combo is "ChiSquared"
function gamma(cat_var1::Vector{String}, cat_var2::Vector{String})
  data = DataFrame(levels1 = cat_var1, levels2 = cat_var2)
  ul1 = data.levels1 |> Set |> collect |> sort
  ul2 = data.levels2 |> Set |> collect |> sort
  output = zeros(Int64, length(ul1), length(ul2))
  for i in eachindex(ul1), j in eachindex(ul2)
    tv = (data.levels1 .== ul1[i]) .& (data.levels2 .== ul2[j])
    output[i,j] = sum(tv)  
  end
  p = ChisqTest(output) |> pvalue
  return (p, "ChiSquared")
end


# Function for con/con
function gamma(con_var1::Vector{<: Number}, con_var2::Vector{<: Number})
  th = 20 # Treshold for if vector should be category or not
  
  # Get the # of levels for each vector
  l1 = con_var1 |> Set |> length
  l2 = con_var2 |> Set |> length
  
  # Get the string vector of each vector
  con_var1_st = string.(con_var1)
  con_var2_st = string.(con_var2)
  
  # Treat vectors with low number of levels as categories and apply the right test
  if l1 <= th && l2 <= th
    gamma(con_var1_st, con_var1_st) # Apply ChiSquared test
  elseif l1 <= th && l2 > th
    gamma(con_var1_st, con_var2)  # Apply "KruskalWallis" test
  elseif l1 > th && l2 <= th
    gamma(con_var1, con_var2_st) # Apply "KruskalWallis" test
  else
    p = CorrelationTest(con_var1, con_var2) |> pvalue # Apply "Correlation" test
    return (p, "Correlation")
  end
end

# Finds the most important variables against the target
function variable_importance(data::DataFrame, target::Symbol)
  X = select(data, Not(target)) # Get DataFrame X of x variables
  y = data[!, target] # Obtain y vector
  col_names = names(X) # Name vector of X dataframe
  output = DataFrame(feature = String[], test = String[], pvalue = Float64[])
  for i in col_names
    x = X[!, i]
    J = gamma(x, y)
    push!(output, [i, J[2], J[1]])
  end
  tv = (output.pvalue .< 0.05)
  results = output[tv, :]
  return results
end

variable_importance (generic function with 1 method)

In [5]:
white_marble = data.WhiteMarble # Catogory Data 1
floors = data.Floors # Catogory Data 2
prices = data.Prices # Continous Data 1
area = data.Area # Continous Data 2
baths = data.Baths # Unconverted ints
fiber = data.Fiber # Unconverted ints
gamma(baths, fiber)

(0.0, "ChiSquared")

In [9]:
variable_importance(data, :Prices)

Unnamed: 0_level_0,feature,test,pvalue
Unnamed: 0_level_1,String,String,Float64
1,Area,Correlation,0.0
2,Garage,KruskalWallis,0.0
3,FirePlace,KruskalWallis,0.0
4,Baths,KruskalWallis,0.0
5,WhiteMarble,KruskalWallis,0.0
6,BlackMarble,KruskalWallis,0.0
7,IndianMarble,KruskalWallis,0.0
8,Floors,KruskalWallis,0.0
9,City,KruskalWallis,0.0
10,Solar,KruskalWallis,0.0


In [10]:
data2 = load("/home/chigball/Data/train.csv") |> @select(2,3,5,7,8,12) |> @filter(_.Embarked != "") |> DataFrame
first(data2, 8)


Unnamed: 0_level_0,Survived,Pclass,Sex,SibSp,Parch,Embarked
Unnamed: 0_level_1,Int64,Int64,String,Int64,Int64,String
1,0,3,male,1,0,S
2,1,1,female,1,0,C
3,1,3,female,0,0,S
4,1,1,female,1,0,S
5,0,3,male,0,0,S
6,0,3,male,0,0,Q
7,0,1,male,0,0,S
8,0,3,male,3,1,S


In [11]:
variable_importance(data2, :Survived)

Unnamed: 0_level_0,feature,test,pvalue
Unnamed: 0_level_1,String,String,Float64
1,Pclass,ChiSquared,0.0
2,Sex,KruskalWallis,0.0
3,SibSp,ChiSquared,0.0
4,Parch,ChiSquared,0.0
5,Embarked,KruskalWallis,0.0


In [12]:
    df = DataFrame(
    A = [1, 2, 3, 4, 5, 6], 
    B = ["a", "b", "a", "b", "a", "b"],
    C = ["c", "d", "c", "d", "c", "c"],
    T = [.17, .23, .36, .46, .58, .67],
    )

Unnamed: 0_level_0,A,B,C,T
Unnamed: 0_level_1,Int64,String,String,Float64
1,1,a,c,0.17
2,2,b,d,0.23
3,3,a,c,0.36
4,4,b,d,0.46
5,5,a,c,0.58
6,6,b,c,0.67


In [16]:
names(variable_importance(df, :T))

└ @ HypothesisTests /home/chigball/.julia/packages/HypothesisTests/V7PST/src/kruskal_wallis.jl:73
└ @ HypothesisTests /home/chigball/.julia/packages/HypothesisTests/V7PST/src/kruskal_wallis.jl:73


3-element Vector{String}:
 "feature"
 "test"
 "pvalue"