# What we're trying:
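The point of this notebook is to compare the two-level conditional entropies H(S|G|A) and H(S|A|G) with the one-level conditional entropies H(S|A) and H(S|G), where S is the target (Survived), G is gender and A is the age bucket.
In other words: how does the entropy of the target conditioned on two attributes, (target | attr 1 | attr 2), compare to the entropy of the target conditioned on just one of them?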

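To compute the two-level conditional entropies, we followed this algorithm:
1. Within one branch (one value of the first attribute), compute the entropy of the target for each sub-class (each value of the second attribute) individually.
2. Take the weighted average of these sub-class entropies, weighting each sub-class by its share of the branch.
3. This weighted average is the conditional entropy of the target given the second attribute within that branch.
4. Take the weighted average of the per-branch conditional entropies from step 3 over all n branches of the first attribute, weighting each branch by its share of all rows.
5. The result is the conditional entropy of (Target | Attr 1 | Attr 2).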

In [43]:
import numpy as np
import pandas as pd
eps = np.finfo(float).eps
from numpy import log2 as log

In [44]:
# Load the train and test splits from CSV files under ./dataset.
train = pd.read_csv("./dataset/train.csv")
test = pd.read_csv("./dataset/test.csv")
# Class balance of the target before any cleaning (shown as the cell output).
train['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [45]:
# Remove, in place, every row that has a missing value in any column.
# NOTE(review): the recorded Survived counts go from 891 rows before this cell
# to 183 rows after the cleaning cells. Presumably a sparsely populated column
# that is dropped in the next cell is responsible for most of the loss --
# TODO confirm that discarding this many rows is intended.
train.dropna(axis=0, inplace = True)

In [46]:
# Remove, in place, columns that the rest of this notebook does not reference.
unused_columns = ['Cabin', 'Name', 'PassengerId', 'SibSp']
train.drop(columns=unused_columns, inplace=True)

In [47]:
# One-hot encode each categorical column; the indicator columns are named after
# the raw category values (later cells read the 'male' and 'female' columns).
cols = ['Pclass', 'Sex', 'Embarked']
dummies = [pd.get_dummies(train[col]) for col in cols]

# Place the per-column indicator frames side by side in a single frame.
titanic_dummies = pd.concat(dummies, axis=1)

In [48]:
# Append the indicator columns to the training frame (column-wise concatenation).
train = pd.concat([train, titanic_dummies], axis='columns')

In [49]:
# The original categorical columns are now redundant with their indicator columns.
train = train.drop(columns=['Pclass', 'Sex', 'Embarked'])

In [50]:
# Replace Age with an ordinal bucket code: 0 for <= 16, 1 for (16, 50], 2 for > 50.
# All three masks are computed from the raw ages before any code is written back.
# Note: this overwrites Age, so running the cell a second time would re-bucket
# the codes themselves.
age = train['Age']
is_young = age <= 16
is_adult = (age > 16) & (age <= 50)
is_old = age > 50
train.loc[is_young, 'Age'] = 0
train.loc[is_adult, 'Age'] = 1
train.loc[is_old, 'Age'] = 2

In [51]:
# Class balance of the target after cleaning (shown as the cell output).
train['Survived'].value_counts()

1    123
0     60
Name: Survived, dtype: int64

In [52]:
def entropy(value_ratios):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Parameters
    ----------
    value_ratios : iterable of float
        Probabilities (relative frequencies) of each outcome, e.g. a list or a
        pandas Series. The values are used as given; they are not normalised
        or checked to sum to 1.

    Returns
    -------
    float
        The sum of -p * log2(p) over all strictly positive p. An empty input
        yields 0.
    """
    e = 0
    for value in value_ratios:
        # An outcome with probability 0 contributes nothing (0 * log 0 := 0).
        # Evaluating it numerically would give 0 * -inf = nan and poison the sum.
        if value > 0:
            e += -(value*log(value))
    return e

# H(S)
H(S) = 0.9127341558073343

# H(S|A)
H(S|A) = 0.8822297022038299
## IG (S|A)
ig_s_age = 0.0305044536035044

# H(S|G)
H(S|G) = 0.6847741450401293
## IG (S|G)
ig_s_sex = 0.22796001076720496

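# H(S|G|A) and H(S|A|G)
Both orderings turn out to be equal (up to floating-point rounding), because each is a weighted average of the same six (gender, age bucket) cell entropies:
0.6157247613147919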

In [53]:
# H(S): entropy of the target over the whole cleaned training frame.
survival_ratios = train['Survived'].value_counts(normalize=True)
sur_e = entropy(survival_ratios)
print(sur_e)

0.9127341558073343


## S|A

In [54]:
# Number of rows in each age bucket (codes 0, 1, 2) and in the whole frame.
age_codes = train['Age']
num_y = int((age_codes == 0).sum())
num_a = int((age_codes == 1).sum())
num_o = int((age_codes == 2).sum())
total = train['Survived'].shape[0]

In [55]:
# Age bucket 0: count deaths and survivals, then take the entropy of that split.
in_bucket = train['Age'] == 0
y_dead = int((in_bucket & (train['Survived'] == 0)).sum())
y_alive = int((in_bucket & (train['Survived'] == 1)).sum())
for count in (y_dead, y_alive, num_y):
    print(count)
s_y = pd.Series([y_dead/num_y, y_alive/num_y])
s_y_e = entropy(s_y)
print(s_y_e)

2
14
16
0.5435644431995964


In [56]:
# Age bucket 1: count deaths and survivals, then take the entropy of that split.
in_bucket = train['Age'] == 1
a_dead = int((in_bucket & (train['Survived'] == 0)).sum())
a_alive = int((in_bucket & (train['Survived'] == 1)).sum())
for count in (a_dead, a_alive, num_a):
    print(count)

s_a = pd.Series([a_dead/num_a, a_alive/num_a])
s_a_e = entropy(s_a)
print(s_a_e)

42
93
135
0.8944518845341283


In [57]:
# Age bucket 2: count deaths and survivals, then take the entropy of that split.
in_bucket = train['Age'] == 2
o_dead = int((in_bucket & (train['Survived'] == 0)).sum())
o_alive = int((in_bucket & (train['Survived'] == 1)).sum())
for count in (o_dead, o_alive, num_o):
    print(count)
s_o = pd.Series([o_dead/num_o, o_alive/num_o])
s_o_e = entropy(s_o)
print(s_o_e)

16
16
32
1.0


In [58]:
# H(S|A): per-bucket entropies weighted by each bucket's share of all rows.
share_y = num_y/total
share_a = num_a/total
share_o = num_o/total
wt_s_age = share_y*s_y_e + share_a*s_a_e + share_o*s_o_e
print(wt_s_age)

# Information gain of splitting on age: H(S) - H(S|A).
ig_s_age = sur_e - wt_s_age
print(ig_s_age)

0.8822297022038299
0.0305044536035044


## S|G

In [59]:
# Number of rows flagged by the 'male' and 'female' indicator columns.
num_m = int((train['male'] == 1).sum())
num_f = int((train['female'] == 1).sum())

for count in (num_m, num_f):
    print(count)

95
88


In [60]:
# Shared row masks for the two genders and the two outcomes.
is_male = train['male'] == 1
is_female = train['female'] == 1
died = train['Survived'] == 0
lived = train['Survived'] == 1

# Males: death/survival counts and the entropy of that split.
male_d = int((is_male & died).sum())
print(male_d)
male_a = int((is_male & lived).sum())
print(male_a)
male_e = entropy([male_d/num_m, male_a/num_m])
print(male_e)

# Females: death/survival counts and the entropy of that split.
female_d = int((is_female & died).sum())
print(female_d)
female_a = int((is_female & lived).sum())
print(female_a)
female_e = entropy([female_d/num_f, female_a/num_f])
print(female_e)


54
41
0.9864497419502243
6
82
0.3591016256485496


In [61]:
# H(S|G): per-gender entropies weighted by each gender's share of all rows.
share_m = num_m/total
share_f = num_f/total
wt_sex = share_m*male_e + share_f*female_e
print(wt_sex)

0.6847741450401293


In [62]:
# Information gain of splitting on gender: H(S) - H(S|G).
ig_s_sex = sur_e - wt_sex
print(ig_s_sex)

0.22796001076720496


## S|G|A

## S| G = M | A

In [63]:
# Number of male rows in each age bucket (codes 0, 1, 2).
is_male = train['male'] == 1
num_my = int((is_male & (train['Age'] == 0)).sum())
num_ma = int((is_male & (train['Age'] == 1)).sum())
num_mo = int((is_male & (train['Age'] == 2)).sum())

for count in (num_my, num_ma, num_mo):
    print(count)

7
68
20


In [64]:
# Males in age bucket 0: death/survival counts and the entropy of that split.
is_young_male = (train['male'] == 1) & (train['Age'] == 0)
my_dead = len(train.loc[is_young_male & (train['Survived'] == 0)])
print(my_dead)
my_alive = len(train.loc[is_young_male & (train['Survived'] == 1)])
print(my_alive)

# Pass both outcomes, as the sibling cells do, instead of only the "alive" ratio
# (which is correct only while my_dead is 0). Outcomes with a ratio of exactly 0
# are left out: they contribute nothing to the entropy, and filtering them here
# keeps the result finite regardless of how entropy() treats a 0 ratio.
my_ratios = [ratio for ratio in (my_alive/num_my, my_dead/num_my) if ratio > 0]
my_e = entropy(my_ratios)
print(my_e)



0
7
0.0


In [65]:
# Males in age bucket 1: death/survival counts and the entropy of that split.
in_cell = (train['male'] == 1) & (train['Age'] == 1)
ma_dead = int((in_cell & (train['Survived'] == 0)).sum())
print(ma_dead)
ma_alive = int((in_cell & (train['Survived'] == 1)).sum())
print(ma_alive)

ma_ratios = [ma_alive/num_ma, ma_dead/num_ma]
ma_e = entropy(ma_ratios)
print(ma_e)



39
29
0.9843432030984678


In [66]:
# Males in age bucket 2: death/survival counts and the entropy of that split.
in_cell = (train['male'] == 1) & (train['Age'] == 2)
mo_dead = int((in_cell & (train['Survived'] == 0)).sum())
print(mo_dead)
mo_alive = int((in_cell & (train['Survived'] == 1)).sum())
print(mo_alive)

mo_ratios = [mo_alive/num_mo, mo_dead/num_mo]
mo_e = entropy(mo_ratios)
print(mo_e)



15
5
0.8112781244591328


In [67]:
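# Abandoned attempt, kept for reference: this would be the entropy of the joint
# (age bucket, survival) distribution among males, not the weighted average of
# the per-bucket entropies that is computed below.
#m_age_e = entropy([my_alive/num_m, ma_alive/num_m, ma_dead/num_m, mo_alive/num_m, mo_dead/num_m])
#print(m_age_e)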

In [68]:
#mo_e + my_e + ma_e

In [69]:
# Conditional entropy of S given age within the male branch: per-bucket
# entropies weighted by each bucket's share of the male rows.
share_my = num_my/num_m
share_ma = num_ma/num_m
share_mo = num_mo/num_m
wt_s_m_age = share_my*my_e + share_ma*ma_e + share_mo*mo_e
print(wt_s_m_age)

0.8753778978934575


## S | G = F | A

In [70]:
# Number of female rows in each age bucket (codes 0, 1, 2).
is_female = train['female'] == 1
num_fy = int((is_female & (train['Age'] == 0)).sum())
num_fa = int((is_female & (train['Age'] == 1)).sum())
num_fo = int((is_female & (train['Age'] == 2)).sum())

for count in (num_fy, num_fa, num_fo):
    print(count)

9
67
12


In [71]:
# Females in age bucket 0: death/survival counts and the entropy of that split.
in_cell = (train['female'] == 1) & (train['Age'] == 0)
fy_dead = int((in_cell & (train['Survived'] == 0)).sum())
print(fy_dead)
fy_alive = int((in_cell & (train['Survived'] == 1)).sum())
print(fy_alive)

fy_ratios = [fy_alive/num_fy, fy_dead/num_fy]
fy_e = entropy(fy_ratios)
print(fy_e)



2
7
0.7642045065086203


In [72]:
# Females in age bucket 1: death/survival counts and the entropy of that split.
in_cell = (train['female'] == 1) & (train['Age'] == 1)
fa_dead = int((in_cell & (train['Survived'] == 0)).sum())
print(fa_dead)
fa_alive = int((in_cell & (train['Survived'] == 1)).sum())
print(fa_alive)

fa_ratios = [fa_alive/num_fa, fa_dead/num_fa]
fa_e = entropy(fa_ratios)
print(fa_e)



3
64
0.26377743669413856


In [73]:
# Females in age bucket 2: death/survival counts and the entropy of that split.
in_cell = (train['female'] == 1) & (train['Age'] == 2)
fo_dead = int((in_cell & (train['Survived'] == 0)).sum())
print(fo_dead)
fo_alive = int((in_cell & (train['Survived'] == 1)).sum())
print(fo_alive)

fo_ratios = [fo_alive/num_fo, fo_dead/num_fo]
fo_e = entropy(fo_ratios)
print(fo_e)



1
11
0.41381685030363374


In [74]:
# Conditional entropy of S given age within the female branch: per-bucket
# entropies weighted by each bucket's share of the female rows.
share_fy = num_fy/num_f
share_fa = num_fa/num_f
share_fo = num_fo/num_f
wt_s_f_age = share_fy*fy_e + share_fa*fa_e + share_fo*fo_e
print(wt_s_f_age)

0.3354173979628235


In [75]:
# H(S|G|A): the two per-gender conditional entropies weighted by each gender's
# share of all rows.
share_m = num_m/total
share_f = num_f/total
wt_s_g_age = share_m*wt_s_m_age + share_f*wt_s_f_age
print(wt_s_g_age)

0.615724761314792


## S | A | G

## S | A = Y | G = M,F

In [76]:
# Sizes involved in age bucket 0: males, females, and the bucket as a whole.
for count in (num_my, num_fy, num_y):
    print(count)

# Conditional entropy of S given gender within age bucket 0: per-gender
# entropies weighted by each gender's share of the bucket.
share_my = num_my/num_y
share_fy = num_fy/num_y
wt_y_g = share_my*my_e + share_fy*fy_e
print(wt_y_g)

7
9
16
0.4298650349110989


In [77]:
# Conditional entropy of S given gender within age bucket 1: per-gender
# entropies weighted by each gender's share of the bucket.
share_ma = num_ma/num_a
share_fa = num_fa/num_a
wt_a_g = share_ma*ma_e + share_fa*fa_e
print(wt_a_g)

0.6267290819940969


In [78]:
# Conditional entropy of S given gender within age bucket 2: per-gender
# entropies weighted by each gender's share of the bucket.
share_mo = num_mo/num_o
share_fo = num_fo/num_o
wt_o_g = share_mo*mo_e + share_fo*fo_e
print(wt_o_g)

0.6622301466508207


In [79]:
# H(S|A|G): the three per-bucket conditional entropies weighted by each age
# bucket's share of all rows.
share_y = num_y/total
share_a = num_a/total
share_o = num_o/total
wt_age_g = share_y*wt_y_g + share_a*wt_a_g + share_o*wt_o_g
print(wt_age_g)

0.6157247613147919
