In [30]:
import math

In [31]:
# Example dataset: sample counts for the two classes (A and B) and their sum
n_A, n_B = 4, 6
total = n_A + n_B

In [32]:
# Class proportions: each class count divided by the dataset size
prop_A, prop_B = n_A / total, n_B / total

# Report both proportions
print(f"Proportion of A: {prop_A}")
print(f"Proportion of B: {prop_B}")

Proportion of A: 0.4
Proportion of B: 0.6


In [33]:
# Entropy calculation
def entropy(proportions):
    """Return the Shannon entropy, in bits, of a collection of proportions.

    Every proportion p contributes -p * log2(p) to the result. Entries that
    are not strictly positive are skipped, so a proportion of 0 adds nothing
    instead of triggering a math domain error inside log2.

    Parameters
    ----------
    proportions : iterable of numbers
        The proportions to measure.

    Returns
    -------
    number
        The accumulated entropy; the integer 0 when no entry contributes.
    """
    total_bits = 0
    for share in proportions:
        if not share > 0:
            continue
        total_bits -= share * math.log2(share)
    return total_bits

# Entropy of the example dataset, computed from its two class proportions.
# `proportions` is reused by later cells (Gini impurity, information gain).
proportions = [prop_A, prop_B]
ent = entropy(proportions)
print(f"Entropy of the dataset: {ent}")

Entropy of the dataset: 0.9709505944546686


In [34]:
# Gini impurity calculation
def gini_impurity(proportions):
    """Return the Gini impurity of a collection of proportions.

    Starts from 1 and subtracts the square of every proportion in turn,
    i.e. 1 - sum(p ** 2).

    Parameters
    ----------
    proportions : iterable of numbers
        The proportions to measure.

    Returns
    -------
    number
        1 minus the sum of squared proportions; the integer 1 for an empty
        input.
    """
    impurity = 1
    for share in proportions:
        impurity = impurity - share ** 2
    return impurity
# Gini impurity of the example dataset, using the class proportions list built in the entropy cell
gini = gini_impurity(proportions)
print(f"Gini Impurity of the dataset: {gini}")

Gini Impurity of the dataset: 0.48


In [35]:
# Information gain calculation
def information_gain(parent_entropy, left_entropy, right_entropy, left_size, right_size):
    """Return the entropy reduction achieved by a two-way split.

    The children's entropies are averaged, each weighted by its share of
    the combined size, and that weighted average is subtracted from the
    parent's entropy.

    Parameters
    ----------
    parent_entropy : number
        Entropy before the split.
    left_entropy, right_entropy : number
        Entropy of each child.
    left_size, right_size : number
        Size of each child; used only to form the weights.

    Returns
    -------
    number
        parent_entropy minus the size-weighted child entropy.

    Raises
    ------
    ZeroDivisionError
        If left_size + right_size is 0.
    """
    combined = left_size + right_size
    left_term = (left_size / combined) * left_entropy
    right_term = (right_size / combined) * right_entropy
    return parent_entropy - (left_term + right_term)
# Worked example: split the example dataset into two children of 5 samples each.
left_size = 5
right_size = 5
left_proportions = [3/5, 2/5]
right_proportions = [1/5, 4/5]

# Entropy before the split and inside each child
parent_entropy = entropy(proportions)
left_entropy = entropy(left_proportions)
right_entropy = entropy(right_proportions)

info_gain = information_gain(
    parent_entropy, left_entropy, right_entropy, left_size, right_size
)
print(f"Information Gain from the split: {info_gain}")

Information Gain from the split: 0.12451124978365313


# **`Decision Tree for Classification`**

In [36]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

In [37]:
# Load seaborn's 'titanic' example dataset into a DataFrame.
# NOTE(review): per the seaborn docs, load_dataset reads from an online repository,
# so the first run presumably needs internet access — confirm for offline environments.
df = sns.load_dataset('titanic')
# Preview the first rows (rendered as the cell output)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [38]:
# Drop the 'deck' column.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: an in-place drop raises KeyError on a second run because the
# column is already gone.
df = df.drop(columns='deck', errors='ignore')

# Impute missing values of age and fare using the median
# NOTE(review): the imputers are fit on the whole frame; if a train/test split
# follows, fitting on the training portion only would avoid leakage — confirm.
imputer = SimpleImputer(strategy='median')
df[['age', 'fare']] = imputer.fit_transform(df[['age', 'fare']])

# Impute missing values of embarked and embark_town using the mode (most frequent value)
imputer_mode = SimpleImputer(strategy='most_frequent')
df[['embarked', 'embark_town']] = imputer_mode.fit_transform(df[['embarked', 'embark_town']])

In [39]:
# Count the missing values per column; shown as the cell output to check the imputation
df.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [None]:
# Encode the categorical variables with label encoding, one encoder per column.
# Re-fitting a single shared LabelEncoder on every column keeps only the last
# column's classes_, so the other columns could no longer be decoded. Each fitted
# encoder is stored in `label_encoders` (keyed by column name) so any column can be
# mapped back with label_encoders[col].inverse_transform(...).
# NOTE(review): in the head() preview 'alive' mirrors 'survived'; if 'survived' is
# the target, 'alive' presumably should not be used as a feature — confirm.
label_encoders = {}
for col in df.select_dtypes(include=['object', 'category']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# `le` remains bound to the encoder fitted on the last encoded column.

        