In [9]:
import pandas as pd
import numpy as np

In [10]:
# Toy dataset: each row is one observation given as
# (day type, temperature level, numeric target value).
target = "target"
rows = [
    ("working", "cold", 23),
    ("working", "hot", 41),
    ("weekend", "mid", 32),
    ("weekend", "cold", 10),
    ("weekend", "cold", 5),
    ("working", "cold", 4),
    ("working", "mid", 7),
    ("weekend", "hot", 17),
    ("working", "hot", 19),
    ("weekend", "mid", 61),
    ("weekend", "mid", 36),
    ("weekend", "hot", 54),
    ("weekend", "hot", 20),
    ("weekend", "cold", 24),
]
data = pd.DataFrame(rows, columns=['day', 'temperature', 'target'])
data

Unnamed: 0,day,temperature,target
0,working,cold,23
1,working,hot,41
2,weekend,mid,32
3,weekend,cold,10
4,weekend,cold,5
5,working,cold,4
6,working,mid,7
7,weekend,hot,17
8,working,hot,19
9,weekend,mid,61


In [11]:
# Replace the string categories with integer codes; pd.factorize numbers the
# distinct values in order of first appearance. Only the codes are kept (the
# label arrays are discarded). Indexing with [0] instead of unpacking into `_`
# avoids rebinding IPython's last-output variable `_`.
data['day'] = pd.factorize(data['day'])[0]
data['temperature'] = pd.factorize(data['temperature'])[0]

In [12]:
# Calculate the count-weighted standard deviation of the target within each feature value
def std(data: pd.DataFrame, feature: str, target: str = target) -> float:
    """Return the weighted standard deviation of ``target`` across the values of ``feature``.

    Rows are grouped by their ``feature`` value. Each group's population
    standard deviation (``ddof=0``) of ``target`` is multiplied by the group's
    share of the grouped rows, and the products are summed. Rows whose
    ``feature`` value is missing are not part of any group.

    Args:
        data: Frame containing the ``feature`` and ``target`` columns.
        feature: Name of the column to group by.
        target: Name of the numeric column to measure. The default is the
            value the module-level ``target`` variable held when this
            function was defined; later reassignments do not change it.

    Returns:
        The weighted standard deviation, or NaN when there are no rows to
        group (for example an empty frame). NaN is also returned if any
        group's deviation is NaN.
    """
    # A single groupby replaces re-filtering the frame once per feature value.
    # observed=True keeps unused categories of a categorical feature from
    # producing empty groups, whose NaN deviation would turn the sum into NaN.
    grouped = data.groupby(feature, sort=False, observed=True)[target]
    sizes = grouped.size()
    total = sizes.sum()
    if total == 0:
        return np.nan

    weighted = sizes / total * grouped.std(ddof=0)
    # Sum the raw values so a NaN term propagates instead of being skipped.
    return np.sum(weighted.to_numpy())

# Standard deviation reduction obtained by grouping on a feature
def sdr(data: pd.DataFrame, feature: str, target: str = target) -> float:
    """Return how much grouping by ``feature`` lowers the spread of ``target``.

    The result is the population standard deviation (``ddof=0``) of the whole
    ``target`` column minus the value returned by ``std(data, feature, target)``.

    Args:
        data: Frame containing the ``feature`` and ``target`` columns.
        feature: Name of the column to group by.
        target: Name of the numeric column; defaults to the value the
            module-level ``target`` variable held at definition time.
    """
    overall_std = data[target].std(ddof=0)
    within_feature_std = std(data, feature, target)
    return overall_std - within_feature_std

# Coefficient of variation of the target column
def cv(data: pd.DataFrame, target: str = target) -> float:
    """Return the population standard deviation of ``target`` divided by its mean.

    Args:
        data: Frame containing the ``target`` column.
        target: Name of the numeric column; defaults to the value the
            module-level ``target`` variable held at definition time.

    Returns:
        ``std / mean`` (with ``ddof=0``) when the mean is strictly positive,
        otherwise NaN. A zero, negative, or NaN mean all fail the
        ``mean > 0`` check and so yield NaN.
    """
    column = data[target]
    mean = column.mean()
    spread = column.std(ddof=0)
    if mean > 0:
        return spread / mean
    return np.nan

def round(x: np.ndarray, d: int = 3) -> float:
    """Round ``x`` to ``d`` decimal places with ``np.round``.

    NOTE: this definition shadows Python's built-in ``round`` for every
    statement that runs after it in the same namespace.

    Args:
        x: Value(s) to round; passed straight to ``np.round``.
        d: Number of decimal places to keep (default 3).
    """
    return np.round(x, d)

# Spread of the target relative to its mean, over the whole dataset.
print("Coefficient of variation: %.3f" % round(cv(data)))
print()

# For each candidate feature, report the weighted within-group standard
# deviation and the resulting reduction, largest reduction first.
print("Standard deviation reduction:")
candidate_features = ['day', 'temperature']
results = pd.DataFrame(
    [
        {'std': round(std(data, name)), 'sdr': round(sdr(data, name))}
        for name in candidate_features
    ],
    index=candidate_features,
).sort_values('sdr', ascending=False)
results

Coefficient of variation: 0.677

Standard deviation reduction:


Unnamed: 0,std,sdr
temperature,13.827,3.249
day,16.235,0.842
