## 1.1. Updating Probablities using Olafson Paper

In [15]:
import pandas as pd
import numpy as np
from scipy.stats import bernoulli

In [2]:
df = pd.read_excel("hypothetical data.xlsx",sheet_name="static")
m = df.shape[0]
df.head()

Unnamed: 0,y,x1,x2,x3,x4,c
0,154,8,20,4,42,2
1,196,9,22,5,49,2
2,198,10,19,4,57,1
3,178,9,19,3,47,2
4,157,9,18,4,65,2


In [22]:
import math
c = len(df.c.unique())

def func_gain(df,T_Independent_Variables):
    features_list = list(T_Independent_Variables.columns)
    i_list = list(df['c'].unique())
    s = {}
    S = {}
    p = {}
    p_IT = {}
    q = {}
    I = {}
    E = {}
    Gain = {}
    ProbVector = {}
    # calculate s, S
    for a in features_list:
        for j in T_Independent_Variables[a].unique():
            S[(j,a)] = 0
            q[j,a] = 0
            for i in i_list:
                filtered_df = df.loc[(df[a] == j) & (df['c'] == i)]
                count = filtered_df[a].count()
                s[(i,j,a)] = count
                S[(j,a)] += count

            q[j,a] = S[j,a]/m
    # print("q",q)
    # calculate p, I
    for a in features_list:
        for j in T_Independent_Variables[a].unique():
            I[j,a] = 0
            for i in i_list:
                p[i,j,a] = s[i,j,a]/S[j,a]
                if p[i,j,a]>0:
                    I[(j,a)] += (-1) * p[i,j,a] * math.log2(p[i,j,a])

    # calculate IT
    IT = 0
    for i in i_list:
        for a in features_list:
            p_IT[i,a] = 0
            for j in T_Independent_Variables[a].unique():
                p_IT[i,a] += s[i,j,a]/m
            IT += (-1) * p_IT[i,a] * math.log2(p_IT[i,a])

    # calculate E
    for a in features_list:
        E[a] = 0
        ProbVector[a] = 0
        for j in T_Independent_Variables[a].unique():
            E[a] += q[j,a] * I[j,a]
        Gain[a] = IT - E[a]
    # sorting
    sorted_E = {k: v for k, v in sorted(E.items(), key=lambda item: item[1])}
    total_gain = sum(Gain.values())  # Step 1

    for key in Gain:  # Step 2
        ProbVector[key] = Gain[key] / total_gain

    #print("ProbVector", ProbVector)
    return ProbVector




In [12]:
T_Independent_Variables = df.drop(['y', 'c'], axis=1)

features_list = list(T_Independent_Variables.columns)

In [13]:
func_gain(df,T_Independent_Variables)

ProbVector {'x1': 0.24385377919392037, 'x2': 0.2459310065543047, 'x3': 0.24188652834457627, 'x4': 0.26832868590719855}


{'x1': 0.24385377919392037,
 'x2': 0.2459310065543047,
 'x3': 0.24188652834457627,
 'x4': 0.26832868590719855}

## 1.2. Updating Probablities using Greedy method

In [None]:
def mean_squared_error(y, x1, x2, x3, x4):
    y_pred = x1*x2
    loss = np.mean((y - y_pred)**2)
    return loss

def loss_calculator(df,a):
    df['Loss'] = df.apply(lambda row: mean_squared_error(row['y'], a[0]*row['x1'], a[1]*row['x2'], a[2]*row['x3'], a[3]*row['x4']), axis=1)
    return df

def performance_calculator(sampled_df,a,i):
    a[i] = 0
    loss_calculator(sampled_df,a)
    mean = sampled_df['Loss'].mean()
    a[i] = 1
    return mean

## 2. Generating Solutions

In [46]:
def solution_generator(n,ProbVector,excluded_features):

    solution_list = []
    for i in range(0,n):
        solution = {}
        for a in features_list:
            if a in excluded_features:
                solution[a] = 0
            else:
                solution[a] = bernoulli(ProbVector[a]).rvs()
        #print(solution)
        solution_list.append(solution)

    return solution_list

In [47]:
ProbVector = func_gain(df,T_Independent_Variables)

solution_generator(10,ProbVector,['x1'])

[{'x1': 0, 'x2': 0, 'x3': 0, 'x4': 1},
 {'x1': 0, 'x2': 1, 'x3': 1, 'x4': 1},
 {'x1': 0, 'x2': 1, 'x3': 1, 'x4': 1},
 {'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0},
 {'x1': 0, 'x2': 1, 'x3': 1, 'x4': 0},
 {'x1': 0, 'x2': 0, 'x3': 0, 'x4': 1},
 {'x1': 0, 'x2': 0, 'x3': 0, 'x4': 0},
 {'x1': 0, 'x2': 0, 'x3': 1, 'x4': 1},
 {'x1': 0, 'x2': 1, 'x3': 0, 'x4': 1},
 {'x1': 0, 'x2': 0, 'x3': 0, 'x4': 0}]