<h3> Mutual Information Coding Practice </h3>

Code for computing the mutual information of two discrete variables, as defined on Wikipedia:
    https://en.wikipedia.org/wiki/Mutual_information

In [41]:
import pandas as pd
import numpy as np
import itertools

In [92]:
def pointwise_mi(df,a_val,b_val):
    '''
    compute the point-wise mutual information for one specific outcome pair of a and b
    df: the data frame, expected to have columns 'a' and 'b' with no missing values
    a_val: the specific outcome of 'a' tested (e.g., 1 or 0 for a boolean outcome)
    b_val: the specific outcome of 'b' tested (e.g., 1 or 0 for a boolean outcome)

    returns a tuple (p_mi, p_a_b):
        p_mi:  natural log of p(a,b) / (p(a) * p(b)); -inf when the pair never
               occurs together
        p_a_b: the joint probability of observing a_val and b_val in the same row

    a_val and b_val are looked up in the value counts of their columns, so an
    outcome that never occurs in its column makes that lookup fail

    this function only works for discrete outcomes
    '''

    tot = float(df.a.count()) #number of non-null rows in 'a', as a float
    a_cnts = df.a.value_counts()[a_val]
    b_cnts = df.b.value_counts()[b_val]

    p_a = a_cnts / tot #marginal probability of the outcome a
    p_b = b_cnts / tot #marginal probability of the outcome b

    #count the rows holding both outcomes with a vectorized mask rather than
    #calling a python function once per row
    tot_a_b = ((df.a == a_val) & (df.b == b_val)).sum()
    p_a_b = tot_a_b / tot

    if tot_a_b == 0:
        #log(0) is -inf; return it directly instead of triggering numpy's
        #divide-by-zero warning
        return -np.inf, p_a_b

    p_mi = np.log(p_a_b / (p_a * p_b))
    return p_mi, p_a_b
    

In [30]:
def both_vals(x,a_val,b_val):
    '''
    flag whether one record holds the requested pair of outcomes
    x: an object exposing 'a' and 'b' attributes (e.g. a single row of the data frame)
    a_val: the outcome that x.a must equal
    b_val: the outcome that x.b must equal

    returns 1 when both comparisons hold, otherwise 0
    '''
    is_match = x.a == a_val and x.b == b_val
    return 1 if is_match else 0

In [45]:
def mututal_information(df):
    '''
    returns the mutual information of two probabilistic outcomes
    df: a data frame with two columns, 'a' and 'b', which contain any number of different,
    discrete outcomes. Outcomes can be integers, strings, etc, but are assumed to be discrete

    the result is the sum over every (a, b) outcome pair of
    p(a,b) * log( p(a,b) / (p(a) * p(b)) ), using the natural log. Pairs that never
    occur together contribute nothing (the 0 * log 0 = 0 convention)
    '''

    a_set = set(df.a.unique()) #the set of values in a
    b_set = set(df.b.unique()) #the set of values in b
    mi_terms = list() #the weighted pointwise term for each observed pair of a and b

    #walk the cartesian product of a_set and b_set
    for a_val, b_val in itertools.product(a_set,b_set):
        pmi, p_a_b = pointwise_mi(df,a_val,b_val) #get the pointwise mutual information
        if p_a_b > 0:
            #a pair with zero joint probability has pmi == -inf, and 0 * -inf is nan,
            #which would turn the whole sum into nan; such pairs are skipped
            mi_terms.append(p_a_b * pmi)

    mi = np.sum(np.array(mi_terms)) #mutual information

    return mi
    
    

In [85]:
def test_sets(test_type):
    '''make test sets for data with known values'''
    if test_type == 'ind': #independent values
        a = np.random.binomial(1,0.5,10000)
        b = np.random.binomial(1,0.5,10000)
        both = pd.DataFrame({'a':a,'b':b})
    if test_type == 'all_same':
        a = [1] * 100
        b = [1] * 100
    if test_type == 'middle':
        a = np.random.binomial(1,0.5,10000)
        b = list()
        for i in a:
            if np.random.binomial(1,0.5):
                b.append(np.random.binomial(1,i))
            else:
                b.append(np.random.binomial(1,0.5))
        both = pd.DataFrame({'a':a,'b':b})
        return both
    else:
        raise 'unknown type'
    return both

In [95]:
#build the 'middle' test set and compute its mutual information with the code above
df = test_sets('middle')
my_mi = mututal_information(df)

<h4> As a final test, check how the output compares to scikit-learn's MI function </h4>

In [103]:
from sklearn.metrics import mutual_info_score
sk_mi = mutual_info_score(df.a,df.b)
#check that both functions agree to within an absolute tolerance of 1e-10;
#rounding each value and testing == can report a mismatch when two nearly
#identical values fall on opposite sides of a rounding boundary
np.isclose(sk_mi, my_mi, rtol=0, atol=1e-10)

True