In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
def indices_of_top_k(arr, k):
    """Return the indices of the ``k`` largest entries of ``arr``, sorted ascending.

    Parameters
    ----------
    arr : array-like
        Values to rank; converted with ``np.array``. Presumably a
        one-dimensional sequence such as feature importances.
    k : int
        Number of largest entries to select.

    Returns
    -------
    numpy.ndarray
        Indices of the ``k`` largest values in increasing index order.
        Empty when ``k`` is 0.

    Notes
    -----
    ``np.argpartition`` does not define which of several tied values ends up
    in the top ``k``.
    """
    if k == 0:
        # ``[-0:]`` is the same slice as ``[0:]``, so without this guard a
        # request for zero entries would return every index.
        return np.array([], dtype=np.intp)
    partitioned = np.argpartition(np.array(arr), -k)
    return np.sort(partitioned[-k:])

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that extracts a fixed set of columns as a raw array.

    ``attribute_names`` is whatever column selector ``X[...]`` accepts;
    X is presumably a pandas DataFrame (it must support ``X[names].values``).
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer unchanged."""
        return self

    def transform(self, X, y=None):
        """Select the configured columns from ``X`` and return their ``.values``."""
        selected = X[self.attribute_names]
        return selected.values
    
class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps the columns with the ``k`` largest importances.

    ``feature_importances`` holds one score per column of the array later
    passed to ``transform``; ``k`` is how many columns to keep.
    """

    def __init__(self, feature_importances, k):
        self.k = k
        self.feature_importances = feature_importances

    def fit(self, X, y=None):
        """Compute and store the selected column indices in ``feature_indices_``."""
        top_columns = indices_of_top_k(self.feature_importances, self.k)
        self.feature_indices_ = top_columns
        return self

    def transform(self, X):
        """Return the columns of ``X`` chosen during ``fit``.

        ``X`` is indexed as ``X[:, indices]``, so it must support
        two-dimensional array-style indexing.
        """
        columns = self.feature_indices_
        return X[:, columns]

def get_dict(s):
    """Evaluate ``s`` as a Python expression, falling back to ``{}`` on error.

    Parameters
    ----------
    s : str
        Text of a Python expression (presumably a stringified dict or list
        of dicts). Non-string inputs such as ``None`` or ``nan`` make
        ``eval`` raise and therefore yield ``{}``.

    Returns
    -------
    object
        Whatever the expression evaluates to, or an empty dict when
        evaluation raises an ordinary exception.
    """
    # SECURITY: eval() executes arbitrary code. Only call this on trusted
    # data; if the inputs are always plain literals, ast.literal_eval is the
    # safe replacement.
    try:
        return eval(s)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must propagate instead of being turned into an empty dict.
        return {}

def GKTau(data_frame, feature_1, feature_2):
    """Compute Goodman and Kruskal's tau for predicting ``feature_2`` from ``feature_1``.

    With ``pi_ij`` the joint proportion of cell ``(i, j)``, ``pi_i+`` the
    marginal proportion of ``feature_1`` level ``i`` and ``pi_+j`` the
    marginal proportion of ``feature_2`` level ``j``::

        V(y)      = 1 - sum_j  pi_+j ** 2
        E[V(y|x)] = 1 - sum_ij pi_ij ** 2 / pi_i+
        tau       = (V(y) - E[V(y|x)]) / V(y)

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Table containing both feature columns.
    feature_1 : column label
        The explanatory (conditioning) variable.
    feature_2 : column label
        The variable whose variation is being explained.

    Returns
    -------
    float
        The tau value. The measure is directional: swapping the two
        features generally gives a different result.
    """
    # Joint distribution: one entry per (feature_1, feature_2) combination.
    # observed=True keeps only combinations that actually occur, so unused
    # categorical levels cannot introduce 0/0 terms below.
    cell_counts = data_frame.groupby([feature_1, feature_2], observed=True).size()
    pi_i_j = cell_counts / cell_counts.sum()

    # Marginal of feature_1, broadcast back onto every cell of its row.
    pi_i_plus = pi_i_j.groupby(level=0, observed=True).transform('sum')
    # Marginal of feature_2, one entry per level.
    pi_j_plus = pi_i_j.groupby(level=1, observed=True).sum()

    # Combinations that never occur contribute nothing, so summing over the
    # observed cells is the same as summing over the full contingency table.
    expected_y_x = 1 - (pi_i_j ** 2 / pi_i_plus).sum()
    vy = 1 - (pi_j_plus ** 2).sum()

    # The denominator has the constant 10^(-10) added to prevent division by
    # zero when feature_2 takes a single value (vy == 0).
    tau = (vy - expected_y_x) / (vy + 10 ** -10)
    return tau