In [1]:
import pandas as pd
import numpy as np 

In [24]:
# Load the sample dataset from a CSV file in the working directory and preview the first rows.
df = pd.read_csv('sample_data.csv')
df.head()

Unnamed: 0,Gender,Age,Marital_Status,Country_Birth,Race,Alcohol Average/Day,Ever Used Hard Drugs
0,F,30,Married,Other,Other Hispanic,3,No
1,F,30,Married,US,Non-Hispanic White,3,No
2,F,30,Never Married,Other,Non-Hispanic Black,2,No
3,F,30,Never Married,Other,Other,4,No
4,F,30,Never Married,US,Non-Hispanic Black,2,No


In [21]:
# Quasi identifiers (QIs) used to form the equivalence classes.
qis = ['Age','Gender']
# Count rows per QI combination. The constant helper column is added to a
# temporary frame through assign(), so the shared `df` is not mutated by this
# cell and re-running it (or later cells) always sees the frame as loaded.
df_grouped = df.assign(DUMMY_COUNTER=1).groupby(qis).count()
df_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Marital_Status,Country_Birth,Race,Alcohol Average/Day,Ever Used Hard Drugs,DUMMY_COUNTER
Age,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
30,F,7,7,7,7,7,7
30,M,11,11,11,11,11,11
31,F,7,7,7,7,7,7
31,M,16,16,16,16,16,16
32,F,7,7,7,7,7,7
32,M,7,7,7,7,7,7


In [36]:
# Per-group row counts (one entry per Age/Gender combination) as a plain Python list.
df_grouped['DUMMY_COUNTER'].tolist()

[7, 11, 7, 16, 7, 7]

In [28]:
# Show the per-column counts of the first equivalence class.
# NOTE(review): the previous bare attribute reference would only display a
# bound-method repr; positional row access matches the recorded output.
df_grouped.iloc[0]

Marital_Status          7
Country_Birth           7
Race                    7
Alcohol Average/Day     7
Ever Used Hard Drugs    7
DUMMY_COUNTER           7
Name: (30, F), dtype: int64

In [25]:
# Add a column to a copy, then display the original frame: changes made to the
# copy do not show up in `df`.
df2 = df.copy()
df2['BOO'] = 1  # a scalar is broadcast to every row; no per-row lambda needed
df.head()

Unnamed: 0,Gender,Age,Marital_Status,Country_Birth,Race,Alcohol Average/Day,Ever Used Hard Drugs
0,F,30,Married,Other,Other Hispanic,3,No
1,F,30,Married,US,Non-Hispanic White,3,No
2,F,30,Never Married,Other,Non-Hispanic Black,2,No
3,F,30,Never Married,Other,Other,4,No
4,F,30,Never Married,US,Non-Hispanic Black,2,No


In [42]:
def get_k_anonymity(df,quasi_identifiers):
    """
    Function to return the largest value of k for which a table satisfies k-Anonymity,
    i.e. the size of its smallest equivalence class.

    @Params:
        df: Pandas DataFrame which is to be tested (it is not modified)
        quasi_identifiers: Non-empty list of attributes; must be a subset of the columns of df

    Returns:
        A python dictionary which consists of two items:
            k: The number of rows in the smallest equivalence class
            equivalence_classes: List of the equivalence classes that contain exactly k rows.
                With several quasi identifiers each entry is a tuple of values (in the order
                of quasi_identifiers); with a single quasi identifier each entry is the bare value.

    Raises:
        ValueError: if quasi_identifiers is empty, or if df yields no equivalence classes
            (for example when it has no rows)
        AssertionError: if a quasi identifier is not a column of df
    """
    quasi_identifiers = list(quasi_identifiers)
    if not quasi_identifiers:
        raise ValueError("At least one quasi identifier is required")
    assert set(quasi_identifiers).issubset(set(df.columns)), "One or more quasi identifiers is not in the data frame columns"

    # size() counts the rows of every group directly. No helper column has to be
    # injected into a copy of the frame, so this also works when the quasi
    # identifiers cover every column or when a column is already called
    # 'DUMMY_COUNTER'.
    class_sizes = df.groupby(quasi_identifiers).size()
    if class_sizes.empty:
        raise ValueError("Cannot compute k-Anonymity: the data frame contains no equivalence classes")

    k = class_sizes.min()

    # Keep only the equivalence classes that attain the minimum size.
    smallest_classes = class_sizes.index[class_sizes == k].tolist()

    return {'k': k,
            'equivalence_classes': smallest_classes}
    
    
    
    
   
    
# Treat every column of the frame as a quasi identifier.
get_k_anonymity(df, quasi_identifiers=df.columns.tolist())
    

{'k': 1,
 'equivalence_classes': [('F',
   30,
   'Married',
   'Other',
   'Other Hispanic',
   3,
   'No'),
  ('F', 30, 'Married', 'US', 'Non-Hispanic White', 3, 'No'),
  ('F', 30, 'Never Married', 'Other', 'Non-Hispanic Black', 2, 'No'),
  ('F', 30, 'Never Married', 'Other', 'Other', 4, 'No'),
  ('F', 30, 'Never Married', 'US', 'Non-Hispanic Black', 2, 'No'),
  ('F', 30, 'Never Married', 'US', 'Non-Hispanic Black', 4, 'No'),
  ('F', 30, 'Never Married', 'US', 'Non-Hispanic White', 2, 'No'),
  ('F', 31, 'Living W/ Partner', 'US', 'Non-Hispanic White', 4, 'Yes'),
  ('F', 31, 'Living W/ Partner', 'US', 'Other Hispanic', 5, 'No'),
  ('F', 31, 'Married', 'Other', 'Mexican American', 3, 'No'),
  ('F', 31, 'Married', 'Other', 'Other Hispanic', 1, 'No'),
  ('F', 31, 'Married', 'US', 'Mexican American', 1, 'No'),
  ('F', 31, 'Married', 'US', 'Other Hispanic', 2, 'No'),
  ('F', 31, 'Never Married', 'US', 'Non-Hispanic Black', 1, 'No'),
  ('F', 32, 'Living W/ Partner', 'US', 'Non-Hispanic Blac

In [49]:
def get_k_reverse_membership(df,equivalence_class):
    """
    Count the rows of df that belong to the given equivalence class.

    Params:
        df: Pandas DataFrame to search (it is not modified)
        equivalence_class: Dictionary mapping a quasi identifier (column name of df)
            to the value a row must have in that column

    Returns:
        The number of rows of df that equal the requested value in every listed column.
        An empty dictionary places no restriction, so the total row count is returned.

    Raises:
        AssertionError: if a key of equivalence_class is not a column of df
    """
    # Check that the QI are in columns
    assert set(equivalence_class.keys()).issubset(set(df.columns)), "One or more quasi identifiers is not in the data frame columns"

    # Boolean filtering always returns a new frame, so the input can be narrowed
    # step by step without first taking a (deep) copy of it.
    matching_rows = df
    for qi, value in equivalence_class.items():
        matching_rows = matching_rows[matching_rows[qi] == value]

    return matching_rows.shape[0]
    
    
    
    
    
    

In [61]:
def get_full_k_anonymity_report(df, quasi_identifiers):
    """
    Function to list every equivalence class of the given quasi_identifiers together with
    its size, i.e. the largest value of k for which that class satisfies k-Anonymity.

    Params:
        df: Pandas DataFrame which is to be tested (it is not modified)
        quasi_identifiers: Non-empty list of attributes; must be a subset of the columns of df

    Returns:
        A pair (equivalence_classes, k_values) of two parallel lists:
            equivalence_classes: One tuple per equivalence class holding its quasi identifier
                values in the order of quasi_identifiers (a 1-tuple for a single quasi
                identifier), ordered by group key
            k_values: The number of rows of df in the corresponding equivalence class

    Raises:
        ValueError: if quasi_identifiers is empty
        AssertionError: if a quasi identifier is not a column of df
    """
    quasi_identifiers = list(quasi_identifiers)
    if not quasi_identifiers:
        raise ValueError("At least one quasi identifier is required")

    # Check that quasi_identifiers is a subset of dataframe columns
    assert set(quasi_identifiers).issubset(set(df.columns)), "One or more quasi identifiers is not in the data frame columns"

    # size() yields the row count of every group directly, so no helper column
    # (and no copy of the frame) is needed, even when the quasi identifiers
    # cover every column.
    class_sizes = df.groupby(quasi_identifiers).size()

    # Read keys and counts from the grouped result column-wise rather than row
    # by row: pulling a whole row out of a mixed numeric frame would upcast
    # integer values and counts to float.
    group_keys = class_sizes.index.tolist()
    if class_sizes.index.nlevels > 1:
        equivalence_classes = group_keys
    else:
        # A single quasi identifier produces bare values; wrap them so every
        # equivalence class is reported as a tuple.
        equivalence_classes = [(value,) for value in group_keys]
    k_values = class_sizes.tolist()

    return equivalence_classes, k_values
        
    
        

In [62]:
# Per-class report for the Age/Gender quasi identifiers:
# `a` receives the equivalence classes, `b` the matching class sizes.
a,b = get_full_k_anonymity_report(df, ['Age','Gender'])

In [63]:
# Equivalence classes: one tuple of (Age, Gender) values per group.
a

[(30, 'F'), (30, 'M'), (31, 'F'), (31, 'M'), (32, 'F'), (32, 'M')]

In [64]:
# Row count of each equivalence class, in the same order as `a`.
b

[7, 11, 7, 16, 7, 7]