# Anonypy

In [9]:
pip install category-encoders==2.6.2

Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install category-encoders==2.6.2

Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install pandas==2.1.1

Collecting pandas==2.1.1
  Downloading pandas-2.1.1-cp311-cp311-macosx_11_0_arm64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tzdata>=2022.1 (from pandas==2.1.1)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tzdata, pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.5.3
    Uninstalling pandas-1.5.3:
      Successfully uninstalled pandas-1.5.3
Successfully installed pandas-2.1.1 tzdata-2023.3
Note: you may need to restart the kernel to use updated packages.


In [12]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd

In [13]:
def is_k_anonymous(partition, k):
    if len(partition) < k:
        return False
    return True


def is_l_diverse(df, partition, sensitive_column, l):
    diversity = len(df.loc[partition][sensitive_column].unique())
    return diversity >= l


def is_t_close(df, partition, sensitive_column, global_freqs, p):
    total_count = float(len(partition))
    d_max = None
    group_counts = (
        df.loc[partition].groupby(sensitive_column)[sensitive_column].agg("count")
    )
    for value, count in group_counts.to_dict().items():
        p = count / total_count
        d = abs(p - global_freqs[value])
        if d_max is None or d > d_max:
            d_max = d
    return d_max <= p


def get_global_freq(df, sensitive_column):
    global_freqs = {}
    total_count = float(len(df))
    group_counts = df.groupby(sensitive_column)[sensitive_column].agg("count")

    for value, count in group_counts.to_dict().items():
        p = count / total_count
        global_freqs[value] = p
    return global_freqs


In [14]:
class Mondrian:
    def __init__(self, df, feature_columns, sensitive_column=None):
        self.df = df
        self.feature_columns = feature_columns
        self.sensitive_column = sensitive_column

    def is_valid(self, partition, k=2, l=0, p=0.0):
        # k-anonymous
        if not is_k_anonymous(partition, k):
            return False
        # l-diverse
        if l > 0 and self.sensitive_column is not None:
            diverse = is_l_diverse(
                self.df, partition, self.sensitive_column, l
            )
            if not diverse:
                return False
        # t-close
        if p > 0.0 and self.sensitive_column is not None:
            global_freqs = get_global_freq(self.df, self.sensitive_column)
            close = is_t_close(
                self.df, partition, self.sensitive_column, global_freqs, p
            )
            if not close:
                return False

        return True

    def get_spans(self, partition, scale=None):
        spans = {}
        for column in self.feature_columns:
            if self.df[column].dtype.name == "category":
                span = len(self.df[column][partition].unique())
            else:
                span = (
                    self.df[column][partition].max() - self.df[column][partition].min()
                )
            if scale is not None:
                span = span / scale[column]
            spans[column] = span
        return spans

    def split(self, column, partition):
        dfp = self.df[column][partition]
        if dfp.dtype.name == "category":
            values = dfp.unique()
            lv = set(values[: len(values) // 2])
            rv = set(values[len(values) // 2 :])
            return dfp.index[dfp.isin(lv)], dfp.index[dfp.isin(rv)]
        else:
            median = dfp.median()
            dfl = dfp.index[dfp < median]
            dfr = dfp.index[dfp >= median]
            return (dfl, dfr)

    def partition(self, k=3, l=0, p=0.0):
        scale = self.get_spans(self.df.index)

        finished_partitions = []
        partitions = [self.df.index]
        while partitions:
            partition = partitions.pop(0)
            spans = self.get_spans(partition, scale)
            for column, span in sorted(spans.items(), key=lambda x: -x[1]):
                lp, rp = self.split(column, partition)
                if not self.is_valid(lp, k, l, p) or not self.is_valid(rp, k, l, p):
                    continue
                partitions.extend((lp, rp))
                break
            else:
                finished_partitions.append(partition)
        return finished_partitions


In [15]:
class Preserver:
    def __init__(self, df, feature_columns, sensitive_column):
        self.modrian = Mondrian(df, feature_columns, sensitive_column)

    def __anonymize(self, k, l=0, p=0.0):
        partitions = self.modrian.partition(k, l, p)
        return anonymize(
            self.modrian.df,
            partitions,
            self.modrian.feature_columns,
            self.modrian.sensitive_column,
        )

    def anonymize_k_anonymity(self, k):
        return self.__anonymize(k)

    def anonymize_l_diversity(self, k, l):
        return self.__anonymize(k, l=l)

    def anonymize_t_closeness(self, k, p):
        return self.__anonymize(k, p=p)

    def __count_anonymity(self, k, l=0, p=0.0):
        partitions = self.modrian.partition(k, l, p)
        return count_anonymity(
            self.modrian.df,
            partitions,
            self.modrian.feature_columns,
            self.modrian.sensitive_column,
        )

    def count_k_anonymity(self, k):
        return self.__count_anonymity(k)

    def count_l_diversity(self, k, l):
        return self.__count_anonymity(k, l=l)

    def count_t_closeness(self, k, p):
        return self.__count_anonymity(k, p=p)


def agg_categorical_column(series):
    # this is workaround for dtype bug of series
    series.astype("category")

    l = [str(n) for n in set(series)]
    return [",".join(l)]


def agg_numerical_column(series):
    # return [series.mean()]
    minimum = series.min()
    maximum = series.max()
    if maximum == minimum:
        string = str(maximum)
    else:
        string = f"{minimum}-{maximum}"
    return [string]


def anonymize(df, partitions, feature_columns, sensitive_column, max_partitions=None):
    aggregations = {}
    for column in feature_columns:
        if df[column].dtype.name == "category":
            aggregations[column] = agg_categorical_column
        else:
            aggregations[column] = agg_numerical_column
    rows = []
    for i, partition in enumerate(partitions):
        if max_partitions is not None and i > max_partitions:
            break
        grouped_columns = df.loc[partition].agg(aggregations, squeeze=False)
        sensitive_counts = (
            df.loc[partition].groupby(sensitive_column).agg({sensitive_column: "count"})
        )
        values = grouped_columns.apply(lambda x: x[0]).to_dict()
        for sensitive_value, count in sensitive_counts[sensitive_column].items():
            if count == 0:
                continue
            values.update(
                {
                    sensitive_column: sensitive_value,
                    "count": count,
                }
            )
            rows.append(values.copy())
    return rows


def count_anonymity(
    df, partitions, feature_columns, sensitive_column, max_partitions=None
):
    aggregations = {}
    for column in feature_columns:
        if df[column].dtype.name == "category":
            aggregations[column] = agg_categorical_column
        else:
            aggregations[column] = agg_numerical_column
    aggregations[sensitive_column] = "count"
    rows = []
    for i, partition in enumerate(partitions):
        if max_partitions is not None and i > max_partitions:
            break
        grouped_columns = df.loc[partition].agg(aggregations, squeeze=False)
        values = grouped_columns.apply(lambda x: x[0]).to_dict()
        rows.append(values.copy())
    return rows


In [16]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KDTree
import category_encoders as ce


class RecordLinkage:
    def __init__(self, df, knowledge):
        self.df = df
        self.knowledge = knowledge

        categories = (df.dtypes == "object").keys().to_list()
        self.enc = ce.OneHotEncoder(cols=categories, drop_invariant=False)
        df_concat = pd.concat([self.df, self.knowledge], ignore_index=True)
        self.enc.fit(df_concat)

    def execute(self, k=3):
        enc_df = self.enc.transform(self.df).astype("float64").values
        enc_knowledge = self.enc.transform(self.knowledge).astype("float64").values

        tree = KDTree(enc_df)
        dist, index = tree.query(enc_knowledge, k=k)
        return dist, index


def attack(df, knowledge):
    k = 3
    a = RecordLinkage(df, knowledge)
    dist, index = a.execute(k)

    di = pd.DataFrame(np.hstack((index, dist)))

    di.loc[di[3] > di[3].median(), :] = -1
    # Display the top three
    return di.iloc[:, 0:k].astype(int)


# Anonymize data with K Anonymity 
Cardinality of any query result on released data should be at least k i.e. each equivalence class should encompass atleast k entries

In [17]:
# Load the dataset
df = pd.read_csv("Real-like Fake Data.csv")
df.head(10)

Unnamed: 0,First name,Last name,Age,Gender,Household-Income,Household-Zone,State,Car-Make,Car-Model,Car-Colour,Loan-Amount
0,Ikey,Dowty,30,Male,114000,Highly Rural,NY,Porsche,911,Purple,42000
1,Magda,Nealand,45,Female,113000,Rural,NV,Porsche,911,Purple,50000
2,Natalya,Offner,41,Female,153000,Highly Rural,TX,Porsche,911,Red,15000
3,Yasmin,Findlow,46,Female,238000,Suburban,NJ,Porsche,911,Blue,91000
4,Ericha,Dressel,47,Female,133000,Highly Rural,FL,Mercedes-Benz,300E,Green,95000
5,Bibby,Evershed,91,Female,133000,Suburban,NY,Mercedes-Benz,400SEL,Red,55000
6,Domenico,Cadman,57,Male,108000,Highly Urban,NY,Mercedes-Benz,400SEL,Red,47000
7,Dorisa,Skill,37,Female,217000,Highly Rural,CT,Mercedes-Benz,400SEL,Green,99000
8,Tann,Lever,45,Male,134000,Rural,PA,Mercedes-Benz,400SEL,Yellow,50000
9,Corby,Van Der Straaten,28,Male,122000,Suburban,CA,Mercedes-Benz,500SEL,Red,86000


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   First name        250 non-null    object
 1   Last name         250 non-null    object
 2   Age               250 non-null    int64 
 3   Gender            250 non-null    object
 4   Household-Income  250 non-null    int64 
 5   Household-Zone    250 non-null    object
 6   State             250 non-null    object
 7   Car-Make          250 non-null    object
 8   Car-Model         250 non-null    object
 9   Car-Colour        250 non-null    object
 10  Loan-Amount       250 non-null    int64 
dtypes: int64(3), object(8)
memory usage: 21.6+ KB


In [19]:
# Drop first name and last name since they are PII
df = df.drop(["First name", "Last name"], axis=1)
df.head()

Unnamed: 0,Age,Gender,Household-Income,Household-Zone,State,Car-Make,Car-Model,Car-Colour,Loan-Amount
0,30,Male,114000,Highly Rural,NY,Porsche,911,Purple,42000
1,45,Female,113000,Rural,NV,Porsche,911,Purple,50000
2,41,Female,153000,Highly Rural,TX,Porsche,911,Red,15000
3,46,Female,238000,Suburban,NJ,Porsche,911,Blue,91000
4,47,Female,133000,Highly Rural,FL,Mercedes-Benz,300E,Green,95000


In [20]:
# Identify categorical columns and modify datatype to category
categorical = [
    "Gender",
    "Household-Zone",
    "State",
    "Car-Make",
    "Car-Model",
    "Car-Colour"
]

for name in categorical:
    df[name] = df[name].astype("category")
    
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Age               250 non-null    int64   
 1   Gender            250 non-null    category
 2   Household-Income  250 non-null    int64   
 3   Household-Zone    250 non-null    category
 4   State             250 non-null    category
 5   Car-Make          250 non-null    category
 6   Car-Model         250 non-null    category
 7   Car-Colour        250 non-null    category
 8   Loan-Amount       250 non-null    int64   
dtypes: category(6), int64(3)
memory usage: 12.6 KB


In [21]:
# Convert loan value to loan rating
df["Loan-Rating"] = pd.cut(df["Loan-Amount"], 5, labels=["Very low", "Low", "Moderate", "High", "Very high"])
df["Loan-Rating"].value_counts()

Low          69
Moderate     49
Very high    45
High         44
Very low     43
Name: Loan-Rating, dtype: int64

In [22]:
# Identify feature columns and sensitive colum
feature_columns = [
    "Gender", 
    "Age",
    "Household-Income"
]

sensitive_column = "Loan-Rating"

In [23]:
# Perform anonymization using K anonymity
# k = min number of entries in equivalence class

p = Preserver(df, feature_columns, sensitive_column)
rows = p.anonymize_k_anonymity(k=3)

dfn = pd.DataFrame(rows)
dfn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183 entries, 0 to 182
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Gender            183 non-null    object
 1   Age               183 non-null    object
 2   Household-Income  183 non-null    object
 3   Loan-Rating       183 non-null    object
 4   count             183 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 7.3+ KB


In [24]:
dfn.head(40)

Unnamed: 0,Gender,Age,Household-Income,Loan-Rating,count
0,Male,21-36,54000-61000,Very low,2
1,Male,21-36,54000-61000,Very high,1
2,Male,23-37,65000-107000,Low,1
3,Male,23-37,65000-107000,High,1
4,Male,23-37,65000-107000,Very high,2
5,Male,45-51,49000-54000,Very low,1
6,Male,45-51,49000-54000,Moderate,1
7,Male,45-51,49000-54000,Very high,1
8,Male,40-45,62000-106000,Very low,1
9,Male,40-45,62000-106000,Low,1


# Evaluation of k-anonymity
- Background Knowledge Attack: Fails since there is no way to identify sensitive information
- Homogeneity Attack: Possible if the entire equivalence class has the same sensitive data 🥲

In [25]:
dfn[dfn["count"] >= 4]

Unnamed: 0,Gender,Age,Household-Income,Loan-Rating,count
38,Male,74-83,94000-134000,Very high,4


# L Diversity
K diversity with aleast l values for sensitive information in each equivalence class.The l-diversity model adds the promotion of intra-group diversity for sensitive values in the anonymization mechanism.

In [26]:
# Perform anonymization using K anonymity with L diversity
# k = min number of entries in equivalence class
# l = min number of unique sensitive values in equivalence class

rows = p.anonymize_l_diversity(k=3, l=3)

dfn_l = pd.DataFrame(rows)

In [27]:
dfn_l.head(10)

Unnamed: 0,Gender,Age,Household-Income,Loan-Rating,count
0,Male,21-37,54000-107000,Very low,2
1,Male,21-37,54000-107000,Low,1
2,Male,21-37,54000-107000,High,1
3,Male,21-37,54000-107000,Very high,3
4,Male,74-94,47000-80000,Low,2
5,Male,74-94,47000-80000,Moderate,3
6,Male,74-94,47000-80000,High,4
7,Male,74-93,88000-136000,Low,2
8,Male,74-93,88000-136000,Moderate,2
9,Male,74-93,88000-136000,Very high,5


In [28]:
dfn_l.tail(10)

Unnamed: 0,Gender,Age,Household-Income,Loan-Rating,count
161,Female,65-73,212000-245000,Low,2
162,Female,65-73,212000-245000,Moderate,1
163,Female,65-73,212000-245000,High,1
164,Female,86-94,173000-207000,Very low,2
165,Female,86-94,173000-207000,Low,1
166,Female,86-94,173000-207000,Very high,2
167,Female,74-83,215000-240000,Very low,1
168,Female,74-83,215000-240000,Moderate,1
169,Female,74-83,215000-240000,High,2
170,Female,74-83,215000-240000,Very high,1


# T Closeness
l-diverse method produces partitions that contain a very large number of entries for one value of the sensitive attribute and only one entry for the other value. This is not ideal as while there is "plausible deniability" for a person in the dataset but an adversary can still be very certain about the person's attribute value in that case.

t-closeness solves this problem by making sure that the distribution of sensitive attribute values in a given partition is similar to the distribution of the values in the overall dataset.

In [29]:
# Perform anonymization using K anonymity with L diversity
# k = min number of entries in equivalence class
# p = distribution of the sensitive info in equivalence class

rows = p.anonymize_t_closeness(k=3, p=0.5)

In [30]:
dfn_t = pd.DataFrame(rows)
dfn_t

Unnamed: 0,Gender,Age,Household-Income,Loan-Rating,count
0,Male,51-97,46000-243000,Very low,7
1,Male,51-97,46000-243000,Low,25
2,Male,51-97,46000-243000,Moderate,13
3,Male,51-97,46000-243000,High,10
4,Male,51-97,46000-243000,Very high,9
...,...,...,...,...,...
57,Female,25-46,226000-238000,High,1
58,Female,25-46,226000-238000,Very high,1
59,Female,25-46,242000-248000,Moderate,2
60,Female,25-46,242000-248000,High,1
