## Anonymizing customer_information.csv and calculating the k-anonymity of the new dataset

Importing necessary packages

In [20]:
import pandas as pd
import numpy as np
import hashlib
import re
import os
import math
from geopy.geocoders import Nominatim
from functions import *

Loading the original dataset

In [21]:
# Load the raw customer records that are going to be anonymised
data = pd.read_csv(filepath_or_buffer="Data/customer_information.csv")

# The anonymised table starts out empty and is filled one column at a time,
# instead of copying `data` and dropping the direct identifiers (given_name,
# surname, phone_number, national_insurance_number, bank_account_number)
anon_data = pd.DataFrame(data=None)

In [22]:
def _format_nin(raw_nin):
    """Strip all spaces from a NIN, then re-insert one space after every two characters."""
    compact = raw_nin.replace(' ', '')
    # The negative lookahead stops a trailing space being added after the final pair
    return re.sub(r'(.{2})(?!$)', '\\1 ', compact)


# A fresh random 16-byte key is drawn every time this cell runs
key = os.urandom(16)

# NOTE(review): `hash` is presumably the helper star-imported from `functions`
# (it shadows the builtin). It is called with (key, formatted NIN) and each
# result is unpacked into three values below - confirm against functions.py.
hashed_rows = data["national_insurance_number"].apply(lambda nin: hash(key, _format_nin(nin)))
clean_nins, sample_ids, salts = zip(*hashed_rows)

# First value replaces the NIN column, second becomes the Sample ID; the third
# (`salts`) is kept for the reference table
data["national_insurance_number"] = clean_nins
anon_data['Sample.ID'] = sample_ids

In [23]:
# Reference table linking every hashed NIN back to its NIN, together with the
# salt and (hex-encoded) key that were used for it
reference_table = pd.DataFrame({
    'Hashed.NIN': anon_data['Sample.ID'],
    'Salt': salts,
    'Key': key.hex(),  # single key, repeated on every row
    'NIN': data['national_insurance_number'],
})

In [24]:
# Assign gender unchanged
anon_data['Gender'] = data['gender']

# Assign birthdate as banded birth years
BIRTHYEAR_BAND_WIDTH = 20  # width of each birth-year band, in years
# Keep only the year of each birthdate
birthyears = pd.DatetimeIndex(data['birthdate']).year
# Bands are half-open, [start, start + width). The stop value is one past
# max + width so the last edge is always strictly greater than the latest birth
# year; without the +1, whenever (max - min) is an exact multiple of the width
# the latest year sits on the open right edge and is binned as NaN.
birthyear_edges = np.arange(
    birthyears.min(),
    birthyears.max() + BIRTHYEAR_BAND_WIDTH + 1,
    BIRTHYEAR_BAND_WIDTH,
)
anon_data['Birthyear'] = pd.cut(birthyears, birthyear_edges, right=False)

# Assign postcode as truncated postcode: only the run of letters at the very
# start of the string is kept (empty string if it does not start with a letter)
anon_data['Postcode'] = data['postcode'].apply(lambda x: re.search('[a-zA-Z]*', x).group(0))

# Look up the region column(s) for each truncated postcode. The last column of
# the lookup file is dropped before merging.
postcode_dictionary = pd.read_csv('Data/postcode_region.csv')
anon_data = pd.merge(anon_data, postcode_dictionary.iloc[: , :-1], on='Postcode', how='left')

# Later cells assign columns from `data` by index, so the merge must not have
# multiplied rows (which happens if a postcode appears twice in the lookup)
assert len(anon_data) == len(data), "postcode lookup contains duplicate 'Postcode' keys"

In [25]:
# Assign weight with zero-mean Gaussian noise added (used here instead of banding)
WEIGHT_NOISE_SD = 5  # standard deviation of the noise, in the units of data['weight']

# One draw per record. The length comes from the data rather than a hard-coded
# 1000, so the cell keeps working when the input has a different number of rows.
weight_noise = np.random.normal(0, 1, len(data)) * WEIGHT_NOISE_SD
anon_data['Weight'] = (data['weight'] + weight_noise).round(1)


In [26]:
# Assign height with zero-mean Gaussian noise added (used here instead of banding).
# One draw per record, sized from the data rather than a hard-coded 1000;
# dividing the standard-normal draws by 5 gives a standard deviation of 0.2.
height_noise = np.random.normal(0, 1, len(data)) / 5
anon_data['Height'] = (data['height'] + height_noise).round(2)

# BMI from the original (un-noised) weight and height; it is not added to
# anon_data in this cell
bmi = data['weight'] / data['height']**2

# Show the noised heights next to the originals for a visual comparison
print(anon_data['Height'])
print(data['height'])


0      1.78
1      1.36
2      1.98
3      1.41
4      1.45
       ... 
995    1.71
996    1.79
997    1.99
998    1.31
999    1.60
Name: Height, Length: 1000, dtype: float64
0      1.73
1      1.74
2      1.88
3      1.56
4      1.81
       ... 
995    1.98
996    1.85
997    2.00
998    1.50
999    1.65
Name: height, Length: 1000, dtype: float64


In [27]:
import country_converter as coco
from geopy.geocoders import Nominatim


def parse_country(country_name):
    """Normalise a country name to its short form.

    Parameters
    ----------
    country_name : str
        Country name as it appears in the raw data.

    Returns
    -------
    The result of ``coco.convert(..., to='name_short', include_obsolete=True)``,
    except for 'Svalbard & Jan Mayen Islands', which is mapped to 'Norway'.
    """
    # Hard-coded fix for the unmatched country. It is checked before the lookup
    # so that coco is never asked to convert a name it cannot match.
    if country_name == 'Svalbard & Jan Mayen Islands':
        return 'Norway'
    return coco.convert(country_name, to='name_short', include_obsolete=True)

# Successful lookups keyed by the raw input name, so every distinct country
# costs at most one geocoding request however many rows share it
_HEMISPHERE_CACHE = {}


def country_to_hemisphere2(country_name):
    """Map a country name to "Northern Hemisphere" or "Southern Hemisphere".

    The name is normalised with ``parse_country`` and geocoded with Nominatim;
    a negative latitude means Southern, anything else Northern.

    Parameters
    ----------
    country_name : str
        Country name as it appears in the raw data.

    Returns
    -------
    str
        "Northern Hemisphere", "Southern Hemisphere", or "Error" when the
        geocoding lookup fails (a message is printed in that case).
    """
    # Hard-coded fix for a name that cannot be matched as a country by the
    # geocoder. The Federated States of Micronesia lie north of the equator,
    # so the answer is Northern (the previous hard-coded value was Southern).
    if country_name == "Micronesia":
        return "Northern Hemisphere"

    if country_name in _HEMISPHERE_CACHE:
        return _HEMISPHERE_CACHE[country_name]

    clean_country = parse_country(country_name)
    try:
        # geocode() may return None for an unknown place; the attribute access
        # then raises and is reported as "Error" below
        location = Nominatim(user_agent="CDM").geocode(clean_country)
        hemisphere = ("Southern" if location.latitude < 0 else "Northern") + " Hemisphere"
    except Exception:
        # Best effort: report the failure and carry on with the other rows.
        # Failures are not cached, so a later call retries the lookup.
        print("Input:",country_name,", output:", "Error")
        return "Error"

    _HEMISPHERE_CACHE[country_name] = hemisphere
    return hemisphere
    
# Collapse education level into two bands: the listed degree levels versus
# everything else
_DEGREE_LEVELS = ["bachelor", "masters", "phD"]


def _band_education(level):
    """Return "Higher" for a listed degree level, otherwise "BasicOther"."""
    if level in _DEGREE_LEVELS:
        return "Higher"
    return "BasicOther"


anon_data['Education.Level'] = data['education_level'].map(_band_education)

# Generalise the country of birth to a hemisphere label
anon_data['Location.of.Birth'] = data['country_of_birth'].apply(country_to_hemisphere2)

# Write the anonymised dataset and the NIN reference table as CSV files,
# without the index column
for frame, target in ((anon_data, "output.csv"), (reference_table, "reference_table.csv")):
    frame.to_csv(target, sep=",", index=None)

Input: Micronesia , output: Error


In [28]:
# Show the anonymised dataset as the cell's output for a visual check
anon_data


Unnamed: 0,Sample.ID,Gender,Birthyear,Postcode,Region,Region.1,Weight,Height,Education.Level,Location.of.Birth
0,86dc5b83e5336f539043a6411e4812abd9cf1f2d6ba266...,F,"[1975, 1995)",LS,England,North East,76.7,1.78,Higher,Northern Hemisphere
1,701dc8acfc808dd47c4a49a734561685c4a54e5b765885...,M,"[1995, 2015)",M,England,North West,62.5,1.36,BasicOther,Northern Hemisphere
2,a9de10ea50c1f274761304b1807baa18f2f02d5c3849d4...,F,"[1975, 1995)",SO,England,South East,101.0,1.98,Higher,Northern Hemisphere
3,98fde3f4f52e6be6688a00776a69dbbbb1683e2380236a...,F,"[1995, 2015)",B,England,West Midlands,57.5,1.41,BasicOther,Northern Hemisphere
4,cf6ecece0617faacc5d9966015fd5da67ab4939e007c90...,F,"[1955, 1975)",TQ,England,South West,92.9,1.45,BasicOther,Southern Hemisphere
...,...,...,...,...,...,...,...,...,...,...
995,f00999557e70e5279a69048ba331bdc68d03ec08b40431...,M,"[1955, 1975)",SA,Other,Wales,97.7,1.71,BasicOther,Northern Hemisphere
996,5fb67daf71702a91fad102390bc94c22bcad88c86f6feb...,M,"[1995, 2015)",TS,England,North East,56.5,1.79,BasicOther,Northern Hemisphere
997,fffaad19de06d97d7822cc42965f00d05733862130e8bd...,F,"[1955, 1975)",G,Other,Scotland,89.9,1.99,BasicOther,Southern Hemisphere
998,eb85b796391facdb61d959af11e5c4cd13268362641f1f...,F,"[1955, 1975)",CT,England,South East,76.1,1.31,Higher,Northern Hemisphere


In [29]:
# Check k-anonymity of the anonymised dataset over its quasi-identifiers
k = 2  # minimum number of records every quasi-identifier combination must have
QUASI_IDENTIFIERS = ['Gender', 'Birthyear', 'Education.Level', 'Region', 'Location.of.Birth']

# Number of records in every quasi-identifier combination. Birthyear is
# categorical (it comes from pd.cut), so with observed=False the table also
# lists combinations that never occur, with Count 0. The argument is spelled
# out so the table does not depend on the pandas version's default.
df_count = anon_data.groupby(QUASI_IDENTIFIERS, observed=False).size().reset_index(name = 'Count')
print(df_count)

# Combinations that break k-anonymity: they occur in the data (Count > 0) but
# in fewer than k records. For k = 2 this is exactly the Count == 1 rows.
print(df_count[(df_count['Count'] > 0) & (df_count['Count'] < k)])

   Gender     Birthyear Education.Level   Region    Location.of.Birth  Count
0       F  [1955, 1975)      BasicOther  England                Error      0
1       F  [1955, 1975)      BasicOther  England  Northern Hemisphere     77
2       F  [1955, 1975)      BasicOther  England  Southern Hemisphere     37
3       F  [1955, 1975)      BasicOther    Other                Error      0
4       F  [1955, 1975)      BasicOther    Other  Northern Hemisphere     12
..    ...           ...             ...      ...                  ...    ...
67      M  [1995, 2015)          Higher  England  Northern Hemisphere     19
68      M  [1995, 2015)          Higher  England  Southern Hemisphere      5
69      M  [1995, 2015)          Higher    Other                Error      0
70      M  [1995, 2015)          Higher    Other  Northern Hemisphere      5
71      M  [1995, 2015)          Higher    Other  Southern Hemisphere      2

[72 rows x 6 columns]
   Gender     Birthyear Education.Level   Region Loca