## Anonmymizing the customer_information.csv and calculating the k-anonymity of the new dataset

Importing necessary packages

In [1]:
import pandas as pd
import numpy as np
import hashlib
import re
import os
import math
from geopy.geocoders import Nominatim
from functions import *

Loading the original dataset

In [2]:
# Read in data to be anonymised
data = pd.read_csv("Data/customer_information.csv")

# Create anon_data variable as initial data with unneeded direct identifiers dropped
#anon_data = data.drop(['given_name', 'surname', 'phone_number', 'national_insurance_number', 'bank_account_number'], axis=1)
anon_data = pd.DataFrame()

In [4]:
# Clean NIN formatting and assign Sample ID as a hashed form of the NIN
key = os.urandom(16)
data["national_insurance_number"], anon_data['Sample.ID'], salts = zip(*data["national_insurance_number"].apply(
    lambda x: hash(key, re.sub(r'(.{2})(?!$)','\\1 ', x.replace(' ', '') ))))

In [5]:
# Create a reference table between NIN and respective hashed NIN
reference_table = pd.DataFrame()
reference_table['Hashed.NIN'] = anon_data['Sample.ID']
reference_table['Salt'] = salts
reference_table['Key'] = key.hex()
reference_table['NIN'] = data['national_insurance_number']

In [6]:
# Assign gender
anon_data['Gender'] = data['gender']

# Assign birthdate as banded birthyears
# Select the birth year only
birthyears = pd.DatetimeIndex(data['birthdate']).year
# Band the birth years into 5-year intervals
anon_data['Birthyear'] = pd.cut(birthyears, np.arange(birthyears.min(), birthyears.max()+20, 20), right=False)

# Assign postcode as truncated postcode
anon_data['Postcode'] = data['postcode'].apply(lambda x: re.search('[a-zA-Z]*', x).group(0))

#anon_data['Region'] = data['postcode'].apply(lambda x: postcode_to_region(x))
postcode_dictionary = pd.read_csv('Data/postcode_region.csv')
anon_data = pd.merge(anon_data, postcode_dictionary.iloc[: , :-1], on='Postcode', how='left')
#anon_data['Region'] = anon_data['Postcode'].to_frame().merge(postcode_dictionary.iloc[: , :-1], on='Postcode', how='left')

In [7]:
# Assign weight and height as banded weights and heights
#anon_data['Weight'] = pd.cut(data['weight'], np.arange(math.floor(data['weight'].min()), math.floor(data['weight'].max()+20), 20), right=False)
weight_noise = np.random.normal(0,1,1000)*5
anon_data['Weight'] = round(data['weight']+weight_noise, 1)


In [8]:
# Round minimum and maximum heights to nearest one-fifth prior to making bins
#anon_data['Height'] = pd.cut(data['height'], np.arange(round(data['height'].min()*5)/5, (round(data['height'].max()*5)/5)+0.2, 0.2), right=False)
height_noise = np.random.normal(0,1,1000)/5
anon_data['Height'] = round(data['height']+height_noise, 2)
bmi = data['weight'] / data['height']**2
print(anon_data['Height'])
print(data['height'])#anon_data['BMI'] = pd.cut(bmi, bins=[math.floor(bmi.min()), 18.5, 25, 30, round(bmi.max(), -1)], right=False)
#print(anon_data['BMI'])
# Assign avg_drinks as banded avg_drinks
#anon_data['avg_n_drinks_per_week'] = pd.cut(anon_data['avg_n_drinks_per_week'], np.linspace(0, 2, 9))


0      1.46
1      1.63
2      1.96
3      1.65
4      2.03
       ... 
995    1.65
996    1.60
997    2.16
998    1.22
999    1.41
Name: Height, Length: 1000, dtype: float64
0      1.73
1      1.74
2      1.88
3      1.56
4      1.81
       ... 
995    1.98
996    1.85
997    2.00
998    1.50
999    1.65
Name: height, Length: 1000, dtype: float64


In [None]:
import country_converter as coco
from geopy.geocoders import Nominatim


def parse_country(country_name):
    country = coco.convert(country_name, to='name_short', include_obsolete=True)
    if(country_name == 'Svalbard & Jan Mayen Islands'): # Hard-coded fix for the single unmatched country
        country = 'Norway'
    return country

def country_to_hemisphere2(country_name):
    clean_country = parse_country(country_name)
    try:
        hemisphere = ("Southern" if Nominatim(user_agent="CDM").geocode((parse_country(country_name))).latitude < 0 else "Northern") + " Hemisphere"
        print("Input:",country_name,", output:", hemisphere)
        return hemisphere
    except:
        print("Input:",country_name,", output:", "Error")
        return "Error"
    
# Assign education level as banded education level
anon_data['Education.Level'] = data['education_level'].map(lambda x: "Higher" if x in ["bachelor", "masters", "phD"] else "BasicOther")

anon_data['Location.of.Birth'] = data['country_of_birth'].apply(lambda x: country_to_hemisphere2(x))

# Output the files
anon_data.to_csv("output.csv", sep=",", index=None)
reference_table.to_csv("reference_table.csv", sep=",", index=None)

Input: Armenia , output: Northern Hemisphere
Input: Northern Mariana Islands , output: Northern Hemisphere
Input: Venezuela , output: Northern Hemisphere
Input: Eritrea , output: Northern Hemisphere
Input: Ecuador , output: Southern Hemisphere
Input: Argentina , output: Southern Hemisphere
Input: Cook Islands , output: Southern Hemisphere
Input: Saint Vincent and the Grenadines , output: Northern Hemisphere
Input: Kiribati , output: Northern Hemisphere
Input: Belize , output: Northern Hemisphere
Input: Tokelau , output: Southern Hemisphere
Input: French Guiana , output: Northern Hemisphere
Input: Saint Lucia , output: Northern Hemisphere
Input: Congo , output: Northern Hemisphere
Input: Korea , output: Northern Hemisphere
Input: Syrian Arab Republic , output: Northern Hemisphere
Input: American Samoa , output: Southern Hemisphere
Input: Palestinian Territory , output: Error
Input: Saint Barthelemy , output: Northern Hemisphere
Input: New Zealand , output: Southern Hemisphere
Input: Tur

Input: Cook Islands , output: Southern Hemisphere
Input: Nepal , output: Northern Hemisphere
Input: Liechtenstein , output: Northern Hemisphere
Input: Indonesia , output: Southern Hemisphere
Input: Bulgaria , output: Northern Hemisphere
Input: Sudan , output: Northern Hemisphere
Input: Iran , output: Northern Hemisphere
Input: Cuba , output: Northern Hemisphere
Input: Bolivia , output: Southern Hemisphere
Input: Cuba , output: Northern Hemisphere
Input: South Georgia and the South Sandwich Islands , output: Southern Hemisphere
Input: Korea , output: Northern Hemisphere
Input: Tokelau , output: Southern Hemisphere
Input: Mayotte , output: Southern Hemisphere
Input: Faroe Islands , output: Northern Hemisphere
Input: Northern Mariana Islands , output: Northern Hemisphere
Input: Dominica , output: Northern Hemisphere
Input: Maldives , output: Northern Hemisphere
Input: Bahamas , output: Northern Hemisphere
Input: Seychelles , output: Southern Hemisphere
Input: Maldives , output: Northern H

In [10]:
anon_data


Unnamed: 0,Sample.ID,Gender,Birthyear,Postcode,Region,Region.1,Weight,Height,Education.Level,Location.of.Birth
0,ee67e166d771b9833297d2462e6691543f6309fa71a5d6...,F,"[1975, 1995)",LS,England,North East,75.8,1.46,Higher,NorthernHemisphere
1,aa5f5b562b6be1485916db8db930b2e2b12b074eb5e38c...,M,"[1995, 2015)",M,England,North West,70.6,1.63,BasicOther,NorthernHemisphere
2,e23565e1ed5b66f608eeb99ee6bf1df0a14280a4463b82...,F,"[1975, 1995)",SO,England,South East,90.1,1.96,Higher,NorthernHemisphere
3,c24b60cb3d367c889815e555f07364b06a8bc351f4d367...,F,"[1995, 2015)",B,England,West Midlands,72.2,1.65,BasicOther,NorthernHemisphere
4,5219529cd977d2bc99ee9a334c8665356eccc62ce82a22...,F,"[1955, 1975)",TQ,England,South West,99.8,2.03,BasicOther,SouthernHemisphere
...,...,...,...,...,...,...,...,...,...,...
995,df172e50ee41341f04ff66230312553d3b5bd3f1c0627e...,M,"[1955, 1975)",SA,Other,Wales,94.3,1.65,BasicOther,NorthernHemisphere
996,a8a7231c3c30e4ff124d10d2744a7f1fc7c60a2e0d594a...,M,"[1995, 2015)",TS,England,North East,49.0,1.60,BasicOther,NorthernHemisphere
997,80e2ae8059de48b1b88a89b3d93c28c334b706677e041f...,F,"[1955, 1975)",G,Other,Scotland,96.1,2.16,BasicOther,SouthernHemisphere
998,cdc34ae97ef595ab38ec4e4c8ed54af2cdabf97d2b534f...,F,"[1955, 1975)",CT,England,South East,76.4,1.22,Higher,NorthernHemisphere


In [13]:
# Calculating K-anonymity, 2 anonymous dataset
k = 2
############
# Checking k-anonymity, fix this for variable names
df_count = anon_data.groupby(['Gender', 'Birthyear', 'Education.Level', 'Region', 'Location.of.Birth']).size().reset_index(name = 'Count') 
print(df_count)
print(df_count[df_count['Count']==1])
# print(df_count.size().describe())

# print(anon_data.groupby('Education.Level').size())
# print(anon_data.groupby('Continent.of.Birth').size())
# print(anon_data.groupby('Region').size())

   Gender     Birthyear Education.Level   Region   Location.of.Birth  Count
0       F  [1955, 1975)      BasicOther  England               Error      1
1       F  [1955, 1975)      BasicOther  England  NorthernHemisphere     76
2       F  [1955, 1975)      BasicOther  England  SouthernHemisphere     37
3       F  [1955, 1975)      BasicOther    Other               Error      0
4       F  [1955, 1975)      BasicOther    Other  NorthernHemisphere     12
..    ...           ...             ...      ...                 ...    ...
67      M  [1995, 2015)          Higher  England  NorthernHemisphere     19
68      M  [1995, 2015)          Higher  England  SouthernHemisphere      5
69      M  [1995, 2015)          Higher    Other               Error      0
70      M  [1995, 2015)          Higher    Other  NorthernHemisphere      5
71      M  [1995, 2015)          Higher    Other  SouthernHemisphere      2

[72 rows x 6 columns]
   Gender     Birthyear Education.Level   Region Location.of.Birt