## Anonmymizing the customer_information.csv and calculating the k-anonymity of the new dataset

Importing necessary packages

In [277]:
import pandas as pd
import numpy as np
import hashlib
import re
import os
import math
from geopy.geocoders import Nominatim
import country_converter as coco
from functions import *

Loading the original dataset

In [254]:
# Read in data to be anonymised
data = pd.read_csv("Data/customer_information.csv")

# Declaring variables
postcode_dictionary = pd.read_csv('Data/postcode_region.csv')
northern_countries = ["Svalbard & Jan Mayen Islands"]
southern_countries = ["Micronesia"]

# Create anon_data variable as initial data with unneeded direct identifiers dropped
#anon_data = data.drop(['given_name', 'surname', 'phone_number', 'national_insurance_number', 'bank_account_number'], axis=1)
anon_data = pd.DataFrame()

In [255]:
# Clean NIN formatting and assign Sample ID as a hashed form of the NIN
key = os.urandom(16)
data["national_insurance_number"], anon_data['Sample.ID'], salts = zip(*data["national_insurance_number"].apply(
    lambda x: hash(key, re.sub(r'(.{2})(?!$)','\\1 ', x.replace(' ', '') ))))

In [256]:
# Create a reference table between NIN and respective hashed NIN
reference_table = pd.DataFrame()
reference_table['Hashed.NIN'] = anon_data['Sample.ID']
reference_table['Salt'] = salts
reference_table['Key'] = key.hex()
reference_table['NIN'] = data['national_insurance_number']

In [247]:
# Assign gender
anon_data['Gender'] = data['gender']


In [248]:
# Banding birth date
birthyears = pd.DatetimeIndex(data['birthdate']).year
# Band the birth years into 5-year intervals
anon_data['Birthyear'] = pd.cut(birthyears, np.arange(birthyears.min(), birthyears.max()+20, 20), right=False)

In [249]:
# Grouping education level, postcode, and country of birth

# Assign education level as banded education level
#from tkinter import E
anon_data['Education.Level'] = data['education_level'].map(lambda x: "Higher" if x in ["bachelor", "masters", "phD"] else "BasicOther")

# Assign UK country derived from postcode
anon_data['Postcode'] = data['postcode'].apply(lambda x: re.search('[a-zA-Z]*', x).group(0))
anon_data = pd.merge(anon_data, postcode_dictionary, on='Postcode', how='left')
anon_data = anon_data.rename(columns={'Region': 'UK.Region'})

# Assign hemisphere of birth depending on country of birth

def parse_country(country_name):
    country = coco.convert(country_name, to='name_short', include_obsolete=True)
    return country

def country_to_hemisphere2(country_name):
    try:  
        if country_name in southern_countries: # Hard-coded fix for unmatched territories
            return "Southern Hemisphere" 
        elif country_name in northern_countries:
            return "Northern Hemisphere"
        else:
            return ("Southern" if Nominatim(user_agent="CDM").geocode(parse_country(country_name)).latitude < 0 else "Northern") + " Hemisphere"
    except Exception as e:
        print(e)
        return "Error"

anon_data['Location.of.Birth'] = data['country_of_birth'].apply(lambda x: country_to_hemisphere2(x))

In [250]:
# Add gaussian noise to weight/height, countries visited, and alcohol/smoking history
weight_noise = np.random.normal(0,1,1000)*5
anon_data['Weight'] = round(data['weight']+weight_noise, 1)

height_noise = np.random.normal(0,1,1000)/5
anon_data['Height'] = round(data['height']+height_noise, 2)
bmi = data['weight'] / data['height']**2

countries_noise = np.random.normal(0,1,1000)*5
anon_data['Countries.Visited'] = round(data['n_countries_visited']+countries_noise)

alcohol_noise = np.random.normal(0,1,1000)
anon_data['Avg.Alcohol'] = round(data['avg_n_drinks_per_week']+alcohol_noise, 1)

smoking_noise = np.random.normal(0,1,1000)*20
anon_data['Avg.Cigarettes'] = round(data['avg_n_cigret_per_week']+smoking_noise)

In [251]:
# Attach case-control status
anon_data['CC.Status'] = data['cc_status']

# Re-order columns
anon_data = anon_data[['Sample.ID', 'Gender', 'Birthyear', 'Location.of.Birth', 'UK.Region', 'Weight', 'Height', 'Education.Level', 'Avg.Alcohol', 'Avg.Cigarettes', 'CC.Status']]

# View the anonymised dataset
anon_data

# Output the files
anon_data.to_csv("output.csv", sep=",", index=None)
reference_table.to_csv("reference_table.csv", sep=",", index=None)

In [252]:
# Calculating K-anonymity

# Checking k-anonymity for quasi-identifiers
df_count = anon_data.groupby(['Gender', 'Birthyear', 'Location.of.Birth', 'UK.Region', 'Education.Level']).size().reset_index(name = 'Count') 

# Print those not meeting our specified k-anonymity level
print(df_count[df_count['Count']==1])

Empty DataFrame
Columns: [Gender, Birthyear, Location.of.Birth, UK.Region, Education.Level, Count]
Index: []


In [290]:
key = os.urandom(16)
cnin, hnin, salts = hash(key, re.sub(r'(.{2})(?!$)','\\1 ', "ZZ 19 48 92 T".replace(' ', '') ))

print(cnin)
print(hnin) #hashed val
print(salts) #salt
print(key)
print(key.hex()) #key
print(bytes.fromhex(key.hex()))


#reference_table.to_csv("reference_table.csv", sep=",", index=None)
key = bytes.fromhex('621253bce31095abc85566ad4c5e9f75')
salt = bytes.fromhex('b5bf8c87675fd09d3d618b246e0410cf')

hashed = hashlib.sha256(key + salt + "ZZ 19 48 92 T".encode()).hexdigest()
# Should make d5bb9cd05c73ea8aae516a477037e712137566538b13ebb1e9dae2364c02ecb1

hashed


ZZ 19 48 92 T
c39422b6c493d5aeb0506a69396b34a415dcbb1b28a27661723a985f6936a25c
f4307052ba3ac64c31769876849d6330
b'\xc88\xb6\n\xff\xaf\xba\xc6\x9e\xe9\x92\x1e\xd6{\x95\xdb'
c838b60affafbac69ee9921ed67b95db
b'\xc88\xb6\n\xff\xaf\xba\xc6\x9e\xe9\x92\x1e\xd6{\x95\xdb'


TypeError: can only concatenate str (not "bytes") to str