## Anonymizing customer_information.csv and calculating the k-anonymity of the new dataset

Importing necessary packages

In [338]:
import pandas as pd
import numpy as np
import hashlib
import re
import os
import math
from geopy.geocoders import Nominatim
import country_converter as coco
from cryptography.fernet import Fernet

In [339]:
#
# Helper functions
#

# Country names whose hemisphere is assigned by hand instead of being geocoded
# (see country_to_hemisphere). Both names must stay defined, even when empty.
# "Micronesia" (the Federated States of Micronesia) lies north of the equator,
# so it belongs with the northern entries, not the southern ones.
northern_countries = ["Svalbard & Jan Mayen Islands", "Micronesia"]
southern_countries = []

# Parse country into a more workable format
def parse_country(country_name):
    """Return the ``name_short`` form of ``country_name`` from country_converter.

    Obsolete country names are included in the lookup
    (``include_obsolete=True``).
    """
    return coco.convert(country_name, to='name_short', include_obsolete=True)

# Geocoding results keyed by the raw country name, so that each distinct
# country is looked up over the network at most once per session.
_hemisphere_cache = {}


def country_to_hemisphere(country_name):
    """Classify a country as "Northern Hemisphere" or "Southern Hemisphere".

    Names listed in ``southern_countries`` / ``northern_countries`` are
    answered directly. Any other name is normalised with ``parse_country``
    and geocoded with Nominatim; a negative latitude means southern.

    Successful lookups are cached per country name. Failures are not cached,
    so a later call retries them.

    Returns:
        "Northern Hemisphere", "Southern Hemisphere", or "Error" when the
        lookup fails for any reason (the exception is printed, not raised).
    """
    try:
        if country_name in southern_countries:  # Hard-coded fix for unmatched territories
            return "Southern Hemisphere"
        if country_name in northern_countries:
            return "Northern Hemisphere"
        if country_name in _hemisphere_cache:
            return _hemisphere_cache[country_name]

        location = Nominatim(user_agent="CDM").geocode(parse_country(country_name))
        if location is None:
            # geocode() returns None when nothing matches; report that clearly
            # instead of failing on the attribute access below.
            raise LookupError(f"No geocoding result for country {country_name!r}")

        hemisphere = ("Southern" if location.latitude < 0 else "Northern") + " Hemisphere"
        _hemisphere_cache[country_name] = hemisphere
        return hemisphere
    except Exception as e:
        # Best-effort by design: one failed lookup must not abort the whole run.
        print(e)
        return "Error"
    
# Salted, keyed SHA-256 digest of a string
def hash(to_hash, key):
    """Hash ``to_hash`` with SHA-256 over key, then a fresh salt, then the text.

    Args:
        to_hash: Text to hash; it is encoded with ``str.encode()``.
        key: Bytes-like secret fed to the digest first.

    Returns:
        A tuple ``(to_hash, hex_digest, salt_hex)``. The salt is 16 random
        bytes drawn on every call and returned as a hex string.
    """
    salt = os.urandom(16)
    digest = hashlib.sha256(key)
    digest.update(salt + to_hash.encode())
    return to_hash, digest.hexdigest(), salt.hex()

# Encrypt and save as encrypted file; specify file to encrypt, encrypted file destination, and destination key location
def encrypt(to_encrypt, file_destination, key_location):
    """Encrypt a file with a newly generated Fernet key.

    Args:
        to_encrypt: Path of the plaintext file to read.
        file_destination: Path the encrypted token is written to.
        key_location: Path the generated key is written to.

    Nothing is written unless the input was read and encrypted successfully.
    """
    # Read the input before touching any output path, so a missing or
    # unreadable input cannot overwrite an existing key file or leave a stray one.
    with open(to_encrypt, 'rb') as f:
        plaintext = f.read()

    key = Fernet.generate_key()  # AES in CBC mode with a 128-bit key for encryption
    encrypted = Fernet(key).encrypt(plaintext)

    with open(key_location, 'wb') as f:
        f.write(key)

    with open(file_destination, 'wb') as e:
        e.write(encrypted)

# Decrypt and save as plaintext file; specify file to decrypt, decrypted file destination, and key location
def decrypt(to_decrypt, file_destination, key_location):
    """Decrypt a Fernet-encrypted file using the key stored on disk.

    Args:
        to_decrypt: Path of the encrypted file to read.
        file_destination: Path the decrypted bytes are written to.
        key_location: Path of the file holding the Fernet key.
    """
    with open(key_location, 'rb') as key_file:
        stored_key = key_file.read()
    cipher = Fernet(stored_key)

    with open(to_decrypt, 'rb') as source:
        token = source.read()
    recovered = cipher.decrypt(token)

    with open(file_destination, 'wb') as target:
        target.write(recovered)


Loading the original dataset

In [315]:
# Read in data to be anonymised
data = pd.read_csv("Data/customer_information.csv")

# Postcode lookup table read from Data/postcode_region.csv
postcode_dictionary = pd.read_csv('Data/postcode_region.csv')

# Start anon_data as an empty frame that is filled column by column.
# The commented-out line is an alternative that would instead copy `data` and
# drop the direct identifiers; it is not executed.
#anon_data = data.drop(['given_name', 'surname', 'phone_number', 'national_insurance_number', 'bank_account_number'], axis=1)
anon_data = pd.DataFrame()

In [316]:
# Normalise each NIN (strip spaces, then re-insert a space after every two
# characters) and hash it; the hash becomes the Sample ID.
# A single random key is shared by all rows; hash() draws a fresh salt per row.
key = os.urandom(16)
hashed_rows = [
    hash(re.sub(r'(.{2})(?!$)', '\\1 ', nin.replace(' ', '')), key)
    for nin in data["national_insurance_number"]
]
# Each row is (cleaned NIN, hex digest, hex salt); split them into three columns.
data["national_insurance_number"], anon_data['Sample.ID'], salts = zip(*hashed_rows)

In [317]:
# Create a reference table between NIN and respective hashed NIN
# Each row holds the hashed NIN, the salt and key used to produce it, and the
# cleaned NIN itself.
# NOTE(review): this table can re-identify every record — presumably it must be
# stored separately from the released dataset; confirm.
reference_table = pd.DataFrame()
reference_table['Hashed.NIN'] = anon_data['Sample.ID']
reference_table['Salt'] = salts
reference_table['Key'] = key.hex()  # same key on every row (scalar assignment)
reference_table['NIN'] = data['national_insurance_number']

In [318]:
# Assign gender (copied across unchanged from the source data)
anon_data['Gender'] = data['gender']


In [319]:
# Banding birth date
birthyears = pd.DatetimeIndex(data['birthdate']).year
# Band the birth years into 20-year intervals, closed on the left: [start, start + 20).
# Because the right edge of each band is open, the last edge must lie strictly
# above the latest birth year. The "+ 1" guarantees that when (max - min) is an
# exact multiple of the band width; otherwise the latest year would fall outside
# every band and become NaN.
band_width = 20
band_edges = np.arange(birthyears.min(), birthyears.max() + band_width + 1, band_width)
anon_data['Birthyear'] = pd.cut(birthyears, band_edges, right=False)

In [320]:
# Grouping education level, postcode, and country of birth

# Assign education level as banded education level
# NOTE(review): the membership test is case-sensitive ("phD") — confirm it
# matches the spelling used in the source data.
anon_data['Education.Level'] = data['education_level'].map(lambda x: "Higher" if x in ["bachelor", "masters", "phD"] else "BasicOther")

# Assign UK region derived from postcode: keep only the leading letters of the
# postcode and look them up in the postcode table.
anon_data['Postcode'] = data['postcode'].apply(lambda x: re.search(r'[a-zA-Z]*', x).group(0))
# validate='many_to_one' makes the merge fail loudly if the lookup table has a
# repeated Postcode. Without it, duplicate keys would silently add rows and
# misalign every later column assigned from `data` by index.
anon_data = pd.merge(anon_data, postcode_dictionary, on='Postcode', how='left', validate='many_to_one')
anon_data = anon_data.rename(columns={'Region': 'UK.Region'})

# Assign hemisphere of birth depending on country of birth
anon_data['Location.of.Birth'] = data['country_of_birth'].apply(country_to_hemisphere)

In [321]:
# Add gaussian noise to weight/height, countries visited, and alcohol/smoking history
# One noise value is drawn per row. The count comes from the data rather than a
# fixed 1000, so the cell also works on datasets of other sizes.
n_rows = len(data)

weight_noise = np.random.normal(0, 1, n_rows) * 5
anon_data['Weight'] = round(data['weight'] + weight_noise, 1)

height_noise = np.random.normal(0, 1, n_rows) / 5
anon_data['Height'] = round(data['height'] + height_noise, 2)
# Computed from the original (un-noised) values; not used in the cells shown here.
bmi = data['weight'] / data['height']**2

countries_noise = np.random.normal(0, 1, n_rows) * 5
anon_data['Countries.Visited'] = round(data['n_countries_visited'] + countries_noise)

alcohol_noise = np.random.normal(0, 1, n_rows)
anon_data['Avg.Alcohol'] = round(data['avg_n_drinks_per_week'] + alcohol_noise, 1)

smoking_noise = np.random.normal(0, 1, n_rows) * 20
anon_data['Avg.Cigarettes'] = round(data['avg_n_cigret_per_week'] + smoking_noise)

In [336]:
# Attach case-control status
anon_data['CC.Status'] = data['cc_status']

# Re-order columns; only the columns listed here are kept, so any other
# column of anon_data is dropped from the output.
anon_data = anon_data[['Sample.ID', 'Gender', 'Birthyear', 'Location.of.Birth', 'UK.Region', 'Weight', 'Height', 'Education.Level', 'Avg.Alcohol', 'Avg.Cigarettes', 'CC.Status']]

# View the anonymised dataset
# NOTE(review): a bare expression that is not the last statement of a notebook
# cell is presumably not displayed — confirm whether this line is still needed.
anon_data

# Output the files
output_name = "output"
anon_data.to_csv(output_name + ".csv", sep=",", index=None)

# Encrypt csv and delete original file
encrypt(output_name + ".csv", output_name + "_encrypted.csv", "key.key")
os.remove(output_name + ".csv")

# Decrypt file
# NOTE(review): this writes an unencrypted copy of the dataset back to disk as
# decrypted.csv right after the plaintext was removed — confirm this is intended
# (e.g. as a round-trip check) rather than left over from testing.
decrypt(output_name + "_encrypted.csv", "decrypted.csv", "key.key")

# The NIN reference table is written as a plain (unencrypted) CSV.
reference_table.to_csv("reference_table.csv", sep=",", index=None)

In [337]:
# Calculating K-anonymity

# Checking k-anonymity for quasi-identifiers
quasi_identifiers = ['Gender', 'Birthyear', 'Location.of.Birth', 'UK.Region', 'Education.Level']

# groupby drops rows whose key contains NaN by default, which would leave out
# any record with a missing quasi-identifier (e.g. an unmatched postcode region).
# Casting the keys to strings turns missing values into an ordinary "nan" group,
# so every record is counted. It also avoids zero-count rows for unobserved
# categories of the banded birth year.
df_count = (
    anon_data[quasi_identifiers]
    .astype(str)
    .groupby(quasi_identifiers)
    .size()
    .reset_index(name='Count')
)

# Print those not meeting our specified k-anonymity level
# (a group of size 1 means one record is unique on its quasi-identifiers)
print(df_count[df_count['Count'] == 1])

Empty DataFrame
Columns: [Gender, Birthyear, Location.of.Birth, UK.Region, Education.Level, Count]
Index: []
