# Anonymising the customer_information.csv and calculating the k-anonymity of the new dataset

### Importing necessary packages
The following packages must be imported (and also installed, if necessary).

In [1]:
import pandas as pd
import numpy as np
import hashlib
import re
import os
import math
from geopy.geocoders import Nominatim
import country_converter as coco
from cryptography.fernet import Fernet


### Helper functions used
The following helper functions are needed - 


In [2]:
# Helper functions

# Parse country into shortform 
def parse_country(country_name):
    country = coco.convert(country_name, to='name_short', include_obsolete=True)
    return country

# The following variable countries were hard-coded to fix unmatched territory errors
northern_countries = ["Svalbard & Jan Mayen Islands"]
southern_countries = ["Micronesia"]

# Convert country of birth into Hemisphere (Northern or Southern) based on latitude coordinates
def country_to_hemisphere(country_name):
    try:  
        if country_name in southern_countries: 
            return "Southern Hemisphere" 
        elif country_name in northern_countries:
            return "Northern Hemisphere"
        else:
            return ("Southern" if Nominatim(user_agent="CDM").geocode(parse_country(country_name)).latitude < 0 else "Northern") + " Hemisphere"
    except Exception as e:
        print(e)
        return "Error"
    
# SHA hash function using a key and salt
def hash(to_hash, key):
    salt = os.urandom(16)
    h = hashlib.sha256()
    h.update(key)
    h.update(salt)
    h.update(to_hash.encode())
    return to_hash, h.hexdigest(), salt.hex()

# To encrypt and save as encrypted file; specify file to encrypt, encrypted file destination, and destination key location
def encrypt(to_encrypt, file_destination, key_location):
    key = Fernet.generate_key() # AES in CBC mode with a 128-bit key for encryption
    fernet = Fernet(key)
    
    with open(key_location, 'wb') as f:
        f.write(key)
    
    with open(to_encrypt, 'rb') as f:
        plaintext = f.read()   
    
    encrypted = fernet.encrypt(plaintext)
    with open(file_destination, 'wb') as e:
        e.write(encrypted)


### Loading required data and creating the anonymised dataframe

In [3]:
# Read in data to be anonymised
original_data = pd.read_csv("Data/customer_information.csv")

# Reading in postcode_region.csv to map given postcode to countries in the UK - 'England' and 'Other'(includes Wales, Scotland, Northern Ireland)
postcode_dictionary = pd.read_csv('Data/postcode_region.csv')

# Create anon_data variable as initial data with unneeded direct identifiers dropped
anon_data = pd.DataFrame()

postcode_dictionary.head()

Unnamed: 0,Postcode,Region
0,AB,Other
1,AL,England
2,B,England
3,BA,England
4,BB,England


### Adding variables to the anonymised dataset

Assigning gender and case-control status as given

In [4]:
# Assign gender
anon_data['Gender'] = original_data['gender']

# Assign case-control status
anon_data['CC.Status'] = original_data['cc_status']

anon_data.head()

Unnamed: 0,Gender,CC.Status
0,F,0
1,M,0
2,F,0
3,F,0
4,F,0


### Pseudoanonymisation

#### Creating the hashed Sample ID

Next, a unique Sample ID is created from the National Insurance Number to link the anonymised data with the reference data containing sensitive information.

In [5]:
# Clean NIN formatting and assign Sample ID as a hashed form of the NIN
key = os.urandom(16)
original_data["national_insurance_number"], anon_data['Sample.ID'], salts = zip(*original_data["national_insurance_number"].apply(
    lambda x: hash(re.sub(r'(.{2})(?!$)','\\1 ', x.replace(' ', '') ), key)))

anon_data.head()

Unnamed: 0,Gender,CC.Status,Sample.ID
0,F,0,e7dda4e3183992c4e7a06a4ad4d71aa390509c2529099b...
1,M,0,2b4a3ab95e8726e3423fb1748600fae45a95cb030883fc...
2,F,0,99172cd2213ea462c0c151b09def5d2a0c4e39c6034607...
3,F,0,b6c4148555a340dd984ae3bb26fcaef68911b3a82b6b82...
4,F,0,61f9f1173c13e4a97bddc8ba5ed61f2720b8239807a10e...


In [6]:
# Create a reference table between NIN and respective hashed NIN
reference_table = pd.DataFrame()
reference_table['Hashed.NIN'] = anon_data['Sample.ID']
reference_table['Salt'] = salts
reference_table['Key'] = key.hex()
reference_table['NIN'] = original_data['national_insurance_number']

reference_table.head()

Unnamed: 0,Hashed.NIN,Salt,Key,NIN
0,e7dda4e3183992c4e7a06a4ad4d71aa390509c2529099b...,dcfb8c0e10b13d78c66891c314d7a95f,328128ebbcb32539e595a209dbacf8a4,ZZ 19 48 92 T
1,2b4a3ab95e8726e3423fb1748600fae45a95cb030883fc...,d2c6e6a920c7d11234380ac15836fef1,328128ebbcb32539e595a209dbacf8a4,ZZ 75 35 13 T
2,99172cd2213ea462c0c151b09def5d2a0c4e39c6034607...,ded26ee52e94a71d74203afcf1783084,328128ebbcb32539e595a209dbacf8a4,ZZ 94 71 96 T
3,b6c4148555a340dd984ae3bb26fcaef68911b3a82b6b82...,43113483303783cce754df01ca533d7c,328128ebbcb32539e595a209dbacf8a4,ZZ 39 69 47 T
4,61f9f1173c13e4a97bddc8ba5ed61f2720b8239807a10e...,bf1f3a783d2a14d092bc12d12150d6f1,328128ebbcb32539e595a209dbacf8a4,ZZ 30 98 91 T


### Banding - date of birth and education level

In [7]:
# Banding birth date
birthyears = pd.DatetimeIndex(original_data['birthdate']).year

# Band the birth years into 20-year intervals
anon_data['Birthyear'] = pd.cut(birthyears, np.arange(birthyears.min(), birthyears.max()+20, 20), right=False)

anon_data.head()

Unnamed: 0,Gender,CC.Status,Sample.ID,Birthyear
0,F,0,e7dda4e3183992c4e7a06a4ad4d71aa390509c2529099b...,"[1975, 1995)"
1,M,0,2b4a3ab95e8726e3423fb1748600fae45a95cb030883fc...,"[1995, 2015)"
2,F,0,99172cd2213ea462c0c151b09def5d2a0c4e39c6034607...,"[1975, 1995)"
3,F,0,b6c4148555a340dd984ae3bb26fcaef68911b3a82b6b82...,"[1995, 2015)"
4,F,0,61f9f1173c13e4a97bddc8ba5ed61f2720b8239807a10e...,"[1955, 1975)"


### Mapping full postcode to countries within the UK using postcode_dictionary

In [8]:
# Assign UK country derived from postcode
anon_data['Postcode'] = original_data['postcode'].apply(lambda x: re.search('[a-zA-Z]*', x).group(0))
anon_data = pd.merge(anon_data, postcode_dictionary, on='Postcode', how='left')
anon_data = anon_data.rename(columns={'Region': 'UK.Country'})

anon_data.head()

Unnamed: 0,Gender,CC.Status,Sample.ID,Birthyear,Postcode,UK.Country
0,F,0,e7dda4e3183992c4e7a06a4ad4d71aa390509c2529099b...,"[1975, 1995)",LS,England
1,M,0,2b4a3ab95e8726e3423fb1748600fae45a95cb030883fc...,"[1995, 2015)",M,England
2,F,0,99172cd2213ea462c0c151b09def5d2a0c4e39c6034607...,"[1975, 1995)",SO,England
3,F,0,b6c4148555a340dd984ae3bb26fcaef68911b3a82b6b82...,"[1995, 2015)",B,England
4,F,0,61f9f1173c13e4a97bddc8ba5ed61f2720b8239807a10e...,"[1955, 1975)",TQ,England


### Data aggregation -  grouping education level and country of birth

In [9]:
# Assign education level as banded education level
anon_data['Education.Level'] = original_data['education_level'].map(lambda x: "Higher" if x in ["bachelor", "masters", "phD"] else "BasicOther")

# Assign hemisphere of birth depending on country of birth
anon_data['Location.of.Birth'] = original_data['country_of_birth'].apply(lambda x: country_to_hemisphere(x))

anon_data.head()

### Data pertubation - addition of Gaussian noise

In [None]:
# Add gaussian noise to weight, height, countries visited, average number of drinks in alcohol units per week and average cigrettes smoked per week.
weight_noise = np.random.normal(0,1,1000)*5
anon_data['Weight'] = round(original_data['weight']+weight_noise, 1)

height_noise = np.random.normal(0,1,1000)/5
anon_data['Height'] = round(original_data['height']+height_noise, 2)

countries_noise = np.random.normal(0,1,1000)*5
anon_data['Countries.Visited'] = round(original_data['n_countries_visited']+countries_noise)

alcohol_noise = np.random.normal(0,1,1000)
anon_data['Avg.Alcohol'] = round(original_data['avg_n_drinks_per_week']+alcohol_noise, 1)

smoking_noise = np.random.normal(0,1,1000)*20
anon_data['Avg.Cigarettes'] = round(original_data['avg_n_cigret_per_week']+smoking_noise)

anon_data.head()

Unnamed: 0,Sample.ID,Gender,Birthyear,Location.of.Birth,UK.Country,Weight,Height,Education.Level,Avg.Alcohol,Avg.Cigarettes,CC.Status,Countries.Visited
0,75ca3830e0e38b7e65d7569fb9e083907f015b3f3dcc0a...,F,"[1975, 1995)",Northern Hemisphere,England,79.3,1.68,Higher,6.1,240.0,0,51.0
1,d52f884b09b13bcb03b3635759fc799de27df9fda30890...,M,"[1995, 2015)",Northern Hemisphere,England,68.2,1.89,BasicOther,-1.0,26.0,0,38.0
2,bb09872073fa546a07d059a1fadbe1bb230ffd6c3d7508...,F,"[1975, 1995)",Northern Hemisphere,England,101.4,1.82,Higher,8.0,62.0,0,11.0
3,272881abd6f0d5a676e9a7e333bba3bea9d2496ceba9e7...,F,"[1995, 2015)",Northern Hemisphere,England,61.6,1.35,BasicOther,2.5,298.0,0,31.0
4,23386107274c492bc11df837ca29624703d4387b8673e5...,F,"[1955, 1975)",Southern Hemisphere,England,95.9,1.5,BasicOther,3.8,322.0,0,36.0


### Calculating K-anonymity using quasi-identifiers

The following code groups the quasi-identifiers specificied and returns a count of the "unique" rows. 

In [None]:
# Checking k-anonymity for quasi-identifiers
df_count = anon_data.groupby(['Gender', 'Birthyear', 'Location.of.Birth', 
                            'UK.Country', 'Education.Level']).size().reset_index(name = 'Count') 

# Print rows where k-anonymity is 1
print(df_count[df_count['Count']==1])

# Printing the final grouped output 
print("Final grouped output in ascending order of 'Count' - ")
df_count.sort_values("Count")

Empty DataFrame
Columns: [Gender, Birthyear, Location.of.Birth, UK.Country, Education.Level, Count]
Index: []
Final grouped output in ascending order of 'Count' - 


Unnamed: 0,Gender,Birthyear,Location.of.Birth,UK.Country,Education.Level,Count
22,F,"[1995, 2015)",Southern Hemisphere,Other,BasicOther,0
47,M,"[1995, 2015)",Southern Hemisphere,Other,Higher,2
31,M,"[1955, 1975)",Southern Hemisphere,Other,Higher,2
46,M,"[1995, 2015)",Southern Hemisphere,Other,BasicOther,2
19,F,"[1995, 2015)",Northern Hemisphere,Other,Higher,2
39,M,"[1975, 1995)",Southern Hemisphere,Other,Higher,3
23,F,"[1995, 2015)",Southern Hemisphere,Other,Higher,3
7,F,"[1955, 1975)",Southern Hemisphere,Other,Higher,4
30,M,"[1955, 1975)",Southern Hemisphere,Other,BasicOther,4
38,M,"[1975, 1995)",Southern Hemisphere,Other,BasicOther,4


### Viewing the final anonymised dataset

In [None]:
# Re-order columns
anon_data = anon_data[['Sample.ID', 'Gender', 'Birthyear', 'Location.of.Birth', 'UK.Country', 'Weight', 
                        'Height', 'Education.Level', 'Avg.Alcohol', 'Avg.Cigarettes', 'CC.Status', 'Countries.Visited']]

# View the anonymised dataset
anon_data.head()

Unnamed: 0,Sample.ID,Gender,Birthyear,Location.of.Birth,UK.Country,Weight,Height,Education.Level,Avg.Alcohol,Avg.Cigarettes,CC.Status
0,75ca3830e0e38b7e65d7569fb9e083907f015b3f3dcc0a...,F,"[1975, 1995)",Northern Hemisphere,England,79.3,1.68,Higher,6.1,240.0,0
1,d52f884b09b13bcb03b3635759fc799de27df9fda30890...,M,"[1995, 2015)",Northern Hemisphere,England,68.2,1.89,BasicOther,-1.0,26.0,0
2,bb09872073fa546a07d059a1fadbe1bb230ffd6c3d7508...,F,"[1975, 1995)",Northern Hemisphere,England,101.4,1.82,Higher,8.0,62.0,0
3,272881abd6f0d5a676e9a7e333bba3bea9d2496ceba9e7...,F,"[1995, 2015)",Northern Hemisphere,England,61.6,1.35,BasicOther,2.5,298.0,0
4,23386107274c492bc11df837ca29624703d4387b8673e5...,F,"[1955, 1975)",Southern Hemisphere,England,95.9,1.5,BasicOther,3.8,322.0,0


### Creating CSV files for the anonymised data and the reference table

In [None]:
# Output the files into .csv format
output_name = "anon_dataset"
anon_data.to_csv(output_name + ".csv", sep=",", index=None)

reference_table.to_csv("reference_table.csv", sep=",", index=None)

### Encrypting and decrypting the dataset

In [None]:
# Encrypt csv and delete original file
encrypt(output_name + ".csv", output_name + "_encrypted.csv", "key.key")
os.remove(output_name + ".csv")

In [None]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.
