## Anonmymizing the customer_information.csv and calculating the k-anonymity of the new dataset

Importing necessary packages

In [73]:
import pandas as pd
import numpy as np
import hashlib
import re
import os
import math
from geopy.geocoders import Nominatim
import country_converter as coco
from functions import *

Loading the original dataset

In [80]:
# Read in data to be anonymised
data = pd.read_csv("Data/customer_information.csv")

# Declaring variables
postcode_dictionary = pd.read_csv('Data/postcode_region.csv')
northern_countries = ["Norway"]
southern_countries = ["Svalbard & Jan Mayen Islands"]

# Create anon_data variable as initial data with unneeded direct identifiers dropped
#anon_data = data.drop(['given_name', 'surname', 'phone_number', 'national_insurance_number', 'bank_account_number'], axis=1)
anon_data = pd.DataFrame()

In [75]:
# Clean NIN formatting and assign Sample ID as a hashed form of the NIN
key = os.urandom(16)
data["national_insurance_number"], anon_data['Sample.ID'], salts = zip(*data["national_insurance_number"].apply(
    lambda x: hash(key, re.sub(r'(.{2})(?!$)','\\1 ', x.replace(' ', '') ))))

In [76]:
# Create a reference table between NIN and respective hashed NIN
reference_table = pd.DataFrame()
reference_table['Hashed.NIN'] = anon_data['Sample.ID']
reference_table['Salt'] = salts
reference_table['Key'] = key.hex()
reference_table['NIN'] = data['national_insurance_number']

In [77]:
# Assign gender
anon_data['Gender'] = data['gender']

# Assign birthdate as banded birthyears
# Select the birth year only
birthyears = pd.DatetimeIndex(data['birthdate']).year
# Band the birth years into 5-year intervals
anon_data['Birthyear'] = pd.cut(birthyears, np.arange(birthyears.min(), birthyears.max()+20, 20), right=False)

# Assign UK country derived from postcode
anon_data['Postcode'] = data['postcode'].apply(lambda x: re.search('[a-zA-Z]*', x).group(0))
anon_data = pd.merge(anon_data, postcode_dictionary.iloc[: , :-1], on='Postcode', how='left')


In [78]:
# Assign weight and height as banded weights and heights
#anon_data['Weight'] = pd.cut(data['weight'], np.arange(math.floor(data['weight'].min()), math.floor(data['weight'].max()+20), 20), right=False)
weight_noise = np.random.normal(0,1,1000)*5
anon_data['Weight'] = round(data['weight']+weight_noise, 1)


In [79]:
# Round minimum and maximum heights to nearest one-fifth prior to making bins
height_noise = np.random.normal(0,1,1000)/5
anon_data['Height'] = round(data['height']+height_noise, 2)
bmi = data['weight'] / data['height']**2


In [27]:
def parse_country(country_name):
    country = coco.convert(country_name, to='name_short', include_obsolete=True)
    return country

def country_to_hemisphere2(country_name):
    clean_country = parse_country(country_name)
    try:  
        if country_name in southern_countries: # Hard-coded fix for unmatched territories
            return "Southern Hemisphere" 
        elif country_name in northern_countries:
            "Northern Hemisphere"
        else:
            return "Southern" if Nominatim(user_agent="CDM").geocode((parse_country(country_name))).latitude < 0 else "Northern" + " Hemisphere"
    except:
        print("Input:",country_name,", output:", "Error")
        return "Error"
    
# Assign education level as banded education level
anon_data['Education.Level'] = data['education_level'].map(lambda x: "Higher" if x in ["bachelor", "masters", "phD"] else "BasicOther")

anon_data['Location.of.Birth'] = data['country_of_birth'].apply(lambda x: country_to_hemisphere2(x))

# Output the files
anon_data.to_csv("output.csv", sep=",", index=None)
reference_table.to_csv("reference_table.csv", sep=",", index=None)

Input: Micronesia , output: Error


In [61]:
#anon_data['Countries.Visited'] = pd.cut(data['n_countries_visited'], np.arange(data['n_countries_visited'].min(), data['n_countries_visited'].max()+30, 30), right=False)

countries_noise = np.random.normal(0,1,1000)*5
anon_data['Countries.Visited'] = round(data['n_countries_visited']+countries_noise)
alcohol_noise = np.random.normal(0,1,1000)*5
anon_data['Avg.Alcohol'] = round(data['avg_n_drinks_per_week']+alcohol_noise)
anon_data['Avg.Alcohol.True'] = data['avg_n_drinks_per_week']

smoking_noise = np.random.normal(0,1,1000)*10
anon_data['Avg.Cigarettes'] = round(data['avg_n_cigret_per_week']+smoking_noise)
anon_data['Avg.Cigarettes.True'] = data['avg_n_cigret_per_week']

anon_data


Unnamed: 0,Sample.ID,Gender,Birthyear,Postcode,Region,Region.1,Weight,Height,Education.Level,Location.of.Birth,Countries.Visited,Avg.Alcohol,Avg.Cigarettes,Avg.Cigarettes.True,Avg.Alcohol.True
0,86dc5b83e5336f539043a6411e4812abd9cf1f2d6ba266...,F,"[1975, 1995)",LS,England,North East,76.7,1.78,Higher,Northern Hemisphere,66.0,4.0,205.0,218.8,6.5
1,701dc8acfc808dd47c4a49a734561685c4a54e5b765885...,M,"[1995, 2015)",M,England,North West,62.5,1.36,BasicOther,Northern Hemisphere,47.0,3.0,50.0,43.6,0.7
2,a9de10ea50c1f274761304b1807baa18f2f02d5c3849d4...,F,"[1975, 1995)",SO,England,South East,101.0,1.98,Higher,Northern Hemisphere,11.0,10.0,68.0,59.1,7.8
3,98fde3f4f52e6be6688a00776a69dbbbb1683e2380236a...,F,"[1995, 2015)",B,England,West Midlands,57.5,1.41,BasicOther,Northern Hemisphere,31.0,4.0,300.0,284.2,4.6
4,cf6ecece0617faacc5d9966015fd5da67ab4939e007c90...,F,"[1955, 1975)",TQ,England,South West,92.9,1.45,BasicOther,Southern Hemisphere,34.0,5.0,334.0,348.8,4.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,f00999557e70e5279a69048ba331bdc68d03ec08b40431...,M,"[1955, 1975)",SA,Other,Wales,97.7,1.71,BasicOther,Northern Hemisphere,13.0,2.0,255.0,262.4,1.8
996,5fb67daf71702a91fad102390bc94c22bcad88c86f6feb...,M,"[1995, 2015)",TS,England,North East,56.5,1.79,BasicOther,Northern Hemisphere,33.0,13.0,333.0,336.2,7.7
997,fffaad19de06d97d7822cc42965f00d05733862130e8bd...,F,"[1955, 1975)",G,Other,Scotland,89.9,1.99,BasicOther,Southern Hemisphere,31.0,-1.0,68.0,55.7,0.9
998,eb85b796391facdb61d959af11e5c4cd13268362641f1f...,F,"[1955, 1975)",CT,England,South East,76.1,1.31,Higher,Northern Hemisphere,33.0,-3.0,418.0,430.5,4.7


In [55]:
# Calculating K-anonymity, 2 anonymous dataset
k = 2
############
# Checking k-anonymity, fix this for variable names
#df_count = anon_data.groupby(['Gender', 'Birthyear', 'Region', 'Location.of.Birth', 'ht wt' 'Education.Level', 'Countries.Visited' 'avg-alc' 'avg-sn' 'cc']).size().reset_index(name = 'Count') 
df_count = anon_data.groupby(['Gender', 'Birthyear', 'Location.of.Birth', 'Region', 'Education.Level', 'Countries.Visited']).size().reset_index(name = 'Count') 
#print(df_count)
print(df_count[df_count['Count']==1])
# print(df_count.size().describe())

# print(anon_data.groupby('Education.Level').size())
# print(anon_data.groupby('Continent.of.Birth').size())
# print(anon_data.groupby('Region').size())

     Gender     Birthyear    Location.of.Birth   Region Education.Level  \
278       F  [1955, 1975)  Northern Hemisphere  England      BasicOther   
282       F  [1955, 1975)  Northern Hemisphere  England      BasicOther   
283       F  [1955, 1975)  Northern Hemisphere  England      BasicOther   
289       F  [1955, 1975)  Northern Hemisphere  England      BasicOther   
290       F  [1955, 1975)  Northern Hemisphere  England      BasicOther   
...     ...           ...                  ...      ...             ...   
4744      M  [1995, 2015)  Southern Hemisphere  England          Higher   
4779      M  [1995, 2015)  Southern Hemisphere    Other      BasicOther   
4794      M  [1995, 2015)  Southern Hemisphere    Other      BasicOther   
4832      M  [1995, 2015)  Southern Hemisphere    Other          Higher   
4851      M  [1995, 2015)  Southern Hemisphere    Other          Higher   

      Countries.Visited  Count  
278                -2.0      1  
282                 2.0      1  
