# Anonymizing Data Project


In [2]:
import pandas as pd
import numpy as np
import scipy.stats
import datetime
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder
# get rid of warnings
import warnings
warnings.filterwarnings("ignore")
# get more than one output per Jupyter cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# for functions we implement later
#from utils import plot_result
import scipy.stats as st
from scipy.stats import ncf




### Function for Perturbation

In [3]:
# function from https://github.com/r0f1/dev_to_posts/blob/master/fake_data/utils.py 
#Florian Rohrer


def best_fit_distribution(data, bins=200, ax=None, distributions=None):
    """Find the scipy.stats continuous distribution that best fits ``data``.

    Each candidate is fitted with ``distribution.fit(data)`` and scored by the
    sum of squared errors (SSE) between its PDF, evaluated at the histogram bin
    centres, and the density histogram of ``data``. The lowest positive SSE wins.

    Parameters
    ----------
    data : array-like
        One-dimensional numeric sample.
    bins : int, optional
        Number of histogram bins used for scoring.
    ax : matplotlib axes, optional
        When given, every fitted PDF is drawn on it (plot errors are ignored).
    distributions : iterable of scipy.stats distributions, optional
        Candidates to try. When omitted, a built-in list of distribution names
        is used, restricted to those the installed SciPy actually provides.

    Returns
    -------
    tuple
        ``(name, params)``: the winning distribution's ``name`` attribute and
        the tuple returned by its ``fit`` (shape arguments, then loc, then
        scale). If no candidate produces a usable score the result is
        ``("norm", (0.0, 1.0))``.
    """
    # Density histogram of the data; bin edges are replaced by bin centres
    y, x = np.histogram(data, bins=bins, density=True)
    x = (x[:-1] + x[1:]) / 2.0

    if distributions is None:
        candidate_names = (
            "alpha", "anglit", "arcsine", "beta", "betaprime", "bradford", "burr", "cauchy", "chi", "chi2",
            "cosine", "dgamma", "dweibull", "erlang", "expon", "exponnorm", "exponweib", "exponpow", "f",
            "fatiguelife", "fisk", "foldcauchy", "foldnorm", "frechet_r", "frechet_l", "genlogistic",
            "genpareto", "gennorm", "genexpon", "genextreme", "gausshyper", "gamma", "gengamma",
            "genhalflogistic", "gilbrat", "gompertz", "gumbel_r", "gumbel_l", "halfcauchy", "halflogistic",
            "halfnorm", "halfgennorm", "hypsecant", "invgamma", "invgauss", "invweibull", "johnsonsb",
            "johnsonsu", "ksone", "kstwobign", "laplace", "levy", "levy_l", "levy_stable", "logistic",
            "loggamma", "loglaplace", "lognorm", "lomax", "maxwell", "mielke", "nakagami", "ncx2", "ncf",
            "nct", "norm", "pareto", "pearson3", "powerlaw", "powerlognorm", "powernorm", "rdist",
            "reciprocal", "rayleigh", "rice", "recipinvgauss", "semicircular", "t", "triang", "truncexpon",
            "truncnorm", "tukeylambda", "uniform", "vonmises", "vonmises_line", "wald", "weibull_min",
            "weibull_max", "wrapcauchy",
        )
        # Look the names up instead of writing st.<name> directly: some of them
        # (e.g. frechet_r, frechet_l) are missing from newer SciPy releases, and
        # a direct attribute access would raise before any fitting is attempted.
        distributions = [getattr(st, name) for name in candidate_names if hasattr(st, name)]

    # Fallback result, replaced as soon as a candidate scores a finite positive SSE
    best_distribution = st.norm
    best_params = (0.0, 1.0)
    best_sse = np.inf

    for distribution in distributions:
        try:
            # Fitting is noisy for ill-suited distributions; silence it locally
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore')

                params = distribution.fit(data)

                # fit() returns (*shape_args, loc, scale)
                arg, loc, scale = params[:-2], params[-2], params[-1]

                # Score: squared error between fitted PDF and the histogram
                pdf = distribution.pdf(x, *arg, loc=loc, scale=scale)
                sse = np.sum(np.power(y - pdf, 2.0))

                # Plotting is optional and must never abort the search
                if ax is not None:
                    try:
                        pd.Series(pdf, x).plot(ax=ax)
                    except Exception:
                        pass

                # NaN and zero scores fail this comparison and are ignored
                if best_sse > sse > 0:
                    best_distribution = distribution
                    best_params = params
                    best_sse = sse

        except Exception:
            # Best effort: a candidate that cannot be fitted is simply skipped
            continue

    return (best_distribution.name, best_params)

In [4]:
# Function to generate perturbed data based on the function above

def generate_like_df(df, categorical_cols, continuous_cols, best_distributions_name, best_distribution_parm, n, seed=0):
    """Build a synthetic DataFrame with ``n`` rows that resembles ``df``.

    Categorical columns are resampled independently of each other, with each
    value drawn according to its relative frequency in ``df``. Continuous
    columns are filled with the output of ``make_pdf`` for the distribution
    and parameters supplied for that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; must contain every name in ``categorical_cols``.
    categorical_cols : list of str
        Columns to resample by observed frequency.
    continuous_cols : list of str
        Columns to generate from a fitted distribution.
    best_distributions_name : str or sequence
        Either one entry per continuous column, or a single name that is then
        applied to every continuous column. Entries may be scipy.stats
        distribution names or distribution objects.
    best_distribution_parm : tuple or sequence of tuples
        Fit parameters ``(*shape_args, loc, scale)``: one tuple per continuous
        column, or a single tuple when a single name is given.
    n : int
        Number of rows to generate.
    seed : int, optional
        Seed for NumPy's global random generator.

    Returns
    -------
    pandas.DataFrame
        Categorical columns first, then continuous columns.

    Raises
    ------
    ValueError
        If per-column sequences are given whose lengths do not match
        ``continuous_cols``.
    """
    np.random.seed(seed)
    d = {}

    for c in categorical_cols:
        counts = df[c].value_counts()
        # Normalise by the counted rows: value_counts() leaves out missing
        # values, so dividing by len(df) would not sum to 1 when any exist.
        d[c] = np.random.choice(list(counts.index), p=(counts / counts.sum()).values, size=n)

    if isinstance(best_distributions_name, str):
        # Single name + single parameter tuple: share them across all columns
        names = [best_distributions_name] * len(continuous_cols)
        param_sets = [best_distribution_parm] * len(continuous_cols)
    else:
        names = list(best_distributions_name)
        param_sets = list(best_distribution_parm)
        if not (len(names) == len(param_sets) == len(continuous_cols)):
            raise ValueError(
                "expected one distribution name and one parameter tuple per continuous column"
            )

    for c, name, params in zip(continuous_cols, names, param_sets):
        # Accept either a scipy.stats attribute name or a distribution object
        dist = getattr(st, name) if isinstance(name, str) else name
        d[c] = make_pdf(dist, params, n)

    return pd.DataFrame(d)

In [5]:
def make_pdf(dist, params, size=10000):
    """Return evenly spaced values covering the central 98% of a distribution.

    Despite its name, this function does not return density values: the result
    holds the x positions, running from the 1st to the 99th percentile of
    ``dist`` under the given parameters.

    Parameters
    ----------
    dist : scipy.stats distribution
        Object providing ``ppf``.
    params : sequence
        ``(*shape_args, loc, scale)`` as returned by ``dist.fit``.
    size : int, optional
        Number of points to generate.

    Returns
    -------
    pandas.Series
        ``size`` values from ``np.linspace(ppf(0.01), ppf(0.99), size)``.
    """
    # Split the parameter sequence: leading shape args, then loc, then scale
    arg, loc, scale = params[:-2], params[-2], params[-1]

    # 1st and 99th percentiles keep the range clear of extreme tails;
    # unpacking an empty ``arg`` is a no-op, so one call form is enough.
    start = dist.ppf(0.01, *arg, loc=loc, scale=scale)
    end = dist.ppf(0.99, *arg, loc=loc, scale=scale)

    return pd.Series(np.linspace(start, end, size))

### Anonymizing Data Process 

In [6]:

# Read the employee records from the project repository into a DataFrame
Employee_data_DF = pd.read_csv(
    'https://raw.githubusercontent.com/skhayden/MSDS-7349-Sec403-/master/Class_Project/Employee_data.csv',
    low_memory=False,
)
# Discard the two columns that are not needed downstream
Employee_data_DF = Employee_data_DF.drop(columns=['Job.1', 'Unnamed: 0'])
Employee_data_DF.head()

Unnamed: 0,First Name,Last Name,Date of Birth,Job,Address,Salary,SSN,Employee_Id,Office_Location,Building_number,Phone_Number,Email
0,Matthew,Johnson,4/11/1993,Research scientist (physical sciences),USS Walker\nFPO AA 91930,64822,691-17-3378,P37429,West Andrew,961,3298685475,Matthewgmail.com
1,Elizabeth,Frazier,6/17/1989,"Therapist, speech and language","8283 Kimberly Streets Apt. 001\nEast Edward, D...",37351,675-40-3501,P90953,South Tammymouth,590,524-601-7567x7090,EFrazier93erez.com
2,Lori,Vargas,8/24/1959,"Designer, furniture","235 Devin Ridges\nCraneshire, OR 16065",74379,821-81-4954,P38017,West Arthur,198,316-095-9037x69942,Loriegregory@smith.com
3,Sarah,Espinoza,12/1/1983,Water engineer,"78762 Eileen Camp\nNelsonhaven, ID 36069",81275,124-10-4970,P91959,Greerborough,10033,+1-209-595-5257x826,SEspinozad-perry.com
4,Melissa,Hunter,9/24/1949,Agricultural engineer,"3346 Anderson Lakes\nShawville, IL 85803",35409,789-99-6633,P36920,Davisburgh,633,44421409,Melissazkimberly@wiley.com


In [7]:
# Split the column names into two groups: categorical and continuous (perturbed)
# Approach: https://dev.to/r0f1/a-simple-way-to-anonymize-data-with-python-and-pandas-79g
all_columns = list(Employee_data_DF)

# Everything except Salary and Date of Birth is handled as categorical
categorical_cols = [name for name in all_columns if name not in ('Salary', 'Date of Birth')]

# Salary is the only column that ends up in the perturbation list
Perterbation_col = [name for name in all_columns if name == 'Salary']

# Date of Birth goes in neither list; 'Age' is appended to the categorical names
categorical_cols.append('Age')






##### Suppression and Bucketization columns

In [8]:
# Suppression: overwrite every value of the identifying columns with a fixed mask
Suppresrion_col = ['Last Name','SSN','Employee_Id']
for c in (name for name in categorical_cols if name in Suppresrion_col):
    Employee_data_DF[c] = '#####'

Employee_data_DF.head()
    

Unnamed: 0,First Name,Last Name,Date of Birth,Job,Address,Salary,SSN,Employee_Id,Office_Location,Building_number,Phone_Number,Email
0,Matthew,#####,4/11/1993,Research scientist (physical sciences),USS Walker\nFPO AA 91930,64822,#####,#####,West Andrew,961,3298685475,Matthewgmail.com
1,Elizabeth,#####,6/17/1989,"Therapist, speech and language","8283 Kimberly Streets Apt. 001\nEast Edward, D...",37351,#####,#####,South Tammymouth,590,524-601-7567x7090,EFrazier93erez.com
2,Lori,#####,8/24/1959,"Designer, furniture","235 Devin Ridges\nCraneshire, OR 16065",74379,#####,#####,West Arthur,198,316-095-9037x69942,Loriegregory@smith.com
3,Sarah,#####,12/1/1983,Water engineer,"78762 Eileen Camp\nNelsonhaven, ID 36069",81275,#####,#####,Greerborough,10033,+1-209-595-5257x826,SEspinozad-perry.com
4,Melissa,#####,9/24/1949,Agricultural engineer,"3346 Anderson Lakes\nShawville, IL 85803",35409,#####,#####,Davisburgh,633,44421409,Melissazkimberly@wiley.com


In [9]:
# Bucketization: replace each exact date of birth with an age in whole years
birth_dates = pd.to_datetime(Employee_data_DF['Date of Birth'])

# Elapsed time divided by 365.25; .dt.days keeps the whole-number part
age_in_years = ((datetime.datetime.now() - birth_dates) / 365.25).dt.days

# Add Age, remove the original date column, and put the columns in display order
Employee_data_DF = (
    Employee_data_DF
    .assign(Age=age_in_years)
    .drop(columns=['Date of Birth'])
    [[
        'First Name', 'Last Name', 'Age', 'Job', 'Address', 'Salary', 'SSN',
        'Employee_Id', 'Office_Location', 'Building_number', 'Phone_Number',
        'Email',
    ]]
)

Employee_data_DF.head()

Unnamed: 0,First Name,Last Name,Age,Job,Address,Salary,SSN,Employee_Id,Office_Location,Building_number,Phone_Number,Email
0,Matthew,#####,25,Research scientist (physical sciences),USS Walker\nFPO AA 91930,64822,#####,#####,West Andrew,961,3298685475,Matthewgmail.com
1,Elizabeth,#####,29,"Therapist, speech and language","8283 Kimberly Streets Apt. 001\nEast Edward, D...",37351,#####,#####,South Tammymouth,590,524-601-7567x7090,EFrazier93erez.com
2,Lori,#####,59,"Designer, furniture","235 Devin Ridges\nCraneshire, OR 16065",74379,#####,#####,West Arthur,198,316-095-9037x69942,Loriegregory@smith.com
3,Sarah,#####,35,Water engineer,"78762 Eileen Camp\nNelsonhaven, ID 36069",81275,#####,#####,Greerborough,10033,+1-209-595-5257x826,SEspinozad-perry.com
4,Melissa,#####,69,Agricultural engineer,"3346 Anderson Lakes\nShawville, IL 85803",35409,#####,#####,Davisburgh,633,44421409,Melissazkimberly@wiley.com


##### Perturbation columns

In [10]:
# Fit every perturbation column and record its (distribution name, parameters) pair.
# best_fit_name / best_fit_params are deliberately left at notebook scope after the
# loop (holding the last column's fit) because later code reads them.
best_distributions = []
for column in Perterbation_col:
    best_fit_name, best_fit_params = best_fit_distribution(Employee_data_DF[column], bins=50)
    best_distributions.append((best_fit_name, best_fit_params))

In [14]:
# generate_like_df pairs each perturbation column with one distribution name and
# one parameter tuple, so pass parallel lists built from best_distributions rather
# than the single name/tuple left over from the last iteration of the fitting loop.
fitted_names = [name for name, _ in best_distributions]
fitted_params = [params for _, params in best_distributions]

gendf = generate_like_df(
    Employee_data_DF,
    categorical_cols,
    Perterbation_col,
    fitted_names,
    fitted_params,
    n=len(Employee_data_DF),
)
gendf.shape

# Salaries are reported to two decimal places
gendf['Salary'] = gendf['Salary'].round(2)



(20591, 12)

#### Anonymized Data

In [15]:
# Preview the first five rows of the generated data
gendf.head(n=5)

Unnamed: 0,First Name,Last Name,Job,Address,SSN,Employee_Id,Office_Location,Building_number,Phone_Number,Email,Age,Salary
0,Katherine,#####,Commercial horticulturist,"3666 Huber Ports\nNorth Andrewland, NE 40103",#####,#####,Lake Eric,2531,+1-670-378-8963x88503,Brittanyhilip@smith.net,32,29682.08
1,Bradley,#####,"Engineer, civil (consulting)","9487 Parker Circle Suite 740\nEast Joseph, AZ ...",#####,#####,Lake Jesus,45079,(048)059-6528x2643,KWolflson.com,60,29687.2
2,Maria,#####,Nutritional therapist,"2725 Theresa Vista\nLauriechester, RI 76918",#####,#####,East Richard,4344,(453)559-7454x5643,PMartingmail.com,70,29692.32
3,Brenda,#####,Psychiatrist,"07009 Patricia Forges Suite 468\nAndreabury, O...",#####,#####,South Crystalfurt,9519,886.588.5883x53426,Brandonrs.com,62,29697.44
4,Samantha,#####,Software engineer,"1902 Madison Crescent Suite 676\nSouth Mary, A...",#####,#####,Melissachester,44693,515.474.8395x9622,Carrie52her52@yahoo.com,72,29702.57


In [16]:
# Save the generated data next to the notebook. A relative path keeps the cell
# runnable on any machine, and index=False stops the row index from being written
# as an extra unnamed column.
gendf.to_csv('Gen_Employee_data.csv', index=False)