To do:

Replace each unique value with the same random text

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype, is_integer_dtype, is_object_dtype, is_string_dtype

In [2]:
def delete_files(folder):
    file_generator = folder.glob('**/*')
    file_list = list(file_generator)
    for file in file_list:
        file.unlink()
    return list(file_list)


def move_files(samples_folder, file_to_copy, destination_folder):
    (destination_folder / file_to_copy).write_bytes((samples_folder / file_to_copy).read_bytes())
    return file_to_copy


def convert_to_numeric_and_date(df, dayfirst=True):
    for column in df.columns:
        if is_object_dtype(df[column]) or is_string_dtype(df[column]):
            try:
                df[column] = pd.to_numeric(df[column], downcast='integer')
            except:
                try:
                    df[column] = df[column].str.replace('$', '')
                    df[column] = df[column].str.replace(',', '')
                    df[column] = pd.to_numeric(df[column])
                except:
                    try:
                        df[column] = pd.to_datetime(df[column], dayfirst=dayfirst)
                    except:
                        pass
    return df


def random_dates(start, end, seed=1, replace=True, number_of_rows=100):
    dates = pd.date_range(start, end).to_series()
    return dates.sample(number_of_rows, replace=replace, random_state=seed).index
    
    
def dataframe_obfuscator(df, number_of_rows=100):
    for column in df.columns:
        if is_datetime64_any_dtype(df[column]):
            df[column] = random_dates(min(df[column]),max(df[column]), seed=1)
        elif is_integer_dtype(df[column]):
            df[column] = df[column].fillna(0)
            if min(df[column]) < max(df[column]):
                df[column] = np.random.randint(min(df[column]),max(df[column]),size=(number_of_rows))
            else:
                df[column] = min(df[column])
        elif is_numeric_dtype(df[column]):
            df[column] = df[column].fillna(0)
            df[column] = np.random.uniform(min(df[column]),max(df[column]),size=(number_of_rows))
        else:
            df[column] = 'random text'
    return df


def obfuscate_csv(data_file, dayfirst=True, number_of_rows=100):
    df = pd.read_csv(data_file, nrows=number_of_rows)
    df = convert_to_numeric_and_date(df)
    df = dataframe_obfuscator(df)
    df.to_csv(data_file, header=True, index=False)
    return df


def obfuscate_excel(data_file, dayfirst=True, number_of_rows=100):
    df = pd.read_excel(data_file, nrows=number_of_rows)
    display(df.head())
    df = convert_to_numeric_and_date(df)
    df = dataframe_obfuscator(df)
    df.to_excel(data_file, header=True, index=False)
    return df

In [3]:
p = Path.cwd()
obfuscate_csv(p / 'data' / 'data.csv', dayfirst=True, number_of_rows=100)

Unnamed: 0,Purchase_Order_Number_Combined,District_Code,Warehouse_Identification,FPA_Group,FPA_Agreement_Number,FPA_Item_Number,Supplier_Price_Code1,UOI_Original_Quantity,UOI_Original_Net_Price1,UOI_Current_Quantity1,...,Total,Stock_Code,Item_Name_line,Stock_Description,Employee_Identification,FullName,Purchase_Order_Date,Supplier_Number,Supplier_Name,Item_Status_Code
0,random text,random text,random text,random text,0.760261,223,random text,195.306035,8.105034e+06,469.318913,...,2827.686067,2.998104e+06,random text,random text,random text,random text,2001-07-04,random text,random text,1
1,random text,random text,random text,random text,0.582487,185,random text,126.586278,1.615573e+06,108.948563,...,2681.319577,1.196800e+06,random text,random text,random text,random text,2011-10-05,random text,random text,1
2,random text,random text,random text,random text,0.752160,178,random text,132.587661,8.010683e+06,290.354216,...,1520.571316,8.889347e+05,random text,random text,random text,random text,2003-05-05,random text,random text,1
3,random text,random text,random text,random text,0.850549,207,random text,271.795281,5.851519e+06,468.748702,...,5416.027099,2.515904e+06,random text,random text,random text,random text,2008-06-05,random text,random text,1
4,random text,random text,random text,random text,0.633861,270,random text,367.691936,4.942879e+06,477.666990,...,3881.468722,6.772164e+05,random text,random text,random text,random text,2008-10-15,random text,random text,1
5,random text,random text,random text,random text,0.385189,249,random text,262.157876,7.962414e+06,111.340883,...,2859.741059,1.278107e+06,random text,random text,random text,random text,2001-04-04,random text,random text,1
6,random text,random text,random text,random text,0.680826,70,random text,66.007860,6.480366e+06,582.132068,...,1122.176666,2.283849e+06,random text,random text,random text,random text,2012-06-06,random text,random text,1
7,random text,random text,random text,random text,0.638170,147,random text,326.415090,5.513623e+06,370.715114,...,2338.463333,2.984335e+06,random text,random text,random text,random text,2008-07-09,random text,random text,1
8,random text,random text,random text,random text,0.390517,215,random text,182.133827,5.998815e+06,124.146810,...,1123.258915,6.654679e+05,random text,random text,random text,random text,2010-05-05,random text,random text,1
9,random text,random text,random text,random text,0.197937,154,random text,215.427062,3.734918e+06,129.887420,...,2931.612681,2.307193e+06,random text,random text,random text,random text,2004-02-26,random text,random text,1


In [4]:
p = Path.cwd()
obfuscate_excel(p / 'data' / 'data.xls', dayfirst=True, number_of_rows=100)

Unnamed: 0,District Code,Purchase Order Number,Purchase Order Item Number,Warehouse ID,Item Description Detail,Date of Purchase Order,Current Due Date,Original QTY UOI,Current QTY UOI,Original Net Price UOI,...,Purchase Officer Name,Requested By ID,Requested By Name,Authorised By ID,Authorised By Name,Request by Pos,New Requested,New Requested By,Unnamed: 35,Waiver Indicator
0,random text,random text,1,random text,random text,2004-05-09,2018-05-10,0,0,5047648.0,...,random text,76715,random text,78571,random text,3847,25078.706632,random text,0,random text
1,random text,random text,1,random text,random text,2014-08-10,2004-11-17,0,0,4211645.0,...,random text,76596,random text,73742,random text,1635,64936.433,random text,0,random text
2,random text,random text,1,random text,random text,2006-03-10,2015-02-18,0,0,3223764.0,...,random text,75553,random text,77054,random text,3942,23246.371111,random text,0,random text
3,random text,random text,1,random text,random text,2011-04-11,2018-06-14,0,0,9841358.0,...,random text,72812,random text,74330,random text,3119,54548.341954,random text,0,random text
4,random text,random text,1,random text,random text,2011-08-21,2006-09-18,0,0,3833806.0,...,random text,80510,random text,68144,random text,2862,62174.798694,random text,0,random text


Unnamed: 0,District Code,Purchase Order Number,Purchase Order Item Number,Warehouse ID,Item Description Detail,Date of Purchase Order,Current Due Date,Original QTY UOI,Current QTY UOI,Original Net Price UOI,...,Purchase Officer Name,Requested By ID,Requested By Name,Authorised By ID,Authorised By Name,Request by Pos,New Requested,New Requested By,Unnamed: 35,Waiver Indicator
0,random text,random text,1,random text,random text,2004-05-24,2018-05-25,0,0,9.589672e+06,...,random text,75755,random text,78281,random text,2623,48551.912050,random text,0,random text
1,random text,random text,1,random text,random text,2014-08-25,2004-12-02,0,0,7.607661e+06,...,random text,80962,random text,71353,random text,2852,15177.755150,random text,0,random text
2,random text,random text,1,random text,random text,2006-03-25,2015-03-05,0,0,6.599571e+06,...,random text,70930,random text,69185,random text,4102,46637.450473,random text,0,random text
3,random text,random text,1,random text,random text,2011-04-26,2018-06-29,0,0,3.894846e+06,...,random text,79836,random text,77719,random text,1127,72819.367288,random text,0,random text
4,random text,random text,1,random text,random text,2011-09-05,2006-10-03,0,0,4.710326e+06,...,random text,79813,random text,68325,random text,3429,51426.545825,random text,0,random text
5,random text,random text,1,random text,random text,2004-02-23,2011-11-04,0,0,4.952511e+06,...,random text,67134,random text,77726,random text,3649,71801.530191,random text,0,random text
6,random text,random text,1,random text,random text,2015-04-27,2012-03-15,0,0,8.226127e+06,...,random text,71480,random text,73361,random text,3925,44605.505546,random text,0,random text
7,random text,random text,1,random text,random text,2011-05-30,2018-02-13,0,0,8.625302e+06,...,random text,67245,random text,73480,random text,1559,28073.534635,random text,0,random text
8,random text,random text,1,random text,random text,2013-03-25,2004-09-02,0,0,3.858707e+06,...,random text,73389,random text,82549,random text,2208,57650.467604,random text,0,random text
9,random text,random text,1,random text,random text,2007-01-16,2015-11-05,0,0,7.107095e+06,...,random text,79022,random text,70788,random text,4072,64710.051563,random text,0,random text


In [6]:
!jupyter nbconvert \
    --TagRemovePreprocessor.enabled=True \
    --TagRemovePreprocessor.remove_cell_tags="['build', 'test']" \
    --TemplateExporter.exclude_output=True \
    --to python "obfuscate_files.ipynb"

first_line = """'Obfuscate files package'

__version__ = '0.1'

"""
script_file = Path.cwd() / 'obfuscate_files.py'
script = script_file.read_text()
script_file.write_text(first_line + script)
username = script_file.parent.parent.name
system_name = script_file.parent.name
standardised_script_name = f'pipomatic_{username}_{system_name}.py'
script_file.replace(script_file.parent / standardised_script_name)
standardised_script_name

[NbConvertApp] Converting notebook obfuscate_files.ipynb to python
[NbConvertApp] Writing 3586 bytes to obfuscate_files.py


'pipomatic_pipomatic_obfuscate_files.py'

In [8]:
!black "pipomatic_pipomatic_obfuscate_files.py"

reformatted pipomatic_pipomatic_obfuscate_files.py
All done! \u2728 \U0001f370 \u2728
1 file reformatted.
