<a href="https://colab.research.google.com/github/pratyushlokhande/BigData-Privacy-Preservation/blob/main/PrivacyPreservation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Generating Dataset**

In [62]:
# importing Libraries
import pandas as pd
import random
!pip install Faker
from faker import Faker
from random import randrange
import time




In [79]:
# initialisations 
user_count = 100

fake = Faker()

users = []

In [80]:
# Generating Data
for user_id in range(user_count):

    #create customer's name
    name = fake.name()

    # Create gender
    gender = random.choice(["M", "F"])

    # Create age
    age = random.randint(18, 60)

    # Create email 
    email = fake.ascii_email()

    #create product ID in 8-digit barcode
    pincode = fake.ean(length=8)[:6]
    
    #create amount spent
    salary = fake.pyfloat(right_digits=2, positive=True, min_value=1, max_value=100)*1000

    users.append([name, gender, age, email, pincode, salary])


In [81]:
# converting to dataframe
df = pd.DataFrame(users, columns=['Name', 'Gender','Age', 'Email', 'Pincode', 'Salary'])
df

Unnamed: 0,Name,Gender,Age,Email,Pincode,Salary
0,Lucas Webb,F,38,elijah25@gmail.com,707876,30000.0
1,Brian Johnson,M,58,rebeccaholmes@roth.com,739289,41680.0
2,Pamela Murillo,M,48,jeffreygonzales@roberson.com,069659,91110.0
3,Dr. Sandra Benson,M,43,whitney41@moreno.com,877689,48150.0
4,Edward Khan,M,51,debragarcia@hotmail.com,416872,84800.0
...,...,...,...,...,...,...
95,Teresa Harris,F,57,ronaldwest@hotmail.com,949812,60600.0
96,Mitchell Jones,M,47,stacybell@smith.biz,585756,75300.0
97,Juan Pineda,M,22,turnersylvia@cole.com,018785,70540.0
98,Spencer Hawkins,M,50,isalinas@mclean-miller.com,012689,91850.0


**Weight Table**

In [82]:
# Weight Table
weight_table = {
              'name': 4,
              'gender': 2,
              'age': 3, 
              'email': 5,
              'pincode': 6,
              'salary': 1   
            }

weight_table

{'age': 3, 'email': 5, 'gender': 2, 'name': 4, 'pincode': 6, 'salary': 1}

**Encrypting Algorithms**

In [86]:
# Encryption Algorithms

# for age encryption
def rangeEncryption(data):
  start = time.time()
  res = []
  for val in data:
    res.append("[" + str(int(val/10)*10) + "-" + str(int(val/10)*10+9) + "]")
  end = time.time()
  t = (end - start)*1000
  return {'data': res, 'time': t}


# for pincode encryption 
def pinEncryption(data):
  start = time.time()
  res = []
  for val in data:
    res.append(val[:3] + "**" + val[-1:])
  end = time.time()
  t = (end - start)*1000
  return {'data': res, 'time': t}


# for email encryption
def mailEncryption(data):
  start = time.time()
  res = []
  for val in data:
    updatedVal = val.split("@")[0][0] + "*"*(len(val.split("@")[0])-3) + val.split("@")[0][-2:] + "@" + val.split("@")[1]
    res.append(updatedVal)
  end = time.time()
  t = (end - start)*1000
  return {'data': res, 'time': t}

**S-Table**

In [137]:
# S-Table
s_table = {
              'age': weight_table['age']/rangeEncryption(df['Age'])['time'], 
              'email': weight_table['email']/mailEncryption(df['Email'])['time'],
              'pincode': weight_table['pincode']/pinEncryption(df['Pincode'])['time'],
            }

print(s_table)

# Sorting S-Table
# print(dict(sorted(s_table.items(), key=lambda item: item[1], reverse=True)))

{'age': 8.943078891257995, 'email': 13.697922926192032, 'pincode': 53.20470190274841}


**Execution Time**

In [138]:
def getExecTime(data, algo, flag):
  start = time.time()
  data = data.unique()
  if flag:
    data = algo(data)['data']
  print(sorted(data))
  end = time.time()
  t = (end - start)*1000
  return t


**M-Table**

In [185]:
# M-Table
m_table = {
              'age': { 'amount': len(df['Age'].unique()), 
                      'with_encrypt': { 'time': getExecTime(df['Age'], rangeEncryption, True), 'weight': weight_table['age'] },
                      'without_encrypt': { 'time': getExecTime(df['Age'], rangeEncryption, False), 'weight': 0 }  
                     },
              'email': { 'amount': len(df['Email'].unique()), 
                      'with_encrypt': { 'time': getExecTime(df['Email'], mailEncryption, True), 'weight': weight_table['email'] },
                      'without_encrypt': { 'time': getExecTime(df['Email'], mailEncryption, False), 'weight': 0 }  
                     }, 
              'pincode': { 'amount': len(df['Pincode'].unique()), 
                      'with_encrypt': { 'time': getExecTime(df['Pincode'], pinEncryption, True), 'weight': weight_table['pincode'] },
                      'without_encrypt': { 'time': getExecTime(df['Pincode'], pinEncryption, False), 'weight': 0 }  
                     }
          }

['[10-19]', '[10-19]', '[20-29]', '[20-29]', '[20-29]', '[20-29]', '[20-29]', '[20-29]', '[20-29]', '[20-29]', '[20-29]', '[30-39]', '[30-39]', '[30-39]', '[30-39]', '[30-39]', '[30-39]', '[30-39]', '[40-49]', '[40-49]', '[40-49]', '[40-49]', '[40-49]', '[40-49]', '[40-49]', '[40-49]', '[40-49]', '[40-49]', '[50-59]', '[50-59]', '[50-59]', '[50-59]', '[50-59]', '[50-59]', '[50-59]', '[50-59]', '[50-59]', '[60-69]']
[18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 32, 33, 34, 35, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60]
['a***********ez@gmail.com', 'a**********er@bennett.com', 'a**********er@parsons-bailey.biz', 'a********on@camacho.com', 'a*****39@green.net', 'a*****47@hotmail.com', 'a*****55@hotmail.com', 'a*****79@hotmail.com', 'b**********as@gmail.com', 'b*******93@abbott.com', 'b*******on@perkins-moore.biz', 'b*****40@hotmail.com', 'b*****oe@gibson.com', 'b****on@dickson.org', 'c**********ny@yahoo.com', 'c*********ry@gmail.com', 'c****

In [186]:
print(m_table)

{'age': {'amount': 38, 'with_encrypt': {'time': 1.6617774963378906, 'weight': 3}, 'without_encrypt': {'time': 0.7681846618652344, 'weight': 0}}, 'email': {'amount': 100, 'with_encrypt': {'time': 0.9837150573730469, 'weight': 5}, 'without_encrypt': {'time': 0.606536865234375, 'weight': 0}}, 'pincode': {'amount': 100, 'with_encrypt': {'time': 0.5309581756591797, 'weight': 6}, 'without_encrypt': {'time': 0.41747093200683594, 'weight': 0}}}


**Time Constrains**

In [220]:
# finding time constrains

tc = 7 # given

tm = 0
for obj in m_table:
  tm += m_table[obj]['without_encrypt']['time']

tp = 0
for obj in m_table:
  tp += m_table[obj]['with_encrypt']['time']

print(tc, tm, tp)


7 1.7921924591064453 3.176450729370117


**DED**

In [221]:
# DED

def DED():
  p = []
  ts = tc - (tm + tp)

  for obj in s_table:
    print(ts)
    if ts > (m_table[obj]['with_encrypt']['time'] - m_table[obj]['without_encrypt']['time']):
      p.append(obj)
      ts -= (m_table[obj]['with_encrypt']['time'] - m_table[obj]['without_encrypt']['time'])
    else:
      break
  return p


In [204]:
print(DED()) # tc = 6.3

1.3313568115234373
0.4377639770507811
0.0605857849121092
['age', 'email']


In [207]:
print(DED()) # tc = 6

1.0313568115234375
0.13776397705078125
['age']


In [210]:
print(DED()) # tc = 7

2.0313568115234375
1.1377639770507812
0.7605857849121094
['age', 'email', 'pincode']


**Encrypting the Data based on choice**

In [None]:
# Encrypt data available in p
for obj in DED():
  if obj == 'age':
    df['Age'] = rangeEncryption(df['Age'])['data']
  elif obj == 'email':
    df['Email'] = mailEncryption(df['Email'])['data']
  else :
    df['Pincode'] = pinEncryption(df['Pincode'])['data']

In [216]:
df # tc = 7

Unnamed: 0,Name,Gender,Age,Email,Pincode,Salary
0,Lucas Webb,F,[30-39],e*****25@gmail.com,707**6,30000.0
1,Brian Johnson,M,[50-59],r**********es@roth.com,739**9,41680.0
2,Pamela Murillo,M,[40-49],j************es@roberson.com,069**9,91110.0
3,Dr. Sandra Benson,M,[40-49],w******41@moreno.com,877**9,48150.0
4,Edward Khan,M,[50-59],d********ia@hotmail.com,416**2,84800.0
5,Stephanie Ball,M,[40-49],t*********is@hotmail.com,170**4,30370.0
6,Monica Robinson,M,[50-59],k*******59@parker-carroll.org,597**9,16530.0
7,Annette Burns,M,[50-59],t*******on@cooper.com,877**7,9410.0
8,Carl Marquez,M,[30-39],v****er@harris.com,556**8,18770.0
9,Margaret Thomas,F,[60-69],c**********ny@yahoo.com,982**2,96170.0
