In [8]:
import pickle
import numpy as np
import pandas as pd
import time
from tqdm import tqdm

## synthetic data(pkl파일)와 data(csv파일) 두 데이터셋에서 각 행을 비교해서 동일한 행이 몇 개인지 확인

## Dataframe

In [12]:

dtypes = {'age':'int','workclass':'category','education':'category','marital_status':'category','occupation':'category','relationship':'category','race':'category',
          'gender':'category','capital_gain':'int','capital_loss':'int','hours_per_week':'int','native_country':'category','income':'category'}
columns = list(dtypes.keys())
data = pd.read_csv('adultSalary.csv', names=columns).astype(dtypes)
data

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,Bachelors,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K
48838,64,?,HS-grad,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,<=50K
48839,38,Private,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K
48840,44,Private,Bachelors,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K


In [13]:
n = 20000
n = len(data)
data = data[:n]

### df_synthpop

In [14]:
# pkl 파일 경로를 지정합니다.
df_synthpop_pkl = '/home/casey/workspace/BigDataSystem/df_synthpop.pickle'
# 파일을 읽기 모드(rb)로 열어서 데이터를 로드합니다.
with open(df_synthpop_pkl, 'rb') as f:
    synth_data = pickle.load(f)
    
# 데이터를 확인합니다.
synth_data


Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,67,?,HS-grad,Married-civ-spouse,?,Wife,White,Female,0,0,4,United-States,<=50K
1,21,Private,Some-college,Never-married,Craft-repair,Own-child,White,Male,0,0,40,United-States,<=50K
2,55,Self-emp-not-inc,Bachelors,Married-civ-spouse,Sales,Husband,White,Male,0,0,30,United-States,<=50K
3,33,Private,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,65,United-States,>50K
4,53,Local-gov,11th,Married-civ-spouse,Other-service,Husband,White,Male,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,26,Private,10th,Never-married,Craft-repair,Other-relative,Black,Male,0,0,20,United-States,<=50K
48838,33,Self-emp-not-inc,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
48839,71,Self-emp-inc,Some-college,Widowed,Exec-managerial,Not-in-family,White,Female,0,0,24,United-States,<=50K
48840,25,Private,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K


In [15]:
synth_data = synth_data[:n-int(n*0.1)]
synth_data = pd.concat([data[:int(n*0.1)], synth_data])

In [None]:
stime = time.time()
data_list = []
synth_data_list = []
for i in range(len(synth_data)):
    data_list.append(data.iloc[i, :].values.flatten().tolist())
    synth_data_list.append(synth_data.iloc[i, :].values.flatten().tolist())

cnt = 0
for row1 in tqdm(synth_data_list):
    for row2 in data_list:
        if row1 == row2:
            cnt += 1
            break
print("List 방식 시간:", time.time() - stime)   # 결과 151.6037793159485
print("중복률:", cnt / len(synth_data))        # 결과 0.4122886040702674


100%|██████████| 48842/48842 [02:29<00:00, 327.24it/s]

151.6037793159485
0.4122886040702674





### list로 바꿔서 해보기
- 이렇게 하면 원하는 값이 있는지 확인할 수 있음
- DataFrame과 List 속도 차이 : List, array로 바꿔서 사용하면 더 빠름

- numpy : 안에서 C, 포트란?? 으로 돌아서.빠름
- list
- 1) 문제 해결 시간 단축 노력 해보기~
- 2) cython? 은 어려움

### Numpy

In [None]:
stime = time.time()
a = data.to_numpy()
b = synth_data.to_numpy()

cnt = sum([np.any(np.all(row == a, axis=1)) for row in b])

print("Numpy 방식 시간:", time.time() - stime)
print("중복률:", cnt / len(b))

### Cupy

In [None]:
import cupy as cp

# 데이터 CuPy 배열로 변환
stime = time.time()
a_gpu = cp.asarray(data.to_numpy())
b_gpu = cp.asarray(synth_data.to_numpy())

cnt = 0
for i in range(b_gpu.shape[0]):
    if cp.any(cp.all(b_gpu[i] == a_gpu, axis=1)):
        cnt += 1

etime = time.time()

print("CuPy(GPU) 방식 시간:", etime - stime)
print("중복률:", cnt / b_gpu.shape[0])
