## 데이터 수집

In [46]:
from google.cloud import bigquery
from google.cloud.bigquery import job
from google.cloud.bigquery import SchemaField
import pandas as pd
import datetime as dt
import os
from datetime import datetime
pd.options.display.max_columns = None

PROJECT = 'ballosodeuk'
client = bigquery.Client(project=PROJECT)

def estimate_query_cost(client, query):
    job_config = bigquery.QueryJobConfig(
        dry_run=True,
        use_query_cache=False
    )
    query_job = client.query(query, job_config=job_config)
    bytes_processed = query_job.total_bytes_processed
    kb = round(bytes_processed / 1024, 2)
    gb = round(bytes_processed / (1024 ** 3), 2)
    print("예상 쿼리 용량: {} KB, ({} GB)".format(kb, gb))

def run_query(client, query):
    job_config = bigquery.QueryJobConfig()
    query_job = client.query(query, job_config=job_config)
    return query_job

In [30]:
query_name = "1009_rfm_branch_1"
file = f'../query/{query_name}.sql'
with open(file, 'r') as file:
    query = file.read()

# 쿼리가 비어있지 않은지 확인
if query.strip():
    try:
        estimate_query_cost(client, query)
    except Exception as e:
        print(f"쿼리 비용 추정 중 오류 발생: {str(e)}")
        print("쿼리의 처음 500자:")
        print(query[:500])
else:
    print("쿼리가 비어 있습니다. SQL 파일의 내용을 확인해 주세요.")

예상 쿼리 용량: 137565684.8 KB, (131.19 GB)


- 총 쿼리 한번에 돌리기

In [31]:
df = run_query(client,query).to_dataframe()



In [37]:
for col in df.columns:
    if df[col].dtype == 'dbdate':
        df[col] = df[col].astype(str)

# Parquet 파일로 저장
df.to_parquet('./0301_1007.parquet')

In [41]:
pd.read_parquet('./0301_1007.parquet')


Unnamed: 0,User_ID,Platform,Reg_Dttm,Cum_Lifetime,Cum_Cost_Recent,Cum_Cost_min,Avg_Daily_Cost_Period,Avg_Daily_Cost_Lifetime,Monatary_Offerwall,Monatary_Network,...,Cost_친구초대입력,Freq_친구초대입력,Cost_아바티,Freq_아바티,Cost_박터뜨리기,Freq_박터뜨리기,Cost_핀크럭스,Freq_핀크럭스,Cost_지원금보상,Freq_지원금보상
0,2788409f-6cef-419f-b3ec-e2600c93e73a,iOS,2023-10-07,367,64072.0,42278.0,99.06,174.58,4287.45,1548.0,...,,,,,-162.0,8,,,,
1,76889522-8247-4280-956a-74a46a414e0d,Android,2023-06-30,466,58084.0,48249.0,44.70,124.64,0.00,223.2,...,,,,,,,,,,
2,010f343b-6b7b-420e-bf33-412a75aa9ece,iOS,2023-11-15,328,2955.0,2074.0,4.00,9.01,0.00,288.0,...,,,,,,,,,,
3,e4f93df3-b168-4c71-a794-6252f35ec284,Android,2024-03-21,201,72624.0,5760.0,303.93,361.31,1.73,244.8,...,,,,,-7159.0,583,,,,
4,18be887b-557b-46ed-a31d-b6e5c430fb2d,Android,2023-09-18,386,70136.0,53166.0,77.14,181.70,9.15,1159.2,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56250,a33f7406-21ab-4ec3-9b55-465668f13c21,,NaT,,,,0.00,0.00,0.00,0.0,...,,,,,,,,,,
56251,ad3a0c70-d7a0-44a0-80e4-256831b17312,iOS,2023-03-28,560,13715.0,13096.0,2.81,24.49,0.00,36.0,...,,,,,,,,,,
56252,087eb567-d7e2-480d-9d66-579574627913,Android,2024-06-12,118,67.0,0.0,0.30,0.57,0.00,14.4,...,,,,,,,,,,
56253,434efa88-bb23-4446-b153-b99d4cdae9e9,iOS,2023-12-18,295,1.0,1.0,0.00,0.00,0.00,14.4,...,,,,,,,,,,


- 청크 단위로 쿼리 돌리기

In [None]:
import os
from datetime import datetime
from google.cloud import bigquery
import pandas as pd

def run_query_and_save_chunks(client, query, chunk_size=10000, output_folder='output'):
    os.makedirs(output_folder, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    query_job = client.query(query)
    
    chunk_files = []
    chunk_count = 0
    rows_iter = query_job.result(page_size=chunk_size)
    
    # 열 이름과 타입 가져오기
    schema = query_job.result().schema
    column_names = [field.name for field in schema]
    column_types = {field.name: field.field_type for field in schema}
    
    for page in rows_iter.pages:
        chunk_count += 1
        data = [list(row.values()) for row in page]
        
        # DataFrame 생성
        chunk_df = pd.DataFrame(data, columns=column_names)
        
        # 날짜 열 변환
        for col, col_type in column_types.items():
            if col_type == 'DATE' or col_type == 'DATETIME':
                chunk_df[col] = pd.to_datetime(chunk_df[col])
        
        filename = f"{output_folder}/chunk_{timestamp}_{chunk_count:04d}.parquet"
        chunk_df.to_parquet(filename, index=False)
        chunk_files.append(filename)
        
        print(f"청크 {chunk_count} 저장됨: {filename} (크기: {len(chunk_df)} 행)")
    
    print(f"총 {chunk_count}개의 청크가 저장되었습니다.")
    return chunk_files

# 쿼리 실행 및 청크 저장
chunk_files = run_query_and_save_chunks(client, query, chunk_size=4000, output_folder='coupang_data')

# 저장된 파일들을 하나의 DataFrame으로 읽기 (필요한 경우)
df = pd.concat([pd.read_parquet(file) for file in chunk_files], ignore_index=True)

In [38]:
import os
import pandas as pd

# 'coupang_data' 폴더 경로
folder_path = 'coupang_data'

# 폴더 내 모든 파일 목록 가져오기
files = os.listdir(folder_path)

# parquet 파일만 필터링
parquet_files = [f for f in files if f.endswith('.parquet')]

if parquet_files:
    # 모든 parquet 파일을 읽어서 리스트에 저장
    df_list = []
    for file in parquet_files:
        file_path = os.path.join(folder_path, file)
        df = pd.read_parquet(file_path)
        df_list.append(df)
        print(f"파일 '{file}' 읽기 완료 (크기: {df.shape})")
    
    # 모든 데이터프레임을 하나로 합치기
    combined_df = pd.concat(df_list, ignore_index=True)
    
    print(f"\n모든 파일을 성공적으로 읽고 합쳤습니다.")
    print(f"최종 데이터프레임 크기: {combined_df.shape}")
    print("\n최종 데이터프레임의 처음 몇 행:")
    print(combined_df.head())
    
    # 중복 행 확인 및 제거 (필요한 경우)
    duplicates = combined_df.duplicated().sum()
    if duplicates > 0:
        print(f"\n중복된 행 수: {duplicates}")
        combined_df = combined_df.drop_duplicates()
        print(f"중복 제거 후 데이터프레임 크기: {combined_df.shape}")
else:
    print("'coupang_data' 폴더에 parquet 파일이 없습니다.")

파일 'chunk_20241009_163742_0001.parquet' 읽기 완료 (크기: (4000, 79))
파일 'chunk_20241009_163742_0002.parquet' 읽기 완료 (크기: (4000, 79))
파일 'chunk_20241009_163742_0003.parquet' 읽기 완료 (크기: (4000, 79))
파일 'chunk_20241009_163742_0004.parquet' 읽기 완료 (크기: (4000, 79))
파일 'chunk_20241009_163742_0005.parquet' 읽기 완료 (크기: (4000, 79))
파일 'chunk_20241009_163742_0006.parquet' 읽기 완료 (크기: (4000, 79))
파일 'chunk_20241009_163742_0007.parquet' 읽기 완료 (크기: (4000, 79))
파일 'chunk_20241009_163742_0008.parquet' 읽기 완료 (크기: (4000, 79))
파일 'chunk_20241009_163742_0009.parquet' 읽기 완료 (크기: (4000, 79))
파일 'chunk_20241009_163742_0010.parquet' 읽기 완료 (크기: (4000, 79))
파일 'chunk_20241009_163742_0011.parquet' 읽기 완료 (크기: (4000, 79))
파일 'chunk_20241009_163742_0012.parquet' 읽기 완료 (크기: (4000, 79))
파일 'chunk_20241009_163742_0013.parquet' 읽기 완료 (크기: (4000, 79))
파일 'chunk_20241009_163742_0014.parquet' 읽기 완료 (크기: (4000, 79))
파일 'chunk_20241009_163742_0015.parquet' 읽기 완료 (크기: (255, 79))

모든 파일을 성공적으로 읽고 합쳤습니다.
최종 데이터프레임 크기: (56255, 79)

최종 데이

  combined_df = pd.concat(df_list, ignore_index=True)


In [39]:
combined_df

Unnamed: 0,User_ID,Platform,Reg_Dttm,Cum_Lifetime,Cum_Cost_Recent,Cum_Cost_min,Avg_Daily_Cost_Period,Avg_Daily_Cost_Lifetime,Monatary_Offerwall,Monatary_Network,...,Cost_친구초대입력,Freq_친구초대입력,Cost_아바티,Freq_아바티,Cost_박터뜨리기,Freq_박터뜨리기,Cost_핀크럭스,Freq_핀크럭스,Cost_지원금보상,Freq_지원금보상
0,505c1e87-2792-4101-9f0e-23f487036ea8,Android,2024-09-08,30.0,1345.0,443.0,4.10,44.83,0.00,50.4,...,,,,,,,,,,
1,22a69a1e-ae39-4ea2-96f1-12f1475fedb0,Android,2024-01-16,266.0,39543.0,19106.0,92.90,148.66,21.82,1202.4,...,,,,,,,,,,
2,27bc6c05-c1a0-4893-ac4e-aa355560c14d,Android,2023-06-30,466.0,37480.0,19978.0,79.55,80.43,1761.65,180.0,...,,,,,,,,,,
3,9efb1d4c-d3d1-48e7-a42d-a7069082fc6e,Android,2023-08-02,433.0,430543.0,314074.0,529.40,994.33,47.85,1764.0,...,,,,,-12202.0,906.0,,,,
4,189ab1dc-61aa-4f85-90d3-9a3b825926f9,Android,2023-12-27,286.0,152306.0,53117.0,450.86,532.54,263.08,871.2,...,,,,,-1047.0,84.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56250,d4ad6738-6312-43e2-b166-da4ff2465302,Android,2024-09-11,27.0,12.0,12.0,0.00,0.44,0.00,7.2,...,,,,,,,,,,
56251,f48a189c-768b-40f9-b1c9-0122572bee9f,Android,2024-09-23,15.0,1770.0,158.0,7.33,118.00,4.15,28.8,...,,,,,,,,,,
56252,5fefd989-978a-458c-be92-2ea88448bd55,Android,2024-09-09,29.0,0.0,0.0,0.00,0.00,0.00,7.2,...,,,,,,,,,,
56253,5988c312-35b2-4153-b47f-b4a8b6a4c431,Android,2023-10-14,360.0,169.0,19.0,0.68,0.47,0.00,14.4,...,,,,,,,,,,


> 결과적으로 둘 다 맞고 메모리 크게 부하 없었음.
아무거나 써도 됨

## EDA

In [48]:
df.fillna(0,inplace=True)
df.columns

Index(['User_ID', 'Platform', 'Reg_Dttm', 'Cum_Lifetime', 'Cum_Cost_Recent',
       'Cum_Cost_min', 'Avg_Daily_Cost_Period', 'Avg_Daily_Cost_Lifetime',
       'Monatary_Offerwall', 'Monatary_Network', 'Monatary_INL_Launcher',
       'Monatary_INL_Challenge', 'Monatary_INL_Quiz', 'Monatary_Dynamic',
       'Monatary_Total', 'Recency_Offerwall', 'Recency_Network',
       'Recency_INL_Launcher', 'Recency_INL_Challenge', 'Recency_INL_Quiz',
       'Recency_Dynamic', 'Recncy_Total', 'Recency_Total_NoAd',
       'Frequency_Offerwall', 'Frequency_Network', 'Frequency_INL_Launcher',
       'Frequency_INL_Challenge', 'Frequency_INL_Quiz', 'Frequency_Dynamic',
       'Frequency_Total', 'Frequency_Total_NoAd', 'Frequency_Offerwall_c',
       'Frequency_Network_c', 'Frequency_INL_Launcher_c',
       'Frequency_INL_Challenge_c', 'Frequency_INL_Quiz_c',
       'Frequency_Dynamic_c', 'Frequency_Total_c', 'Frequency_Total_NoAd_c',
       'Cost_기본걸음적립', 'Freq_기본걸음적립', 'Cost_버튼누르고바로지급', 'Freq_버튼누르고바로지급'

In [52]:
df[[
    'User_ID', 'Cum_Lifetime', 'Cum_Cost_Recent','Cum_Cost_min', 'Avg_Daily_Cost_Period', 'Avg_Daily_Cost_Lifetime',
    'Monatary_INL_Launcher','Monatary_INL_Challenge', 'Monatary_INL_Quiz', 'Monatary_Dynamic'
    ]]

Unnamed: 0,User_ID,Cum_Lifetime,Cum_Cost_Recent,Cum_Cost_min,Avg_Daily_Cost_Period,Avg_Daily_Cost_Lifetime,Monatary_INL_Launcher,Monatary_INL_Challenge,Monatary_INL_Quiz,Monatary_Dynamic
0,fe10b571-c968-48e4-9cde-2b5c24012e6a,508.0,53337.0,41413.0,54.20,104.99,0.0,0.0,0.0,934.0
1,2901471c-ab98-40c8-b889-3a70496a13a8,499.0,12387.0,10653.0,7.88,24.82,0.0,0.0,0.0,6169.0
2,325cc638-96d5-42e8-99cc-b3e507c9cabd,345.0,44681.0,32205.0,56.71,129.51,1370.0,0.0,0.0,0.0
3,3443a027-0fb8-48b8-88a0-22cc039a2fc3,362.0,42000.0,14638.0,124.37,116.02,9177.0,408.0,0.0,6692.0
4,c9544ae2-bffc-4abc-8b7e-a8865ea7c57c,603.0,56937.0,43618.0,60.54,94.42,0.0,0.0,0.0,788.0
...,...,...,...,...,...,...,...,...,...,...
250,d4ad6738-6312-43e2-b166-da4ff2465302,27.0,12.0,12.0,0.00,0.44,473.0,0.0,0.0,0.0
251,f48a189c-768b-40f9-b1c9-0122572bee9f,15.0,1770.0,158.0,7.33,118.00,0.0,0.0,0.0,662.0
252,5fefd989-978a-458c-be92-2ea88448bd55,29.0,0.0,0.0,0.00,0.00,0.0,0.0,1932.0,0.0
253,5988c312-35b2-4153-b47f-b4a8b6a4c431,360.0,169.0,19.0,0.68,0.47,0.0,0.0,0.0,1062.0


df.query("")