In [14]:
import pandas as pd
import os
import sys
import json
import requests
import gc
from tqdm import tqdm
from datetime import datetime, timedelta
import datetime as dt
import openpyxl
import re
import datetime as dt
import shutil


import warnings
warnings.filterwarnings(action='ignore')

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Point directly at the Malgun Gothic font file (Windows font directory).
font_path = 'C:/Windows/Fonts/malgun.ttf'
font_prop = fm.FontProperties(size=12, fname=font_path)

# Global plot settings: render text with Malgun Gothic and keep the minus
# sign from showing up as a broken glyph.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

In [29]:
import os
import shutil
import pandas as pd
from tqdm import tqdm

def get_merge(folder_path):
    """Read every ``.parquet`` file in a folder and stack them into one DataFrame.

    Parameters
    ----------
    folder_path : str or path-like
        Directory to scan (non-recursively) for file names ending in
        ``.parquet``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files, taken in sorted file-name order,
        with a fresh ``0..n-1`` index.

    Raises
    ------
    ValueError
        If the folder contains no ``.parquet`` files.
    """
    # os.listdir returns entries in arbitrary order; sort so the row order of
    # the result is reproducible from run to run.
    files = sorted(f for f in os.listdir(folder_path) if f.endswith('.parquet'))

    # pd.concat([]) fails with an opaque "No objects to concatenate"; fail
    # early with a message that names the offending folder instead.
    if not files:
        raise ValueError(f"No .parquet files found in {folder_path!r}")

    dataframes = [
        pd.read_parquet(os.path.join(folder_path, file))
        for file in tqdm(files, desc="Reading Parquet files", unit="file")
    ]

    return pd.concat(dataframes, ignore_index=True)

def merge_orders(concat_orders, concat_cancels):
    """Return the orders that have no matching record in the cancels report.

    The cancels frame is re-keyed before matching: its own ``date`` column is
    discarded and ``orderDate`` is used as ``date`` instead. Orders and
    cancels are then left-joined on the key columns listed in ``standard``,
    and only the rows found exclusively in orders are kept.

    Parameters
    ----------
    concat_orders : pandas.DataFrame
        Orders report; must contain every column in ``standard``.
    concat_cancels : pandas.DataFrame
        Cancels report; must contain ``date``, ``orderDate`` and the other
        key columns. It is NOT modified by this function.

    Returns
    -------
    pandas.DataFrame
        The non-cancelled orders, restricted to the columns that came from
        ``concat_orders``. Non-key columns that also exist in the cancels
        frame carry pandas' ``_x`` suffix (e.g. ``productName_x``). The index
        is that of the merged frame and may therefore have gaps.
    """
    standard = ['date', 'trackingCode', 'subId', 'subParam', 'addtag', 'ctag', 'orderId', 'productId', 'click_time']

    # Build a re-keyed copy rather than dropping/renaming in place: mutating
    # the caller's frame made a second call on the same data fail.
    cancels_keyed = (
        concat_cancels
        .drop(columns=['date'])
        .rename(columns={'orderDate': 'date'})
    )

    # Left join with an indicator column so each order row is tagged as
    # 'left_only' (never cancelled) or 'both' (has a cancel record).
    merged_df = pd.merge(concat_orders, cancels_keyed, on=standard, how='left', indicator=True)

    only_in_order = merged_df[merged_df['_merge'] == 'left_only']
    both_in = merged_df[merged_df['_merge'] == 'both']

    # Printing lengths for verification. The merged length can exceed the
    # order length when one order matches several cancel rows.
    print(f"Original Order Length: {len(concat_orders)}")
    print(f"Merged DataFrame Length: {len(merged_df)}")
    print(f"Only in Order DataFrame Length: {len(only_in_order)}")
    print(f"Both in DataFrames Length: {len(both_in)}")

    orders_cleaned = only_in_order.drop(columns=['_merge'])
    # A merge lays out the left frame's columns first, so keeping that many
    # leading columns drops the cancel-side (``_y``) columns for any orders
    # schema, not just one with exactly 14 columns.
    orders_cleaned = orders_cleaned.iloc[:, :concat_orders.shape[1]]

    print(f"Length after removing records present in cancels: {len(orders_cleaned)}")

    print("Cleaned Orders DataFrame:")
    return orders_cleaned

def archive_files(files, src_folder, dst_folder):
    """Move the ``.parquet`` files named in ``files`` from one folder to another.

    Parameters
    ----------
    files : iterable of str
        File names (not full paths) relative to ``src_folder``. Names that do
        not end in ``.parquet`` are skipped.
    src_folder : str
        Folder the files currently live in.
    dst_folder : str
        Folder to move the files into; created if it does not exist yet.

    Returns
    -------
    None
        A line is printed for every file that was moved.
    """
    # shutil.move raises when the destination directory is missing, so make
    # sure the archive folder exists before moving anything.
    os.makedirs(dst_folder, exist_ok=True)

    for file in tqdm(files, desc="Moving Parquet files", unit="file"):
        if not file.endswith('.parquet'):
            continue
        shutil.move(os.path.join(src_folder, file), os.path.join(dst_folder, file))
        print(f"Moved {file} to {dst_folder}")

def get_tg_folder(key, root=r"C:\FTC_downloads\code\git\project_hct\result\cp"):
    """Merge, clean, export and archive the order/cancel reports of one key.

    Under ``<root>/<key>`` this reads all parquet files from ``orders`` and
    ``cancels``, removes the cancelled orders, writes the result as CSV and
    Excel into ``results``, and finally moves the input parquet files into
    ``archived_orders`` / ``archived_cancels``.

    Parameters
    ----------
    key : str
        Sub-folder name under ``root`` (the notebook calls this with
        ``"dynamic"`` and ``"reco"``); also used as the output file prefix.
    root : str, optional
        Base folder holding one sub-folder per key. Defaults to the path the
        notebook was originally written against.

    Returns
    -------
    None
        Output file names follow
        ``<key>_<min date>_<max date>_cleanedAndMerged.{csv,xlsx}``.
    """
    base_folder = os.path.join(root, key)
    orders_folder = os.path.join(base_folder, "orders")
    cancels_folder = os.path.join(base_folder, "cancels")
    orders_to_folder = os.path.join(base_folder, "archived_orders")
    cancels_to_folder = os.path.join(base_folder, "archived_cancels")
    results_to_folder = os.path.join(base_folder, "results")

    # Snapshot the file lists before reading, so a file that shows up while
    # the merge is running is not archived without having been merged.
    order_files = os.listdir(orders_folder)
    cancel_files = os.listdir(cancels_folder)

    concat_orders = get_merge(orders_folder)
    concat_cancels = get_merge(cancels_folder)

    orders_cleaned = merge_orders(concat_orders, concat_cancels)

    # The covered date range becomes part of the output file names.
    start_date = orders_cleaned.date.min()
    end_date = orders_cleaned.date.max()

    # to_csv / to_excel do not create missing parent folders.
    os.makedirs(results_to_folder, exist_ok=True)
    output_stem = os.path.join(results_to_folder, f"{key}_{start_date}_{end_date}_cleanedAndMerged")

    # utf-8-sig adds a BOM to the CSV.
    orders_cleaned.to_csv(output_stem + ".csv", encoding='utf-8-sig')
    orders_cleaned.to_excel(output_stem + ".xlsx", engine="openpyxl")

    # Archive the inputs only after both exports succeeded.
    archive_files(order_files, orders_folder, orders_to_folder)
    archive_files(cancel_files, cancels_folder, cancels_to_folder)

# Example usage: run the full merge/export/archive pipeline once per key.
for target_key in ("dynamic", "reco"):
    get_tg_folder(target_key)


Reading Parquet files: 100%|██████████| 3/3 [00:00<00:00, 41.92file/s]
Reading Parquet files: 100%|██████████| 3/3 [00:00<00:00, 200.65file/s]


Original Order Length: 90820
Merged DataFrame Length: 90826
Only in Order DataFrame Length: 83394
Both in DataFrames Length: 7432
Length after removing records present in cancels: 83394
Cleaned Orders DataFrame:


Moving Parquet files: 100%|██████████| 3/3 [00:00<00:00, 1505.31file/s]


Moved cpDynamic_orders_20240501_20240515.parquet to C:\FTC_downloads\code\git\project_hct\result\cp\dynamic\archived_orders
Moved cpDynamic_orders_20240516_20240531.parquet to C:\FTC_downloads\code\git\project_hct\result\cp\dynamic\archived_orders
Moved cpDynamic_orders_20240601_20240617.parquet to C:\FTC_downloads\code\git\project_hct\result\cp\dynamic\archived_orders


Moving Parquet files: 100%|██████████| 3/3 [00:00<00:00, 3010.27file/s]


Moved cpDynamic_cancels_20240501_20240515.parquet to C:\FTC_downloads\code\git\project_hct\result\cp\dynamic\archived_cancels
Moved cpDynamic_cancels_20240516_20240531.parquet to C:\FTC_downloads\code\git\project_hct\result\cp\dynamic\archived_cancels
Moved cpDynamic_cancels_20240601_20240617.parquet to C:\FTC_downloads\code\git\project_hct\result\cp\dynamic\archived_cancels


Reading Parquet files: 100%|██████████| 3/3 [00:00<00:00, 42.83file/s]
Reading Parquet files: 100%|██████████| 3/3 [00:00<00:00, 232.05file/s]


Original Order Length: 126816
Merged DataFrame Length: 126828
Only in Order DataFrame Length: 117208
Both in DataFrames Length: 9620
Length after removing records present in cancels: 117208
Cleaned Orders DataFrame:


Moving Parquet files: 100%|██████████| 3/3 [00:00<00:00, 920.01file/s]


Moved cpReco_orders_20240501_20240515.parquet to C:\FTC_downloads\code\git\project_hct\result\cp\reco\archived_orders
Moved cpReco_orders_20240516_20240531.parquet to C:\FTC_downloads\code\git\project_hct\result\cp\reco\archived_orders
Moved cpReco_orders_20240601_20240617.parquet to C:\FTC_downloads\code\git\project_hct\result\cp\reco\archived_orders


Moving Parquet files: 100%|██████████| 3/3 [00:00<00:00, 1504.23file/s]

Moved cpReco_cancels_20240501_20240515.parquet to C:\FTC_downloads\code\git\project_hct\result\cp\reco\archived_cancels
Moved cpReco_cancels_20240516_20240531.parquet to C:\FTC_downloads\code\git\project_hct\result\cp\reco\archived_cancels
Moved cpReco_cancels_20240601_20240617.parquet to C:\FTC_downloads\code\git\project_hct\result\cp\reco\archived_cancels





---

위 셀(In [29])의 함수들은 아래 셀들의 탐색용 코드를 정리한 것입니다.

In [40]:
# Load the orders report for the 20240605-20240611 window.
report_type = 'orders'
start_date, end_date = "20240605", "20240611"

orders = pd.read_parquet(
    f'../result/cp2_{report_type}_{start_date}_{end_date}.parquet'
)

In [41]:
# Load the cancels report for the same 20240605-20240611 window.
report_type = 'cancels'
start_date, end_date = "20240605", "20240611"

cancels = pd.read_parquet(
    f'../result/cp2_{report_type}_{start_date}_{end_date}.parquet'
)

In [42]:
# Re-key cancels on the order date: discard the cancels frame's own `date`
# column and use `orderDate` as `date`, so it lines up with the orders keys.
# Guarded on `orderDate` so that re-running this cell is a no-op; the
# unguarded in-place version dropped the already renamed `date` key on a
# second run and broke the merge below.
if 'orderDate' in cancels.columns:
    cancels = cancels.drop(columns=['date']).rename(columns={'orderDate': 'date'})

# Key columns used to match an order with its cancel record.
standard = ['date','trackingCode','subId','subParam','addtag','ctag','orderId','productId','click_time']

In [43]:
# Left-join orders onto cancels over the key columns; the `_merge` indicator
# records whether each row matched a cancel record.
merged_df = orders.merge(cancels, how='left', on=standard, indicator=True)

# Split the joined rows by match status.
only_in_order = merged_df.loc[merged_df['_merge'].eq('left_only')]
both_in = merged_df.loc[merged_df['_merge'].eq('both')]

# Row counts for a quick sanity check.
print(
    f"Original Order Length: {len(orders)}",
    f"Merged DataFrame Length: {len(merged_df)}",
    f"Only in Order DataFrame Length: {len(only_in_order)}",
    f"Both in DataFrames Length: {len(both_in)}",
    sep="\n",
)

# Orders without a cancel record are the cleaned result; the indicator column
# has served its purpose and is removed.
orders_cleaned = only_in_order.drop('_merge', axis=1)

print(f"Length after removing records present in cancels: {len(orders_cleaned)}")

# Show the cleaned orders as the cell output.
print("Cleaned Orders DataFrame:")
orders_cleaned


Original Order Length: 13948
Merged DataFrame Length: 13949
Only in Order DataFrame Length: 12986
Both in DataFrames Length: 963
Length after removing records present in cancels: 12986
Cleaned Orders DataFrame:


Unnamed: 0,date,trackingCode,subId,subParam,addtag,ctag,orderId,productId,productName_x,quantity_x,gmv_x,commissionRate_x,commission_x,click_time,productName_y,quantity_y,gmv_y,commissionRate_y,commission_y
0,20240605,AF3155932,homecoin,04e707e1-1838-49ae-8bb5-c2323d533f59,460,7655477808,8694951045273446,7260513907,AHC 마스터즈 프로 패치 8g + 선크림 SPF50+ PA++++ 1.5ml 4세트,2,19800.0,4.5,891.0,2024-06-04 20:50,,,,,
2,20240605,AF3155932,homecoin,dc07099d-c1e8-4e19-af46-051db1d3c71b,460,7764323341,8644954645994698,2006473477,캐럿 남성용 슬림핏 드레스 셔츠,2,28420.0,4.5,1279.0,2024-06-04 23:52,,,,,
3,20240605,AF3155932,homecoin,9e969158-9f55-4166-8820-2a292a564c2e,400,6714055704,8844956344521020,7289736989,필립스 클래식 미니 건식 다리미 휴대용,1,17210.0,4.5,775.0,2024-06-04 23:17,,,,,
4,20240605,AF3155932,homecoin,2aaf9740-cd24-4ca2-b87c-a8e0cea92c33,460,7308220678,8084955245033329,8008334616,남성 카라 반팔티 1+1 면 스판 반팔티 7색상,1,32900.0,4.5,1481.0,2024-06-05 18:26,,,,,
5,20240605,AF3155932,homecoin,573bac32-41d8-4a4e-b6dc-e202c80c254d,460,8110668107,8674950643124622,7798071494,라비킷 쏙터블 캡슐 표백제 30p,1,18900.0,4.5,851.0,2024-06-04 19:02,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13944,20240611,AF3155932,homecoin,1e7ba297-85eb-4e7d-b2a8-e2ee976fb53c,460,7262459380,8944950143591863,7844797625,[동아] 샤프심 502 골드 XQ 0.5 mm (12개1세트),1,9900.0,4.5,446.0,2024-06-10 22:34,,,,,
13945,20240611,AF3155932,homecoin,ea9a4d84-316a-4b2c-90f0-ec02b1e1efd0,460,7038366091,8354954449555324,1736237896,핑크퐁 여아용 삼각팬티 5종 세트,1,10860.0,4.5,489.0,2024-06-11 22:05,,,,,
13946,20240611,AF3155932,homecoin,3a4a0c5f-9510-4c8c-bc15-1d6608237915,400,7657707867,8084955649974040,7279222039,[1+1] 스타일아유 여성용 프린팅 DREAM 빅레터링 오버핏 반팔 티셔츠[AYG6...,1,14900.0,4.5,671.0,2024-06-10 23:23,,,,,
13947,20240611,AF3155932,homecoin,cdd5ba7d-9e67-421e-93eb-99d6051b38a3,460,7633172276,8214950449665225,6553192715,글루타치온 영양제 대용량 1000mg,1,13500.0,4.5,608.0,2024-06-11 06:20,,,,,


In [66]:
# Keep only the columns that came from `orders`: the merge places the left
# frame's columns first, so this drops the cancel-side (`_y`) columns without
# hard-coding the column count.
orders_cleaned = orders_cleaned.iloc[:, :len(orders.columns)]

# utf-8-sig writes a BOM so Excel detects the encoding of the Korean product
# names; this also matches the encoding used by get_tg_folder.
orders_cleaned.to_csv(rf'C:\Users\Owner\Documents\{start_date}_{end_date}_coupang_dynamic.csv',encoding='utf-8-sig')

In [1]:
# Export the cleaned orders to Excel without the index column. Requires
# `orders_cleaned`, `start_date` and `end_date` from the cells above, so run
# those first on a fresh kernel.
orders_cleaned.to_excel(
    rf'C:\Users\Owner\Documents\{start_date}_{end_date}_coupang_dynamic.xlsx',
    index=False,
    engine='openpyxl',
)

NameError: name 'orders_cleaned' is not defined