In [1]:
import pandas as pd
import os
from glob import glob
import platform
import matplotlib.pyplot as plt
import koreanize_matplotlib
import matplotlib.font_manager as fm
import seaborn as sns
from functools import reduce
import ast

# Draw negative tick labels with a plain hyphen so the minus sign does not render as a broken glyph.
plt.rc('axes', unicode_minus=False)

# pandas display settings, applied through the options namespace.
# Floats are printed with thousands separators and no decimals instead of scientific notation.
pd.options.display.float_format = '{:,.0f}'.format

pd.options.display.max_columns = 100     # show up to 100 columns before truncating
pd.options.display.max_rows = None       # never truncate rows
pd.options.display.max_seq_items = None  # never truncate long sequences
pd.options.display.max_colwidth = None   # never truncate wide cell contents

In [2]:
# Load the five activity tables in one pass; each target is annotated with the question it should answer.
(
    accounts_attendance_df,      # does the attendance rate differ with the number of friends?
    accounts_friendrequest_df,   # how many friend requests were sent / received?
    accounts_paymenthistory_df,  # how much did each user pay?
    accounts_timelinereport_df,  # does the report rate depend on the number of friends?
    accounts_user_contacts_df,   # how many invitations were sent?
) = map(pd.read_parquet, (
    './data/votes/accounts_attendance.parquet',
    './data/votes/accounts_friendrequest.parquet',
    './data/votes/accounts_paymenthistory.parquet',
    './data/votes/accounts_timelinereport.parquet',
    './data/votes/accounts_user_contacts.parquet',
))

In [3]:
# Load the user master table, the question records and the event receipts.
(
    accounts_user_df,                # its `id` column matches `user_id` in the other tables
    accounts_userquestionrecord_df,  # NOTE(review): original note repeats the claim above - likely copy-paste, confirm
    event_receipts_df,               # used to check whether users joined an event more than once
) = map(pd.read_parquet, (
    './data/votes/accounts_user.parquet',
    './data/votes/accounts_userquestionrecord.parquet',
    './data/votes/event_receipts.parquet',
))

In [4]:
# Inspect row count, column dtypes, non-null counts and memory use of the event receipts table.
event_receipts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          309 non-null    int64         
 1   created_at  309 non-null    datetime64[ns]
 2   event_id    309 non-null    int64         
 3   user_id     309 non-null    int64         
 4   plus_point  309 non-null    int64         
dtypes: datetime64[ns](1), int64(4)
memory usage: 12.2 KB


In [5]:
# Preview the first five event receipt rows.
event_receipts_df.head()

Unnamed: 0,id,created_at,event_id,user_id,plus_point
0,2,2023-06-22 09:25:16,1,1193618,500
1,3,2023-06-22 09:38:53,1,928351,500
2,4,2023-06-22 10:32:15,1,904872,500
3,5,2023-06-22 13:03:06,1,974697,500
4,6,2023-06-22 13:40:38,1,1168260,500


In [6]:
# Count distinct users in the receipts; a value below the row count means some user appears more than once.
event_receipts_df['user_id'].nunique()

308

In [7]:
# List every receipt whose user_id occurs more than once (keep=False flags all occurrences, not just the later ones).
event_receipts_df[event_receipts_df.duplicated('user_id', keep=False)]
# Remember user_id = 1577954.

Unnamed: 0,id,created_at,event_id,user_id,plus_point
75,80,2023-07-10 16:51:17,1,1577954,500
76,81,2023-07-10 16:57:04,1,1577954,500


---
---
## 1. accounts_attendance_df, accounts_friendrequest_df, accounts_paymenthistory_df, accounts_timelinereport_df, accounts_user_contacts_df merge하기

In [8]:
# Collect the five activity tables under their variable names for a quick preview before merging.
dfs = dict(
    accounts_attendance_df=accounts_attendance_df,
    accounts_friendrequest_df=accounts_friendrequest_df,
    accounts_paymenthistory_df=accounts_paymenthistory_df,
    accounts_timelinereport_df=accounts_timelinereport_df,
    accounts_user_contacts_df=accounts_user_contacts_df,
)

# For each table print its name and shape, then render its first two rows.
for name in dfs:
    df = dfs[name]
    print("\n📌 {} (shape: {})".format(name, df.shape))
    display(df.head(2))


📌 accounts_attendance_df (shape: (349637, 3))


Unnamed: 0,id,attendance_date_list,user_id
0,1,"[""2023-05-27"", ""2023-05-28"", ""2023-05-29"", ""2023-05-30"", ""2023-06-03"", ""2023-06-06"", ""2023-06-12"", ""2023-06-15"", ""2023-07-10"", ""2023-07-31"", ""2023-09-12"", ""2023-09-14"", ""2023-09-19""]",1446852
1,2,"[""2023-05-27"", ""2023-05-29"", ""2023-05-30"", ""2023-06-02"", ""2023-06-03"", ""2023-06-05"", ""2023-06-07"", ""2023-06-08"", ""2023-06-10"", ""2023-06-11"", ""2023-06-15"", ""2023-06-16"", ""2023-06-17"", ""2023-06-18"", ""2023-06-19"", ""2023-06-20"", ""2023-06-21"", ""2023-06-22"", ""2023-06-23"", ""2023-06-27"", ""2023-07-01"", ""2023-07-04"", ""2023-07-08"", ""2023-07-10"", ""2023-07-15"", ""2023-07-26"", ""2023-08-01"", ""2023-08-02"", ""2023-08-03"", ""2023-08-05"", ""2023-08-14"", ""2023-08-21"", ""2023-08-22""]",1359398



📌 accounts_friendrequest_df (shape: (17147175, 6))


Unnamed: 0,id,status,created_at,updated_at,receive_user_id,send_user_id
0,7,P,2023-04-17 18:29:11,2023-04-17 18:29:11,831962,837521
1,10,A,2023-04-17 18:29:11,2023-04-22 06:02:53,832151,837521



📌 accounts_paymenthistory_df (shape: (95140, 5))


Unnamed: 0,id,productId,phone_type,created_at,user_id
0,6,heart.777,A,2023-05-13 21:28:34,1211127
1,7,heart.777,A,2023-05-13 21:29:39,1151343



📌 accounts_timelinereport_df (shape: (208, 6))


Unnamed: 0,id,reason,created_at,reported_user_id,user_id,user_question_record_id
0,28,타인을 사칭함,2023-05-06 04:44:57,874587,885082,3920588
1,37,친구를 비하하거나 조롱하는 어투,2023-05-06 05:41:19,881048,881298,4018679



📌 accounts_user_contacts_df (shape: (5063, 4))


Unnamed: 0,id,contacts_count,invite_user_id_list,user_id
0,259,30,[],1167696
1,1756,79,[],863169


In [9]:
# Turn each stringified date list into the number of attendance days, then drop the raw list.
# The guard makes this cell idempotent: dropping the column in place meant that a second run
# raised KeyError on 'attendance_date_list'. Re-running is now a no-op once the column is gone.
if 'attendance_date_list' in accounts_attendance_df.columns:
    accounts_attendance_df = (
        accounts_attendance_df
        .assign(
            attendance_count=lambda frame: frame['attendance_date_list'].apply(
                # literal_eval parses the list text; missing entries count as zero days
                lambda raw: len(ast.literal_eval(raw)) if pd.notnull(raw) else 0
            )
        )
        .drop(columns=['attendance_date_list'])
    )

In [10]:
# Treat the sender of a friend request as the acting user so every table shares a `user_id` key.
# rename() skips labels that are absent, so running this cell again leaves the frame unchanged.
accounts_friendrequest_df.rename(columns={"send_user_id": "user_id"}, inplace=True)


def _outer_join_on_user(left, right):
    """Outer-join two frames on their shared `user_id` column."""
    return pd.merge(left, right, on='user_id', how='outer')


# Each table's own `id` is only a per-table row identifier, so it is left out of the join.
dfs = [
    table.drop(columns=['id'], errors='ignore')
    for table in (
        accounts_attendance_df,
        accounts_friendrequest_df,
        accounts_paymenthistory_df,
        accounts_timelinereport_df,
        accounts_user_contacts_df,
    )
]

# Fold the tables together from left to right.
# NOTE(review): `user_id` is not unique in at least the friend-request table, so these joins are
# many-to-many and repeat rows per user. Only duplication-insensitive aggregates (max, nunique)
# are safe on merged_df; counts and sums would be inflated.
# The three colliding `created_at` columns come out as created_at_x / created_at_y / created_at.
merged_df = reduce(_outer_join_on_user, dfs)

In [11]:
# Check the merged frame's row count, column dtypes and memory footprint.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18290273 entries, 0 to 18290272
Data columns (total 15 columns):
 #   Column                   Dtype         
---  ------                   -----         
 0   user_id                  int64         
 1   attendance_count         float64       
 2   status                   object        
 3   created_at_x             datetime64[ns]
 4   updated_at               datetime64[ns]
 5   receive_user_id          float64       
 6   productId                object        
 7   phone_type               object        
 8   created_at_y             datetime64[ns]
 9   reason                   object        
 10  created_at               datetime64[ns]
 11  reported_user_id         float64       
 12  user_question_record_id  float64       
 13  contacts_count           float64       
 14  invite_user_id_list      object        
dtypes: datetime64[ns](4), float64(5), int64(1), object(5)
memory usage: 2.0+ GB


In [21]:
# Preview the first five rows of the merged frame.
merged_df.head()

Unnamed: 0,user_id,attendance_count,status,created_at_x,updated_at,receive_user_id,productId,phone_type,created_at_y,reason,created_at,reported_user_id,user_question_record_id,contacts_count,invite_user_id_list
0,1446852,13,A,2023-06-20 12:17:11,2023-06-20 12:17:20,838541,heart.777,I,2023-09-13 16:36:26,,NaT,,,,
1,1446852,13,A,2023-06-20 12:17:11,2023-06-20 12:17:20,838541,heart.777,I,2023-09-13 16:36:26,,NaT,,,,
2,1446852,13,A,2023-06-20 12:17:11,2023-06-20 12:17:20,838541,heart.777,I,2023-09-13 16:36:26,,NaT,,,,
3,1446852,13,A,2023-06-20 12:17:11,2023-06-20 12:17:20,838541,heart.777,I,2023-09-13 16:36:26,,NaT,,,,
4,1446852,13,A,2023-06-20 12:17:11,2023-06-20 12:17:20,838541,heart.777,I,2023-09-13 16:36:26,,NaT,,,,


In [19]:
# Collapse merged_df to one row per user.
# Named aggregation produces the final column names directly, so no in-place rename is needed,
# and the built-in 'nunique' avoids one Python-level lambda call per group on a very large frame.
# Both max and nunique ignore NaN and are unaffected by repeated rows in merged_df.
user_summary = (
    merged_df
    .groupby('user_id')
    .agg(
        attendance_count=('attendance_count', 'max'),           # attendance days recorded for the user
        unique_friends_sent=('receive_user_id', 'nunique'),     # distinct users this user sent a friend request to
        unique_products_bought=('productId', 'nunique'),        # distinct product ids purchased
        unique_users_reported=('reported_user_id', 'nunique'),  # distinct values of reported_user_id
        max_contacts_sent=('contacts_count', 'max'),            # largest contacts_count seen for the user
    )
    .reset_index()
)

In [20]:
# Preview the first five rows of the per-user summary.
user_summary.head()

Unnamed: 0,user_id,attendance_count,unique_friends_sent,unique_products_bought,unique_users_reported,max_contacts_sent
0,831962,,1,0,0,
1,832151,1.0,10,0,0,
2,832340,2.0,26,0,0,
3,832986,1.0,0,0,0,
4,833024,,2,0,0,
