In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import matplotlib.font_manager as fm
import koreanize_matplotlib

# Korean font setup for matplotlib.
# Prefer the system-installed NanumGothic file, but only when it actually
# exists: building FontProperties from a missing file raises
# FileNotFoundError and would abort the very first cell on other machines.
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'  # expected install location
if os.path.exists(font_path):
    # Register the file explicitly so the family name resolves even when
    # matplotlib's font cache was built before the font was installed.
    fm.fontManager.addfont(font_path)
    font_name = fm.FontProperties(fname=font_path).get_name()
    plt.rc('font', family=font_name)
else:
    # Fall back to whatever family is already active.
    # NOTE(review): koreanize_matplotlib (imported above) presumably configured
    # a Korean-capable font on import — confirm on machines without NanumGothic.
    font_name = plt.rcParams['font.family'][0]

# Render the minus sign as ASCII '-' so it does not show up as a broken glyph
# with fonts that lack the Unicode minus.
plt.rcParams['axes.unicode_minus'] = False

In [2]:
# Service-account key for Google Cloud Storage access.
# setdefault keeps a credential that is already configured in the environment
# (CI, another developer's machine) instead of overwriting it with this
# machine-specific absolute path; when the variable is unset the behavior is
# the same as before.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    r"/home/eunyu/sprintda05-eunyu-a9946f7dea9c.json",
)

# Location of the payment-history Parquet file in GCS.
gcs_path = 'gs://final_project_enuyu/data/final_project/votes/accounts_paymenthistory.parquet'

# Load the file into a DataFrame and display it.
paymenthistory_df = pd.read_parquet(gcs_path, engine='pyarrow')
paymenthistory_df

Unnamed: 0,id,productId,phone_type,created_at,user_id
0,6,heart.777,A,2023-05-13 21:28:34,1211127
1,7,heart.777,A,2023-05-13 21:29:39,1151343
2,8,heart.777,A,2023-05-13 21:31:33,1002147
3,9,heart.777,A,2023-05-13 21:31:39,1095040
4,11,heart.777,A,2023-05-13 21:34:32,1164081
...,...,...,...,...,...
95135,98074,heart.777,I,2024-05-06 14:51:27,1086654
95136,98075,heart.1000,I,2024-05-06 16:06:30,871740
95137,98076,heart.1000,I,2024-05-08 06:57:23,1166361
95138,98077,heart.1000,I,2024-05-08 14:12:45,1163480


In [3]:
# Print column dtypes, non-null counts and memory usage of the loaded frame.
paymenthistory_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95140 entries, 0 to 95139
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          95140 non-null  int64         
 1   productId   95140 non-null  object        
 2   phone_type  95140 non-null  object        
 3   created_at  95140 non-null  datetime64[ns]
 4   user_id     95140 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 3.6+ MB


In [None]:
# Number of missing values in each column.
paymenthistory_df.isna().sum()

id            0
productId     0
phone_type    0
created_at    0
user_id       0
dtype: int64

In [5]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns (unique / top / freq) alongside the numeric and datetime ones.
paymenthistory_df.describe(include='all')

Unnamed: 0,id,productId,phone_type,created_at,user_id
count,95140.0,95140,95140,95140,95140.0
unique,,4,2,,
top,,heart.777,I,,
freq,,57873,61632,,
mean,49136.797572,,,2023-05-27 21:47:32.003741952,1256912.0
min,6.0,,,2023-05-13 21:28:34,833041.0
25%,24531.75,,,2023-05-16 11:48:44.750000128,1116654.0
50%,49176.5,,,2023-05-21 12:40:45.500000,1270988.0
75%,73740.25,,,2023-05-27 13:02:52.249999872,1417660.0
max,98078.0,,,2024-05-08 14:12:45,1583632.0


#### 상품별/폰타입별 실패율 비교

In [9]:
# Number of payment-history rows per product.
paymenthistory_df['productId'].value_counts()


productId
heart.777     57873
heart.1000    19309
heart.200     15822
heart.4000     2136
Name: count, dtype: int64

In [10]:
# Number of payment-history rows per phone type.
paymenthistory_df['phone_type'].value_counts()

phone_type
I    61632
A    33508
Name: count, dtype: int64

In [8]:
# Distinct users per phone type (each user_id counted once within a type).
unique_users_by_phone = (
    paymenthistory_df
    .groupby('phone_type')['user_id']
    .nunique()
)

# Share of each phone type, in percent.
# NOTE(review): the denominator is the sum of the per-type counts, so a user
# who appears under both phone types is counted once in each group — confirm
# this is the intended base for the ratio.
user_ratio_by_phone = unique_users_by_phone.div(unique_users_by_phone.sum()).mul(100)

# Report both tables: raw counts first, then the rounded percentages.
print("폰 타입별 고유 유저 수:", unique_users_by_phone, sep="\n")
print("\n폰 타입별 유저 비율 (%):", user_ratio_by_phone.round(2), sep="\n")

폰 타입별 고유 유저 수:
phone_type
A    21945
I    37303
Name: user_id, dtype: int64

폰 타입별 유저 비율 (%):
phone_type
A    37.04
I    62.96
Name: user_id, dtype: float64


In [7]:
# Distribution of rows per user: the inner value_counts gives the number of
# rows for each user_id, the outer one counts how many users share each row
# count, and sort_index orders the result by that row count.
paymenthistory_df['user_id'].value_counts().value_counts().sort_index()

count
1     43049
2      8582
3      3423
4      1664
5       920
6       512
7       302
8       211
9       146
10      100
11       55
12       67
13       30
14       20
15       23
16       18
17       13
18       13
19       10
20        6
21        7
22        4
23        4
24        4
25        2
27        2
30        1
34        1
35        1
51        1
60        1
Name: count, dtype: int64

In [12]:
# How many rows each (user, product) pair has, i.e. how many times the same
# user attempted the same product. One output row per pair, with the count
# stored in the 'attempts' column.
user_product_attempts = (
    paymenthistory_df
    .groupby(['user_id', 'productId'])
    .size()
    .rename('attempts')
    .reset_index()
)
user_product_attempts

Unnamed: 0,user_id,productId,attempts
0,833041,heart.4000,2
1,834112,heart.777,1
2,835815,heart.200,2
3,835815,heart.777,1
4,835839,heart.777,1
...,...,...,...
79435,1583408,heart.777,3
79436,1583445,heart.200,1
79437,1583450,heart.777,1
79438,1583454,heart.777,2


#### 최대 구매 시도 횟수

In [13]:
# Largest attempt count observed for any single (user, product) pair.
user_product_attempts['attempts'].max()

50

In [14]:
# Show the (user, product) row with the highest attempt count.
user_product_attempts.sort_values('attempts', ascending=False).iloc[:1]

Unnamed: 0,user_id,productId,attempts
74725,1527451,heart.200,50
