# 지표 탐색 및 파이프라인 구축하기
---

### 목표
1. 지표 탐색(AARRR)
2. 파이프라인 구축을 위한 코드 작성 및 탐색


## 필요 라이브러리 정리
---
#### 1. 라이브러리 불러오기


In [1]:
## 1.필요 라이브러리 정리 
import pandas as pd

# object 형식으로 저장된 변수를 list 형식으로 바꾸기 위해서
import ast

# GCS 파일 경로에서 데이터 불러오기 위한 라이브러리
from google.cloud import storage
from google.oauth2 import service_account

# 시각화를 위한 라이브러리
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns
import matplotlib.dates as mdates

### 2. 데이터 불러오기 실행 확인

In [2]:
## 2. GCS에서 데이터 불러오기
df = pd.read_parquet(
    "gs://my-advanced_data-bucket/votes/accounts_timelinereport.parquet",
    storage_options={"token": "/home/user/codeit_project/codeit-project-docker/config/key.json"}
)

display(df.head())

Unnamed: 0,id,reason,created_at,reported_user_id,user_id,user_question_record_id
0,28,타인을 사칭함,2023-05-06 04:44:57,874587,885082,3920588
1,37,친구를 비하하거나 조롱하는 어투,2023-05-06 05:41:19,881048,881298,4018679
2,46,친구를 비하하거나 조롱하는 어투,2023-05-06 06:26:59,887882,881945,4120914
3,64,허위 사실 언급,2023-05-06 07:07:46,888610,893684,4143049
4,65,허위 사실 언급,2023-05-06 07:07:52,888610,893684,4143049


## 데이터 가져오기
---
### 1. Votes 데이터 가져오기
### 2. Votes 데이터 내 전처리
### 3. Hackle 데이터 가져오기
### 4. Hackle 데이터 내 전처리

In [5]:
# 인증
key_path = "/home/user/codeit_project/codeit-project-docker/config/key.json"
credentials = service_account.Credentials.from_service_account_file(key_path)

# GCS 클라이언트 생성
client = storage.Client(credentials=credentials)
bucket = client.bucket("my-advanced_data-bucket")

# 'votes/' 경로에 있는 .parquet 파일 리스트 가져오기
vote_blobs = bucket.list_blobs(prefix="votes/")
vote_file_list = [f"gs://my-advanced_data-bucket/{blob.name}" for blob in vote_blobs if blob.name.endswith(".parquet")]

# 결과 확인
print("Votes files in GCS:")
print(vote_file_list)
print()

# 'hackle/' 경로에 있는 .parquet 파일 리스트 가져오기
hackle_blobs = bucket.list_blobs(prefix="hackle/")
hackle_file_list = [f"gs://my-advanced_data-bucket/{blob.name}" for blob in hackle_blobs if blob.name.endswith(".parquet")]

# 결과 확인
print("Hackle files in GCS:")
print(hackle_file_list)


Votes files in GCS:
['gs://my-advanced_data-bucket/votes/accounts_attendance.parquet', 'gs://my-advanced_data-bucket/votes/accounts_blockrecord.parquet', 'gs://my-advanced_data-bucket/votes/accounts_failpaymenthistory.parquet', 'gs://my-advanced_data-bucket/votes/accounts_friendrequest.parquet', 'gs://my-advanced_data-bucket/votes/accounts_group.parquet', 'gs://my-advanced_data-bucket/votes/accounts_nearbyschool.parquet', 'gs://my-advanced_data-bucket/votes/accounts_paymenthistory.parquet', 'gs://my-advanced_data-bucket/votes/accounts_pointhistory.parquet', 'gs://my-advanced_data-bucket/votes/accounts_school.parquet', 'gs://my-advanced_data-bucket/votes/accounts_timelinereport.parquet', 'gs://my-advanced_data-bucket/votes/accounts_user.parquet', 'gs://my-advanced_data-bucket/votes/accounts_user_contacts.parquet', 'gs://my-advanced_data-bucket/votes/accounts_userquestionrecord.parquet', 'gs://my-advanced_data-bucket/votes/accounts_userwithdraw.parquet', 'gs://my-advanced_data-bucket/vot

In [6]:
# 결과 저장용 딕셔너리
df_dict = {}

# vote 내 각 파일을 불러와서 딕셔너리에 저장
for title in vote_file_list:
    gcs_path = title
    df_file_name = title.split('/')[-1].replace('.parquet', '')
    
    # 데이터 불러오기
    df = pd.read_parquet(gcs_path, storage_options={"token": key_path})
    
    # 딕셔너리에 저장
    df_dict[df_file_name] = df

    # 출력
    print(f"{df_file_name} 데이터 프레임 확인")
    print("="*50)
    print(df.head())
    print(f"{df_file_name} 데이터 프레임 크기 : {df.shape}")
    print()
    print(f"{df_file_name} 기본 정보 확인")
    df.info()
    print("\n" + "-"*70 + "\n")
    
# hackle 내 각 파일을 불러와서 딕셔너리에 저장
for title in hackle_file_list:
    gcs_path = title
    df_file_name = title.split('/')[-1].replace('.parquet', '')
    
    # 데이터 불러오기
    df = pd.read_parquet(gcs_path, storage_options={"token": key_path})
    
    # 딕셔너리에 저장
    df_dict[df_file_name] = df

    # 출력
    print(f"{df_file_name} 데이터 프레임 확인")
    print("="*50)
    print(df.head())
    print(f"{df_file_name} 데이터 프레임 크기 : {df.shape}")
    print()
    print(f"{df_file_name} 기본 정보 확인")
    df.info()
    print("\n" + "-"*70 + "\n")

accounts_attendance 데이터 프레임 확인
   id                               attendance_date_list  user_id
0   1  ["2023-05-27", "2023-05-28", "2023-05-29", "20...  1446852
1   2  ["2023-05-27", "2023-05-29", "2023-05-30", "20...  1359398
2   3  ["2023-05-27", "2023-05-29", "2023-05-30", "20...  1501542
3   4  ["2023-05-27", "2023-05-28", "2023-05-29", "20...  1507767
4   5  ["2023-05-27", "2023-05-28", "2023-05-29", "20...  1287453
accounts_attendance 데이터 프레임 크기 : (349637, 3)

accounts_attendance 기본 정보 확인
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349637 entries, 0 to 349636
Data columns (total 3 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   id                    349637 non-null  int64 
 1   attendance_date_list  349637 non-null  object
 2   user_id               349637 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 8.0+ MB

----------------------------------------------------------------------

accounts_b

In [7]:
df_dict["device_properties"].head()

Unnamed: 0,id,device_id,device_model,device_vendor
0,1,000007C9-E103-4EB5-9777-A9084D4952DF,"iPhone14,7",Apple
1,2,00002245-458F-4CDD-8533-B448CD43DBD2,"iPhone14,7",Apple
2,3,00012620-313A-4502-9F8D-8DAB7443215B,"iPhone14,5",Apple
3,4,000137bc-80de-4bb5-b61d-df7f217a4501,SM-F711N,samsung
4,5,000227D6-B782-4367-91C4-486B76DF9E37,"iPhone12,3",Apple


In [22]:
df_dict["user_properties"].head()

print(df_dict["user_properties"].user_id.duplicated().sum())

# user_properties의 user_id가 존재하고 중복값이 존재하지 않음

display(df_dict["hackle_properties"].head())

## 각 id의 중복값은

print(df_dict["hackle_properties"].user_id.duplicated().sum())
print(df_dict["hackle_properties"].device_id.duplicated().sum())
print(df_dict["hackle_properties"].session_id.duplicated().sum())

## 한 명의 유저를 특정하기 위헤서 user_id를 사용해야할까? 아니면 어떤 값을 사용해야할까?
print("\n각 id의 고유값 개수 확인")
print(df_dict["hackle_properties"].user_id.nunique())
print(df_dict["hackle_properties"].device_id.nunique())
print(df_dict["hackle_properties"].session_id.nunique())

0


Unnamed: 0,id,session_id,user_id,language,osname,osversion,versionname,device_id
0,1,4OzYh3seq3VKytpSn5pvQkZNQii1,,ko-KR,iOS,16.0,2.0.0,590E7C79-CBA0-44D8-8BE3-3C9BFABBBC74
1,2,8QXy31PQxbW9qLzq0Y1dhR8Ypm52,1046711.0,ko-KR,iOS,16.5.1,2.0.3,D5417226-F71B-4A9E-A180-CD072F2AB279
2,3,6bcea65d-9f40-46fc-888c-700fe707483f,1545130.0,ko,Android,13,2.0.5,6bcea65d-9f40-46fc-888c-700fe707483f
3,4,XVYNT6zfhFWqIg9omwg2AHDjTLx2,1224793.0,ko,Android,13,2.0.5,a05c1595-3e05-434b-8684-218b528bd725
4,5,XFB2SPiGfjbVhvJ3Q3DBsaT3m2B3,1329450.0,ko-US,iOS,16.5.1,2.0.5,EAC6C0B3-7CE8-40EA-8A91-9977C0BA5EF3


197969
273630
271734

각 id의 고유값 개수 확인
327381
251720
253616


In [30]:
# 한 사람을 특정하기 위해서 user_id에 따른 세션의 수
display(df_dict["hackle_properties"].groupby("user_id")["session_id"].nunique().sort_values(ascending=False).head(10))

# 
df_dict["hackle_properties"].groupby("user_id")["device_id"].nunique().sort_values(ascending=False).head(10)

user_id
           78197
1459833       13
1571506       13
1578652       13
1239584       12
1285353       12
855879        12
910934        11
1388873       11
1454994       10
Name: session_id, dtype: int64

user_id
           77445
1571506       13
1459833       13
1578652       13
855879        12
1285353       12
1388873       11
910934        11
1239584       11
1454994       10
Name: device_id, dtype: int64

In [None]:
# 예시 확인인
df_dict["hackle_properties"][df_dict["hackle_properties"].user_id == "1459833"].head()

Unnamed: 0,id,session_id,user_id,language,osname,osversion,versionname,device_id
2255,2256,E7291C7D-54D9-459B-B93D-EB3DB0B5EB54,1459833,ko-KR,iOS,16.3.1,2.0.5,E7291C7D-54D9-459B-B93D-EB3DB0B5EB54
24115,24116,EF35B630-C9C9-4CEF-B70A-C5EB2703C17F,1459833,ko-KR,iOS,16.3.1,2.0.5,EF35B630-C9C9-4CEF-B70A-C5EB2703C17F
30217,30218,3CF3AF75-ECF6-4A07-A4F3-66EE0D114428,1459833,ko-KR,iOS,16.3.1,2.0.5,3CF3AF75-ECF6-4A07-A4F3-66EE0D114428
61146,61147,91AC528E-4CE9-4E3E-9283-98DD3F0E40C1,1459833,ko-KR,iOS,16.3.1,2.0.5,91AC528E-4CE9-4E3E-9283-98DD3F0E40C1
64455,64456,4BC78F41-A9F2-400E-AEC6-1C9449163BC2,1459833,ko-KR,iOS,16.3.1,2.0.3,4BC78F41-A9F2-400E-AEC6-1C9449163BC2


In [None]:
# 유저 아이디에 따른 세션 수와 device 수를 데이터 프레임으로 제작 후 확인
user_session_device_df = df_dict["hackle_properties"].groupby("user_id").agg({
    "session_id": "nunique",
    "device_id": "nunique"
}).reset_index()
user_session_device_df.columns = ["user_id", "session_count", "device_count"]
user_session_device_df.sort_values(by="session_count", ascending=False, inplace=True)
display(user_session_device_df.head())

# session_count에 따른 수 확인
user_session_device_df["session_count"].value_counts().sort_index()


Unnamed: 0,user_id,session_count,device_count
0,,78197,77445
158765,1459833.0,13,13
192534,1571506.0,13,13
195273,1578652.0,13,13
88523,1239584.0,12,11


session_count
1        325065
2          2062
3           146
4            44
5            19
6            15
7             8
8             7
9             5
10            1
11            2
12            3
13            3
78197         1
Name: count, dtype: int64

In [58]:
display(df_dict["hackle_properties"].head())
df_dict["hackle_properties"].iloc[0].user_id

# user_id가 공백으로 되어있는 경우 제거
hackle_properties_not_null_user_id = df_dict["hackle_properties"][df_dict["hackle_properties"].user_id != ""]
hackle_properties_not_null_user_id

Unnamed: 0,id,session_id,user_id,language,osname,osversion,versionname,device_id
0,1,4OzYh3seq3VKytpSn5pvQkZNQii1,,ko-KR,iOS,16.0,2.0.0,590E7C79-CBA0-44D8-8BE3-3C9BFABBBC74
1,2,8QXy31PQxbW9qLzq0Y1dhR8Ypm52,1046711.0,ko-KR,iOS,16.5.1,2.0.3,D5417226-F71B-4A9E-A180-CD072F2AB279
2,3,6bcea65d-9f40-46fc-888c-700fe707483f,1545130.0,ko,Android,13,2.0.5,6bcea65d-9f40-46fc-888c-700fe707483f
3,4,XVYNT6zfhFWqIg9omwg2AHDjTLx2,1224793.0,ko,Android,13,2.0.5,a05c1595-3e05-434b-8684-218b528bd725
4,5,XFB2SPiGfjbVhvJ3Q3DBsaT3m2B3,1329450.0,ko-US,iOS,16.5.1,2.0.5,EAC6C0B3-7CE8-40EA-8A91-9977C0BA5EF3


Unnamed: 0,id,session_id,user_id,language,osname,osversion,versionname,device_id
1,2,8QXy31PQxbW9qLzq0Y1dhR8Ypm52,1046711,ko-KR,iOS,16.5.1,2.0.3,D5417226-F71B-4A9E-A180-CD072F2AB279
2,3,6bcea65d-9f40-46fc-888c-700fe707483f,1545130,ko,Android,13,2.0.5,6bcea65d-9f40-46fc-888c-700fe707483f
3,4,XVYNT6zfhFWqIg9omwg2AHDjTLx2,1224793,ko,Android,13,2.0.5,a05c1595-3e05-434b-8684-218b528bd725
4,5,XFB2SPiGfjbVhvJ3Q3DBsaT3m2B3,1329450,ko-US,iOS,16.5.1,2.0.5,EAC6C0B3-7CE8-40EA-8A91-9977C0BA5EF3
5,6,LztzUUFoRxdqTSPgQrX3MAAyNkM2,LztzUUFoRxdqTSPgQrX3MAAyNkM2,ko-KR,iOS,16.1,2.0.5,3F199073-9390-4137-B0B0-0DC4FC103009
...,...,...,...,...,...,...,...,...
525344,525345,b82eptestoYIkel7zGItYz9XqF43,902597,ko-KR,iOS,16.0,2.0.3,B59DCE74-59FB-4417-9DDB-F9B620D71DFC
525345,525346,KlGJxOfY4XdbxnwzPckMh4NdwBk2,1373831,ko-KR,iOS,16.5.1,2.0.5,2EB9127D-703A-495B-8A1E-6667ACA9E724
525346,525347,HGxbSi2oq4MdFVGdQx2UH3f9Aq73,1043127,en-KR,iOS,16.1.2,2.0.3,4400D84D-0353-49C9-818E-6A45D54F1039
525348,525349,gQ5GvGk7kGWwnQbOzQ8fxseQp8B2,7m8IwV5H3aaxr1bdPbbkvvJMvtf2,ko,Android,12,1.2.15,eca30324-3b48-41a6-9628-ff663896dd23


In [59]:
# 유저 id 없는 경우 빼고서 session_count 차이

user_session_device_df_no_null = hackle_properties_not_null_user_id.groupby("user_id").agg({
    "session_id": "nunique",
    "device_id": "nunique"
}).reset_index()
user_session_device_df_no_null.columns = ["user_id", "session_count", "device_count"]
user_session_device_df_no_null.sort_values(by="session_count", ascending=False, inplace=True)
display(user_session_device_df_no_null.head())

# session_count에 따른 수 확인
user_session_device_df_no_null["session_count"].value_counts().sort_index()

Unnamed: 0,user_id,session_count,device_count
158764,1459833,13,13
195272,1578652,13,13
192533,1571506,13,13
103927,1285353,12,12
88522,1239584,12,11


session_count
1     325065
2       2062
3        146
4         44
5         19
6         15
7          8
8          7
9          5
10         1
11         2
12         3
13         3
Name: count, dtype: int64