In [3]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from datetime import timedelta
import matplotlib.font_manager as fm
import matplotlib as mpl
import ast
import json
# Notebook-wide pandas display settings: remove the row and column truncation
# limits (None = no limit) and keep wide frames from being wrapped across
# multiple lines. With max_rows unset, displaying a large frame directly will
# render every row, so prefer .head()/.sample() on the big tables.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.expand_frame_repr', False)

from collections import Counter
from kiwipiepy import Kiwi


In [None]:
def read_parquet_from_gcs(file_names, gcs_prefix, key_path):
    """Read several Parquet files that live under one storage prefix.

    Parameters
    ----------
    file_names : iterable of str
        Base names of the files, without the ``.parquet`` extension.
    gcs_prefix : str
        Location that contains the files, e.g. ``"gs://bucket/folder"``.
        Trailing slashes are ignored, so ``"gs://bucket/folder/"`` resolves to
        the same objects instead of producing a ``//`` in the object path.
    key_path : str
        Value handed to pandas as ``storage_options={"token": key_path}``.

    Returns
    -------
    dict
        Maps each entry of ``file_names`` to the DataFrame read from
        ``"<gcs_prefix>/<name>.parquet"``. If a name appears more than once,
        the file is read again and the last result is kept.
    """
    # Normalise once so every path is joined with exactly one separator.
    prefix = gcs_prefix.rstrip("/")
    return {
        name: pd.read_parquet(
            f"{prefix}/{name}.parquet",
            # A fresh dict per call, as the reader may hand it to the filesystem layer.
            storage_options={"token": key_path},
        )
        for name in file_names
    }


# Base names of the tables to load; each one is read from
# "<gcs_prefix>/<name>.parquet" by read_parquet_from_gcs.
file_list = [
    "accounts_attendance",
    "accounts_blockrecord",
    "accounts_failpaymenthistory",
    "accounts_friendrequest",
    "accounts_group",
    "accounts_nearbyschool",
    "accounts_paymenthistory",
    "accounts_user_contacts",
    "accounts_pointhistory",
    "accounts_school",
    "accounts_timelinereport",
    "accounts_user",
    "accounts_userquestionrecord",
    "accounts_userwithdraw",
    "event_receipts",
    "events",
    "polls_question",
    "polls_questionpiece",
    "polls_questionreport",
    "polls_questionset",
    "polls_usercandidate"
]

# Storage prefix that holds the Parquet files.
gcs_prefix = "gs://codeit-project/votes"
# Passed as the storage-option "token" when reading.
# NOTE(review): presumably a service-account key file — keep it out of version control.
key_path = "./config/key.json"

# Load every listed table; dfs maps table name -> DataFrame.
dfs = read_parquet_from_gcs(file_list, gcs_prefix, key_path)

# Attendance table
accounts_attendance = dfs["accounts_attendance"]

# Block record table
accounts_blockrecord = dfs["accounts_blockrecord"]

# Failed product purchase record table
accounts_failpaymenthistory = dfs["accounts_failpaymenthistory"]

# Friend request table
accounts_friendrequest = dfs["accounts_friendrequest"]

# Group table
accounts_group = dfs["accounts_group"]

# Nearby school table
accounts_nearbyschool = dfs["accounts_nearbyschool"]

# Payment history table
accounts_paymenthistory = dfs["accounts_paymenthistory"]

# User contacts table
accounts_user_contacts = dfs["accounts_user_contacts"]

# Point history table
accounts_pointhistory = dfs["accounts_pointhistory"]

# School information table
accounts_school = dfs["accounts_school"]

# Timeline report record table
accounts_timelinereport = dfs["accounts_timelinereport"]

# User basic information table
accounts_user = dfs["accounts_user"]

# User question response record table
accounts_userquestionrecord = dfs["accounts_userquestionrecord"]

# Member withdrawal record table
accounts_userwithdraw = dfs["accounts_userwithdraw"]

# Event receipts table
event_receipts = dfs["event_receipts"]

# Events table
events = dfs["events"]

# Question table
polls_question = dfs["polls_question"]

# Question piece table
polls_questionpiece = dfs["polls_questionpiece"]

# Question report table
polls_questionreport = dfs["polls_questionreport"]

# Question set table
polls_questionset = dfs["polls_questionset"]

# User candidate response table
polls_usercandidate = dfs["polls_usercandidate"]

In [None]:
# Preview the first rows of the attendance table.
accounts_attendance.head()

Unnamed: 0,id,attendance_date_list,user_id
0,1,"[""2023-05-27"", ""2023-05-28"", ""2023-05-29"", ""20...",1446852
1,2,"[""2023-05-27"", ""2023-05-29"", ""2023-05-30"", ""20...",1359398
2,3,"[""2023-05-27"", ""2023-05-29"", ""2023-05-30"", ""20...",1501542
3,4,"[""2023-05-27"", ""2023-05-28"", ""2023-05-29"", ""20...",1507767
4,5,"[""2023-05-27"", ""2023-05-28"", ""2023-05-29"", ""20...",1287453


In [None]:
# Preview the first rows of the failed-payment table.
accounts_failpaymenthistory.head()

Unnamed: 0,id,productId,phone_type,created_at,user_id
0,6,heart.200,A,2023-05-14 05:49:22,1055891
1,7,heart.777,A,2023-05-14 08:17:21,1152151
2,8,heart.777,A,2023-05-14 10:11:46,986200
3,9,heart.1000,A,2023-05-14 11:53:09,1028261
4,10,heart.777,A,2023-05-14 12:30:47,1235730


In [None]:
# Column dtypes, non-null counts and memory footprint of the failed-payment table.
accounts_failpaymenthistory.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163 entries, 0 to 162
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          163 non-null    int64         
 1   productId   56 non-null     object        
 2   phone_type  163 non-null    object        
 3   created_at  163 non-null    datetime64[ns]
 4   user_id     163 non-null    int64         
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 6.5+ KB


In [None]:
# Number of missing values in each column of the failed-payment table.
accounts_failpaymenthistory.isna().sum()

id              0
productId     107
phone_type      0
created_at      0
user_id         0
dtype: int64

In [None]:
# Preview the first rows of the friend-request table.
accounts_friendrequest.head()

Unnamed: 0,id,status,created_at,updated_at,receive_user_id,send_user_id
0,7,P,2023-04-17 18:29:11,2023-04-17 18:29:11,831962,837521
1,10,A,2023-04-17 18:29:11,2023-04-22 06:02:53,832151,837521
2,11,A,2023-04-17 18:29:11,2023-04-18 19:28:41,832340,837521
3,13,A,2023-04-17 18:29:11,2023-04-19 11:05:04,833041,837521
4,20,P,2023-04-17 18:29:11,2023-04-17 18:29:11,834415,837521


In [None]:
# Check for duplicate rows: count rows that repeat an earlier row in every column.
accounts_friendrequest.duplicated().sum()

np.int64(0)

In [None]:
# Rows of accounts_group belonging to school_id 4383, ordered by grade (ascending).
accounts_group.loc[accounts_group['school_id'].eq(4383)].sort_values('grade')

Unnamed: 0,id,grade,class_num,school_id
40592,40614,1,2,4383
54375,54400,1,3,4383
58686,58711,1,4,4383
70593,70619,1,5,4383
70614,70640,1,6,4383
76437,76464,1,4,4383
71595,71621,2,5,4383
68448,68474,2,1,4383
75729,75756,2,3,4383
75559,75586,2,4,4383


In [None]:
# Preview the first rows of the user table.
accounts_user.head()

Unnamed: 0,id,is_superuser,is_staff,gender,point,friend_id_list,is_push_on,created_at,block_user_id_list,hide_user_id_list,ban_status,report_count,alarm_count,pending_chat,pending_votes,group_id
0,831956,1,1,,600,"[1292473, 913158, 1488461, 1064695, 1043565, 1...",0,2023-03-29 03:44:14.047130,[],[],N,0,0,0,0,
1,831962,0,0,F,2248,"[833025, 832642, 982531, 879496, 838541, 83752...",1,2023-03-29 05:18:56.162368,[],[],N,253,40878,5499,110,12.0
2,832151,0,0,M,1519,"[838785, 982531, 882567, 879496, 838541, 83649...",0,2023-03-29 12:56:34.989468,[],[],N,0,37,0,47,1.0
3,832340,0,0,F,57,"[841345, 982531, 838785, 963714, 882567, 83252...",1,2023-03-29 12:56:35.020790,[],[],N,0,19,0,21,1.0
4,832520,0,0,M,1039,"[874050, 849763, 874212, 844297, 838541, 84004...",0,2023-03-29 12:56:35.049311,[],[],N,0,29,0,15,12.0


In [None]:
# Preview the first rows of the user question-record table.
accounts_userquestionrecord.head()

Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times
0,771777,C,2023-04-28 12:27:49,849469,252,849436,998458,0,N,2023-04-28 12:27:49,0,0
1,771800,C,2023-04-28 12:28:02,849446,244,849436,998459,0,N,2023-04-28 12:28:02,0,0
2,771812,C,2023-04-28 12:28:09,849454,183,849436,998460,1,N,2023-04-28 12:28:09,0,0
3,771828,C,2023-04-28 12:28:16,847375,101,849436,998461,0,N,2023-04-28 12:28:16,0,0
4,771851,C,2023-04-28 12:28:26,849477,209,849436,998462,1,N,2023-04-28 12:28:26,0,0


In [None]:
# Column dtypes, non-null counts and memory footprint of the user question-record table.
accounts_userquestionrecord.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1217558 entries, 0 to 1217557
Data columns (total 12 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   id                 1217558 non-null  int64         
 1   status             1217558 non-null  object        
 2   created_at         1217558 non-null  datetime64[ns]
 3   chosen_user_id     1217558 non-null  int64         
 4   question_id        1217558 non-null  int64         
 5   user_id            1217558 non-null  int64         
 6   question_piece_id  1217558 non-null  int64         
 7   has_read           1217558 non-null  int64         
 8   answer_status      1217558 non-null  object        
 9   answer_updated_at  1217558 non-null  datetime64[ns]
 10  report_count       1217558 non-null  int64         
 11  opened_times       1217558 non-null  int64         
dtypes: datetime64[ns](2), int64(8), object(2)
memory usage: 111.5+ MB


In [None]:
# All question records whose user_id is 849436.
accounts_userquestionrecord.loc[accounts_userquestionrecord['user_id'].eq(849436)]