In [1]:
import pandas as pd
import os
from glob import glob
import platform
import matplotlib.pyplot as plt
import koreanize_matplotlib
import matplotlib.font_manager as fm
import seaborn as sns
import ast

# Keep minus signs from rendering as broken glyphs on plot axes
# (translated from the original note).
plt.rcParams.update({'axes.unicode_minus': False})

# Show floats with thousands separators and no decimals instead of scientific
# notation. Note that this rounds every displayed float to an integer.
pd.options.display.float_format = '{:,.0f}'.format

# Widen the display limits so frames are not truncated.
pd.options.display.max_columns = 100     # show up to 100 columns
pd.options.display.max_rows = None       # no row limit
pd.options.display.max_seq_items = None  # no limit on items shown in sequences
pd.options.display.max_colwidth = None   # do not truncate long cell values

In [2]:
# Load the question-report table and show summary statistics for it.
polls_questionreport_df = pd.read_parquet(
    path="./data/votes/polls_questionreport.parquet"
)
polls_questionreport_df.describe()

Unnamed: 0,id,created_at,question_id,user_id
count,51424,51424,51424,51424
mean,28027,2023-05-21 22:12:41.641120768,733,1168247
min,1,2023-04-19 06:20:35,99,832340
25%,14021,2023-05-13 01:36:21.249999872,305,978481
50%,28048,2023-05-18 14:33:45,494,1160251
75%,42084,2023-05-26 08:45:25,1021,1351295
max,55767,2024-05-05 14:56:25,5110,1583634
std,16163,,666,208247


---
---
# 1. polls 데이터 모아보기

---
---
## 1.1 polls 기본정보

In [2]:
# Directory that holds the raw poll tables
folder_path = './data/votes/'

# Collect every parquet file whose name starts with "polls_".
# glob() yields matches in an arbitrary, filesystem-dependent order, so the
# list is sorted to keep the order of the per-file summaries reproducible.
parquet_files = sorted(glob(os.path.join(folder_path, 'polls_*.parquet')))

# Summary printer applied to each raw table
def show_df_summary(df, name):
    """Print a quick overview of ``df`` labelled with ``name``.

    Emits, in order: the ``DataFrame.info()`` report, the first two rows
    (rendered through the notebook's ``display``), and -- only when the frame
    has a ``created_at`` column -- the minimum and maximum of that column.

    Args:
        df: DataFrame to summarise.
        name: Label interpolated into the printed section headers.

    Returns:
        None. Everything is written to the cell output.
    """
    info_header = f"\n===== {name}.info() ====="
    head_header = f"\n===== {name}.head(2) ====="

    print(info_header)
    df.info()
    print(head_header)
    display(df.head(2))

    # Frames without a creation timestamp have no range to report.
    if 'created_at' not in df.columns:
        return

    created = df['created_at']
    earliest, latest = created.min(), created.max()
    print(f"\n🕒 {name} - created_at range: {earliest} → {latest}")

# Walk over every discovered parquet file and print its summary. Any failure
# while reading or summarising a file is reported and the loop moves on, so
# the remaining files are still shown.
for file_path in parquet_files:
    file_name = os.path.split(file_path)[1]
    try:
        df = pd.read_parquet(file_path)
        show_df_summary(df, file_name)
    except Exception as err:
        print(f"❌ Failed to read {file_name}: {err}")


===== polls_questionpiece.parquet.info() =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1265476 entries, 0 to 1265475
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   id           1265476 non-null  int64         
 1   is_voted     1265476 non-null  int64         
 2   created_at   1265476 non-null  datetime64[ns]
 3   question_id  1265476 non-null  int64         
 4   is_skipped   1265476 non-null  int64         
dtypes: datetime64[ns](1), int64(4)
memory usage: 48.3 MB

===== polls_questionpiece.parquet.head(2) =====


Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
0,998458,1,2023-04-28 12:27:22,252,0
1,998459,1,2023-04-28 12:27:22,244,0



🕒 polls_questionpiece.parquet - created_at range: 2023-04-28 12:27:22 → 2024-05-07 11:32:30

===== polls_usercandidate.parquet.info() =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4769609 entries, 0 to 4769608
Data columns (total 4 columns):
 #   Column             Dtype         
---  ------             -----         
 0   id                 int64         
 1   created_at         datetime64[ns]
 2   question_piece_id  int64         
 3   user_id            int64         
dtypes: datetime64[ns](1), int64(3)
memory usage: 145.6 MB

===== polls_usercandidate.parquet.head(2) =====


Unnamed: 0,id,created_at,question_piece_id,user_id
0,3088872,2023-04-28 12:27:49,998458,849444
1,3088873,2023-04-28 12:27:49,998458,849454



🕒 polls_usercandidate.parquet - created_at range: 2023-04-28 12:27:49 → 2024-05-08 01:36:18

===== polls_question.parquet.info() =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5025 entries, 0 to 5024
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   id             5025 non-null   int64         
 1   question_text  5025 non-null   object        
 2   created_at     5025 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 117.9+ KB

===== polls_question.parquet.head(2) =====


Unnamed: 0,id,question_text,created_at
0,99,가장 신비한 매력이 있는 사람은?,2023-03-31 15:22:53
1,100,"""이 사람으로 한 번 살아보고 싶다"" 하는 사람은?",2023-03-31 15:22:53



🕒 polls_question.parquet - created_at range: 2023-03-31 15:22:53 → 2023-06-06 06:15:52

===== polls_questionset.parquet.info() =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158384 entries, 0 to 158383
Data columns (total 6 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   id                      158384 non-null  int64         
 1   question_piece_id_list  158384 non-null  object        
 2   opening_time            158384 non-null  datetime64[ns]
 3   status                  158384 non-null  object        
 4   created_at              158384 non-null  datetime64[ns]
 5   user_id                 158384 non-null  int64         
dtypes: datetime64[ns](2), int64(2), object(2)
memory usage: 7.3+ MB

===== polls_questionset.parquet.head(2) =====


Unnamed: 0,id,question_piece_id_list,opening_time,status,created_at,user_id
0,99817,"[998458, 998459, 998460, 998461, 998462, 998463, 998464, 998465, 998466, 998467]",2023-04-28 12:27:22,F,2023-04-28 12:27:23,849436
1,99830,"[998588, 998589, 998590, 998591, 998592, 998593, 998594, 998595, 998596, 998597]",2023-04-28 12:28:07,F,2023-04-28 12:28:07,849438



🕒 polls_questionset.parquet - created_at range: 2023-04-28 12:27:23 → 2024-05-07 11:32:30

===== polls_questionreport.parquet.info() =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51424 entries, 0 to 51423
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           51424 non-null  int64         
 1   reason       51424 non-null  object        
 2   created_at   51424 non-null  datetime64[ns]
 3   question_id  51424 non-null  int64         
 4   user_id      51424 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 2.0+ MB

===== polls_questionreport.parquet.head(2) =====


Unnamed: 0,id,reason,created_at,question_id,user_id
0,1,이 질문은 재미없어요,2023-04-19 06:20:35,250,837556
1,2,이 질문은 재미없어요,2023-04-19 06:58:09,113,837672



🕒 polls_questionreport.parquet - created_at range: 2023-04-19 06:20:35 → 2024-05-05 14:56:25


In [None]:
# Load the five raw poll tables.
polls_questionset_df = pd.read_parquet("./data/votes/polls_questionset.parquet")
polls_questionpiece_df = pd.read_parquet("./data/votes/polls_questionpiece.parquet")
polls_question_df = pd.read_parquet("./data/votes/polls_question.parquet")
polls_usercandidate_df = pd.read_parquet("./data/votes/polls_usercandidate.parquet")
polls_questionreport_df = pd.read_parquet("./data/votes/polls_questionreport.parquet")

# Steps 1-3: question_piece_id_list holds the string form of a list of ids.
# Parse it into a real list, explode it to one id per row, and cast the ids
# to int so they can be joined against polls_questionpiece.id.
polls_questionset_df = (
    polls_questionset_df
    .assign(
        question_piece_id_list=lambda qs: qs["question_piece_id_list"].apply(ast.literal_eval)
    )
    .explode("question_piece_id_list")
    .astype({"question_piece_id_list": int})
)

# Step 4: question set + question piece (inner join on the piece id).
# Overlapping column names receive the _qs / _qp suffixes.
polls_questionset_questionpiece_df = pd.merge(
    polls_questionset_df,
    polls_questionpiece_df,
    left_on="question_piece_id_list",
    right_on="id",
    suffixes=("_qs", "_qp"),
)

# Step 5: add the question text (inner join on the question id).
# NOTE(review): the left frame's id/created_at were already suffixed in step 4,
# so these suffixes look like they never apply and the question's `id` and
# `created_at` keep their plain names here -- confirm.
polls_questionset_questionpiece_df_q = pd.merge(
    polls_questionset_questionpiece_df,
    polls_question_df,
    left_on="question_id",
    right_on="id",
    suffixes=("", "_q"),
)

# Step 6: add the user candidates of each question piece (inner join).
# Overlapping right-hand columns receive the _uc suffix.
polls_questionset_questionpiece_df_q_uc = pd.merge(
    polls_questionset_questionpiece_df_q,
    polls_usercandidate_df,
    left_on="id_qp",
    right_on="question_piece_id",
    suffixes=("", "_uc"),
)

# Step 7: attach question reports with a left join on (question_id, user_id).
# No suffixes are given, so overlapping columns get pandas' default _x / _y.
# NOTE(review): a left join repeats a left row once per matching report, so a
# (question_id, user_id) pair with several reports inflates the row count --
# compare the row counts before and after if one row per candidate is expected.
final = pd.merge(
    polls_questionset_questionpiece_df_q_uc,
    polls_questionreport_df,
    on=["question_id", "user_id"],
    how="left",
)

In [4]:
# Delete the raw tables from the namespace now that `final` has been built.
# %xdel is an IPython line magic; it also tries to clear IPython's own
# references to the object.
# NOTE(review): the intermediate merge frames are not deleted here -- confirm
# whether they are still needed.
%xdel polls_questionset_df
%xdel polls_questionpiece_df
%xdel polls_question_df
%xdel polls_usercandidate_df
%xdel polls_questionreport_df

NameError: name 'polls_usercandidate_df' is not defined
NameError: name 'polls_questionreport_df' is not defined


In [5]:
# Print the index, column names and dtypes of the merged frame.
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4769680 entries, 0 to 4769679
Data columns (total 21 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   id_qs                   int64         
 1   question_piece_id_list  int64         
 2   opening_time            datetime64[ns]
 3   status                  object        
 4   created_at_qs           datetime64[ns]
 5   user_id                 int64         
 6   id_qp                   int64         
 7   is_voted                int64         
 8   created_at_qp           datetime64[ns]
 9   question_id             int64         
 10  is_skipped              int64         
 11  id_x                    int64         
 12  question_text           object        
 13  created_at_x            datetime64[ns]
 14  id_uc                   int64         
 15  created_at_uc           datetime64[ns]
 16  question_piece_id       int64         
 17  user_id_uc              int64         
 18  id

In [6]:
# Preview the first five rows of the merged frame.
final.head(n=5)

Unnamed: 0,id_qs,question_piece_id_list,opening_time,status,created_at_qs,user_id,id_qp,is_voted,created_at_qp,question_id,is_skipped,id_x,question_text,created_at_x,id_uc,created_at_uc,question_piece_id,user_id_uc,id_y,reason,created_at_y
0,99817,998458,2023-04-28 12:27:22,F,2023-04-28 12:27:23,849436,998458,1,2023-04-28 12:27:22,252,0,252,손이 가장 이쁘게 생겼을거 같은 사람은?,2023-04-01 11:09:27,3088872,2023-04-28 12:27:49,998458,849444,,,NaT
1,99817,998458,2023-04-28 12:27:22,F,2023-04-28 12:27:23,849436,998458,1,2023-04-28 12:27:22,252,0,252,손이 가장 이쁘게 생겼을거 같은 사람은?,2023-04-01 11:09:27,3088873,2023-04-28 12:27:49,998458,849454,,,NaT
2,99817,998458,2023-04-28 12:27:22,F,2023-04-28 12:27:23,849436,998458,1,2023-04-28 12:27:22,252,0,252,손이 가장 이쁘게 생겼을거 같은 사람은?,2023-04-01 11:09:27,3088874,2023-04-28 12:27:49,998458,849460,,,NaT
3,99817,998458,2023-04-28 12:27:22,F,2023-04-28 12:27:23,849436,998458,1,2023-04-28 12:27:22,252,0,252,손이 가장 이쁘게 생겼을거 같은 사람은?,2023-04-01 11:09:27,3088875,2023-04-28 12:27:49,998458,849469,,,NaT
4,100132,1001617,2023-04-28 13:26:53,F,2023-04-28 12:36:53,849762,1001617,1,2023-04-28 12:36:53,252,0,252,손이 가장 이쁘게 생겼을거 같은 사람은?,2023-04-01 11:09:27,3131196,2023-04-28 13:41:41,1001617,849438,,,NaT


In [7]:
# Count the missing values in each column.
final.isna().sum(axis=0)

id_qs                           0
question_piece_id_list          0
opening_time                    0
status                          0
created_at_qs                   0
user_id                         0
id_qp                           0
is_voted                        0
created_at_qp                   0
question_id                     0
is_skipped                      0
id_x                            0
question_text                   0
created_at_x                    0
id_uc                           0
created_at_uc                   0
question_piece_id               0
user_id_uc                      0
id_y                      4768957
reason                    4768957
created_at_y              4768957
dtype: int64

In [8]:
# Columns flagged as duplicates (per the original note) that should be removed
columns_to_drop = ["id_qp", "question_piece_id", "id_x"]

# Clean up: drop them from the merged frame
final = final.drop(labels=columns_to_drop, axis="columns")

In [9]:
# Preview the first five rows after dropping the duplicate columns.
final.head(n=5)

Unnamed: 0,id_qs,question_piece_id_list,opening_time,status,created_at_qs,user_id,is_voted,created_at_qp,question_id,is_skipped,question_text,created_at_x,id_uc,created_at_uc,user_id_uc,id_y,reason,created_at_y
0,99817,998458,2023-04-28 12:27:22,F,2023-04-28 12:27:23,849436,1,2023-04-28 12:27:22,252,0,손이 가장 이쁘게 생겼을거 같은 사람은?,2023-04-01 11:09:27,3088872,2023-04-28 12:27:49,849444,,,NaT
1,99817,998458,2023-04-28 12:27:22,F,2023-04-28 12:27:23,849436,1,2023-04-28 12:27:22,252,0,손이 가장 이쁘게 생겼을거 같은 사람은?,2023-04-01 11:09:27,3088873,2023-04-28 12:27:49,849454,,,NaT
2,99817,998458,2023-04-28 12:27:22,F,2023-04-28 12:27:23,849436,1,2023-04-28 12:27:22,252,0,손이 가장 이쁘게 생겼을거 같은 사람은?,2023-04-01 11:09:27,3088874,2023-04-28 12:27:49,849460,,,NaT
3,99817,998458,2023-04-28 12:27:22,F,2023-04-28 12:27:23,849436,1,2023-04-28 12:27:22,252,0,손이 가장 이쁘게 생겼을거 같은 사람은?,2023-04-01 11:09:27,3088875,2023-04-28 12:27:49,849469,,,NaT
4,100132,1001617,2023-04-28 13:26:53,F,2023-04-28 12:36:53,849762,1,2023-04-28 12:36:53,252,0,손이 가장 이쁘게 생겼을거 같은 사람은?,2023-04-01 11:09:27,3131196,2023-04-28 13:41:41,849438,,,NaT


In [None]:
# Give the columns readable names, prefixed by the table they came from.
final = final.rename(columns={
    # polls_questionset
    "id_qs": "question_set_id",
    "status": "question_set_status",
    "opening_time": "question_set_opening_time",
    "created_at_qs": "question_set_created_at",
    # polls_questionpiece
    "is_voted": "question_piece_is_voted",
    "is_skipped": "question_piece_is_skipped",
    "created_at_qp": "question_piece_created_at",
    # polls_question
    "created_at_x": "question_created_at",
    # polls_usercandidate
    "id_uc": "user_candidate_id",
    "user_id_uc": "user_candidate_user_id",
    "created_at_uc": "user_candidate_created_at",
    # polls_questionreport
    "id_y": "question_report_id",
    "reason": "question_report_reason",
    "created_at_y": "question_report_created_at",
})
final.head(n=5)

Unnamed: 0,question_set_id,question_piece_id_list,question_set_opening_time,question_set_status,question_set_created_at,user_id,question_piece_is_voted,question_piece_created_at,question_id,question_piece_is_skipped,question_text,question_created_at,user_candidate_id,user_candidate_created_at,user_candidate_user_id,question_report_id,question_report_reason,question_report_created_at
0,99817,998458,2023-04-28 12:27:22,F,2023-04-28 12:27:23,849436,1,2023-04-28 12:27:22,252,0,손이 가장 이쁘게 생겼을거 같은 사람은?,2023-04-01 11:09:27,3088872,2023-04-28 12:27:49,849444,,,NaT
1,99817,998458,2023-04-28 12:27:22,F,2023-04-28 12:27:23,849436,1,2023-04-28 12:27:22,252,0,손이 가장 이쁘게 생겼을거 같은 사람은?,2023-04-01 11:09:27,3088873,2023-04-28 12:27:49,849454,,,NaT
2,99817,998458,2023-04-28 12:27:22,F,2023-04-28 12:27:23,849436,1,2023-04-28 12:27:22,252,0,손이 가장 이쁘게 생겼을거 같은 사람은?,2023-04-01 11:09:27,3088874,2023-04-28 12:27:49,849460,,,NaT
3,99817,998458,2023-04-28 12:27:22,F,2023-04-28 12:27:23,849436,1,2023-04-28 12:27:22,252,0,손이 가장 이쁘게 생겼을거 같은 사람은?,2023-04-01 11:09:27,3088875,2023-04-28 12:27:49,849469,,,NaT
4,100132,1001617,2023-04-28 13:26:53,F,2023-04-28 12:36:53,849762,1,2023-04-28 12:36:53,252,0,손이 가장 이쁘게 생겼을거 같은 사람은?,2023-04-01 11:09:27,3131196,2023-04-28 13:41:41,849438,,,NaT


|컬럼명|설명|
|-----|---|
|user_id|유저 id|
|question_id|질문 id|
|question_text|질문 내용|
|question_set_id|질문 세트 고유 id|
|question_set_status|질문 세트 상태(C:닫힘,O:열림,F:종료)|
|question_set_created_at|질문 세트 만든 시간|
|question_set_opening_time|질문 세트 오픈 시간|
|question_piece_id_list|질문 세트에 속한 질문 조각 id (리스트를 explode하여 행마다 id 1개)|
|question_piece_created_at|질문 조각 만든 시간|
|question_piece_is_voted|질문 조각 투표 여부(1 or 0)|
|question_piece_is_skipped|질문 조각 스킵 여부(1 or 0)|
|question_created_at|질문 만든 시간|
|user_candidate_id|질문 조각 보기에 등장한 유저 테이블 고유 id|
|user_candidate_created_at|질문 조각 보기에 등장한 유저 테이블 만든 시간|
|user_candidate_user_id|질문 조각 보기에 등장한 유저 id|
|question_report_id|질문 신고 id|
|question_report_reason|질문 신고 이유|
|question_report_created_at|질문 신고 만든 시간|

In [None]:
# Target column layout: user and question first, then the question-set,
# question-piece, question, user-candidate and question-report columns.
column_order = [
    "user_id",
    "question_id",
    "question_text",
    "question_set_id",
    "question_set_status",
    "question_set_created_at",
    "question_set_opening_time",
    "question_piece_id_list",
    "question_piece_created_at",
    "question_piece_is_voted",
    "question_piece_is_skipped",
    "question_created_at",
    "user_candidate_id",
    "user_candidate_created_at",
    "user_candidate_user_id",
    "question_report_id",
    "question_report_reason",
    "question_report_created_at",
]

# Rearrange the columns; a name missing from `final` raises KeyError.
final = final.loc[:, column_order]

In [12]:
# Preview the first five rows in the new column order.
final.head(n=5)

Unnamed: 0,user_id,question_id,question_text,question_set_id,question_set_status,question_set_created_at,question_set_opening_time,question_piece_id_list,question_piece_created_at,question_piece_is_voted,question_piece_is_skipped,question_created_at,user_candidate_id,user_candidate_created_at,user_candidate_user_id,question_report_id,question_report_reason,question_report_created_at
0,849436,252,손이 가장 이쁘게 생겼을거 같은 사람은?,99817,F,2023-04-28 12:27:23,2023-04-28 12:27:22,998458,2023-04-28 12:27:22,1,0,2023-04-01 11:09:27,3088872,2023-04-28 12:27:49,849444,,,NaT
1,849436,252,손이 가장 이쁘게 생겼을거 같은 사람은?,99817,F,2023-04-28 12:27:23,2023-04-28 12:27:22,998458,2023-04-28 12:27:22,1,0,2023-04-01 11:09:27,3088873,2023-04-28 12:27:49,849454,,,NaT
2,849436,252,손이 가장 이쁘게 생겼을거 같은 사람은?,99817,F,2023-04-28 12:27:23,2023-04-28 12:27:22,998458,2023-04-28 12:27:22,1,0,2023-04-01 11:09:27,3088874,2023-04-28 12:27:49,849460,,,NaT
3,849436,252,손이 가장 이쁘게 생겼을거 같은 사람은?,99817,F,2023-04-28 12:27:23,2023-04-28 12:27:22,998458,2023-04-28 12:27:22,1,0,2023-04-01 11:09:27,3088875,2023-04-28 12:27:49,849469,,,NaT
4,849762,252,손이 가장 이쁘게 생겼을거 같은 사람은?,100132,F,2023-04-28 12:36:53,2023-04-28 13:26:53,1001617,2023-04-28 12:36:53,1,0,2023-04-01 11:09:27,3131196,2023-04-28 13:41:41,849438,,,NaT


In [13]:
# Keep the rows where the question-set opening time is not equal to the
# question-piece creation time.
diff_rows = final.loc[
    final["question_set_opening_time"].ne(final["question_piece_created_at"])
]

# Report how many such rows there are
print(f"서로 다른 행 개수: {diff_rows.shape[0]}")

diff_rows.head(n=5)

서로 다른 행 개수: 4645676


Unnamed: 0,user_id,question_id,question_text,question_set_id,question_set_status,question_set_created_at,question_set_opening_time,question_piece_id_list,question_piece_created_at,question_piece_is_voted,question_piece_is_skipped,question_created_at,user_candidate_id,user_candidate_created_at,user_candidate_user_id,question_report_id,question_report_reason,question_report_created_at
4,849762,252,손이 가장 이쁘게 생겼을거 같은 사람은?,100132,F,2023-04-28 12:36:53,2023-04-28 13:26:53,1001617,2023-04-28 12:36:53,1,0,2023-04-01 11:09:27,3131196,2023-04-28 13:41:41,849438,,,NaT
5,849762,252,손이 가장 이쁘게 생겼을거 같은 사람은?,100132,F,2023-04-28 12:36:53,2023-04-28 13:26:53,1001617,2023-04-28 12:36:53,1,0,2023-04-01 11:09:27,3131197,2023-04-28 13:41:41,849446,,,NaT
6,849762,252,손이 가장 이쁘게 생겼을거 같은 사람은?,100132,F,2023-04-28 12:36:53,2023-04-28 13:26:53,1001617,2023-04-28 12:36:53,1,0,2023-04-01 11:09:27,3131198,2023-04-28 13:41:41,849453,,,NaT
7,849762,252,손이 가장 이쁘게 생겼을거 같은 사람은?,100132,F,2023-04-28 12:36:53,2023-04-28 13:26:53,1001617,2023-04-28 12:36:53,1,0,2023-04-01 11:09:27,3131199,2023-04-28 13:41:41,849543,,,NaT
12,850007,252,손이 가장 이쁘게 생겼을거 같은 사람은?,100898,F,2023-04-28 13:11:50,2023-04-28 13:32:07,1009274,2023-04-28 13:11:50,1,0,2023-04-01 11:09:27,3125612,2023-04-28 13:33:24,847375,,,NaT


In [14]:
# Delete the temporary comparison frame from the namespace (IPython line magic).
%xdel diff_rows

In [15]:
# Rows that carry a report, i.e. question_report_reason is not NaN
final_reason_notna = final.dropna(subset=["question_report_reason"])
final_reason_notna.shape

(723, 18)

In [16]:
# Preview the first five rows whose question_report_reason is not NaN
final_reason_notna.head(n=5)

Unnamed: 0,user_id,question_id,question_text,question_set_id,question_set_status,question_set_created_at,question_set_opening_time,question_piece_id_list,question_piece_created_at,question_piece_is_voted,question_piece_is_skipped,question_created_at,user_candidate_id,user_candidate_created_at,user_candidate_user_id,question_report_id,question_report_reason,question_report_created_at
63642,880437,201,화장을 제일 잘하는 사람?,542164,F,2023-05-06 06:50:14,2023-05-06 07:40:14,5421936,2023-05-06 06:50:14,1,0,2023-04-01 11:09:18,19840162,2023-05-06 16:00:34,887645,3441,자꾸 같은 내용의 질문 반복,2023-05-06 05:17:14
63643,880437,201,화장을 제일 잘하는 사람?,542164,F,2023-05-06 06:50:14,2023-05-06 07:40:14,5421936,2023-05-06 06:50:14,1,0,2023-04-01 11:09:18,19840163,2023-05-06 16:00:34,888965,3441,자꾸 같은 내용의 질문 반복,2023-05-06 05:17:14
63644,880437,201,화장을 제일 잘하는 사람?,542164,F,2023-05-06 06:50:14,2023-05-06 07:40:14,5421936,2023-05-06 06:50:14,1,0,2023-04-01 11:09:18,19840164,2023-05-06 16:00:34,904721,3441,자꾸 같은 내용의 질문 반복,2023-05-06 05:17:14
63645,880437,201,화장을 제일 잘하는 사람?,542164,F,2023-05-06 06:50:14,2023-05-06 07:40:14,5421936,2023-05-06 06:50:14,1,0,2023-04-01 11:09:18,19840165,2023-05-06 16:00:34,911793,3441,자꾸 같은 내용의 질문 반복,2023-05-06 05:17:14
107237,849692,257,가장 친해지고 싶었던 사람은?,144405,F,2023-04-30 03:53:28,2023-04-30 04:43:28,1444343,2023-04-30 03:53:28,1,0,2023-04-01 11:09:28,4619071,2023-04-30 07:00:13,849620,1074,기타,2023-04-29 13:25:27


In [18]:
# Summary statistics, transposed so that each column of `final` becomes a row.
final.describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
user_id,4769680,1105848,838023,884633,1117319,1259413,1583358,206291.0
question_id,4769680,686,99,275,470,944,5133,627.0
question_set_id,4769680,7430172,99817,1891645,6639625,11788730,20835070,5761948.0
question_set_created_at,4769680,2023-05-16 18:38:30.859005440,2023-04-28 12:27:23,2023-05-09 13:52:16,2023-05-15 11:04:10,2023-05-21 17:09:03,2024-03-19 12:53:58,
question_set_opening_time,4769680,2023-05-16 19:20:58.912039168,2023-04-28 12:27:22,2023-05-09 14:35:57,2023-05-15 11:43:15,2023-05-21 17:49:02,2024-03-19 12:53:58,
question_piece_id_list,4769680,74302067,998458,18916763,66396532,117887588,208351468,57619578.0
question_piece_created_at,4769680,2023-05-16 18:38:30.800603136,2023-04-28 12:27:22,2023-05-09 13:52:16,2023-05-15 11:04:10,2023-05-21 17:09:02,2024-03-19 12:53:58,
question_piece_is_voted,4769680,1,1,1,1,1,1,0.0
question_piece_is_skipped,4769680,0,0,0,0,0,1,0.0
question_created_at,4769680,2023-04-28 00:36:33.187461120,2023-03-31 15:22:53,2023-04-01 11:09:33,2023-05-02 05:33:07,2023-05-15 14:00:12,2023-06-06 06:15:52,


1. 질문 세트가 만들어진 시점 2023.4.28 ~ 2024.3.19
2. 질문 세트가 열린 시점 2023.4.28 ~ 2024.3.19
3. 질문 조각이 만들어진 시점 2023.4.28 ~ 2024.3.19
4. 질문이 만들어진 시점 2023.3.31 ~ 2023.6.6 -> 시점 분리 애매
5. 질문 조각에 등장하는 유저 테이블이 만들어진 시점 2023.4.28 ~ 2024.5.8
6. 질문 신고가 만들어진 시점 2023.4.28 ~ 2023.5.28 -> 시점 분리 애매