In [113]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from datetime import timedelta
import matplotlib.font_manager as fm
import matplotlib as mpl
import ast
import json
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.expand_frame_repr', False)

from collections import Counter
from kiwipiepy import Kiwi


In [58]:
def read_parquet_from_gcs(file_names, gcs_prefix, key_path):
    dfs = {}
    for name in file_names:
        path = f"{gcs_prefix}/{name}.parquet"
        dfs[name] = pd.read_parquet(path, storage_options={"token": key_path})
    return dfs


file_list = [
    "accounts_attendance",
    "accounts_blockrecord",
    "accounts_failpaymenthistory",
    "accounts_friendrequest",
    "accounts_group",
    "accounts_nearbyschool",
    "accounts_paymenthistory",
    "accounts_user_contacts",
    "accounts_pointhistory",
    "accounts_school",
    "accounts_timelinereport",
    "accounts_user",
    "accounts_userquestionrecord",
    "accounts_userwithdraw",
    "event_receipts",
    "events",
    "polls_question",
    "polls_questionpiece",
    "polls_questionreport",
    "polls_questionset",
    "polls_usercandidate"
]

gcs_prefix = "gs://codeit-project/votes"
key_path = "./config/key.json"

dfs = read_parquet_from_gcs(file_list, gcs_prefix, key_path)

# 출석 테이블
accounts_attendance = dfs["accounts_attendance"]

# 차단 기록 테이블
accounts_blockrecord = dfs["accounts_blockrecord"]

# 상품 구매 실패 기록 테이블
accounts_failpaymenthistory = dfs["accounts_failpaymenthistory"]

# 친구 요청 테이블
accounts_friendrequest = dfs["accounts_friendrequest"]

# 그룹 테이블
accounts_group = dfs["accounts_group"]

# 근처 학교 테이블
accounts_nearbyschool = dfs["accounts_nearbyschool"]

# 결제 기록 테이블
accounts_paymenthistory = dfs["accounts_paymenthistory"]

# 사용자 연락처 테이블
accounts_user_contacts = dfs["accounts_user_contacts"]

# 포인트 내역 테이블
accounts_pointhistory = dfs["accounts_pointhistory"]

# 학교 정보 테이블
accounts_school = dfs["accounts_school"]

# 타임라인 신고 기록 테이블
accounts_timelinereport = dfs["accounts_timelinereport"]

# 사용자 기본 정보 테이블
accounts_user = dfs["accounts_user"]

# 사용자 질문 응답 기록 테이블
accounts_userquestionrecord = dfs["accounts_userquestionrecord"]

# 회원 탈퇴 기록 테이블
accounts_userwithdraw = dfs["accounts_userwithdraw"]

# 이벤트 영수증 테이블
event_receipts = dfs["event_receipts"]

# 이벤트 테이블
events = dfs["events"]

# 질문 테이블
polls_question = dfs["polls_question"]

# 질문 조각 테이블
polls_questionpiece = dfs["polls_questionpiece"]

# 질문 신고 테이블
polls_questionreport = dfs["polls_questionreport"]

# 질문 세트 테이블
polls_questionset = dfs["polls_questionset"]

# 사용자 후보 응답 테이블
polls_usercandidate = dfs["polls_usercandidate"]

In [59]:
accounts_attendance.head()

Unnamed: 0,id,attendance_date_list,user_id
0,1,"[""2023-05-27"", ""2023-05-28"", ""2023-05-29"", ""20...",1446852
1,2,"[""2023-05-27"", ""2023-05-29"", ""2023-05-30"", ""20...",1359398
2,3,"[""2023-05-27"", ""2023-05-29"", ""2023-05-30"", ""20...",1501542
3,4,"[""2023-05-27"", ""2023-05-28"", ""2023-05-29"", ""20...",1507767
4,5,"[""2023-05-27"", ""2023-05-28"", ""2023-05-29"", ""20...",1287453


In [60]:
accounts_failpaymenthistory.head()

Unnamed: 0,id,productId,phone_type,created_at,user_id
0,6,heart.200,A,2023-05-14 05:49:22,1055891
1,7,heart.777,A,2023-05-14 08:17:21,1152151
2,8,heart.777,A,2023-05-14 10:11:46,986200
3,9,heart.1000,A,2023-05-14 11:53:09,1028261
4,10,heart.777,A,2023-05-14 12:30:47,1235730


In [61]:
accounts_failpaymenthistory.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163 entries, 0 to 162
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          163 non-null    int64         
 1   productId   56 non-null     object        
 2   phone_type  163 non-null    object        
 3   created_at  163 non-null    datetime64[ns]
 4   user_id     163 non-null    int64         
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 6.5+ KB


In [62]:
accounts_failpaymenthistory.isna().sum()

id              0
productId     107
phone_type      0
created_at      0
user_id         0
dtype: int64

In [63]:
accounts_friendrequest.head()

Unnamed: 0,id,status,created_at,updated_at,receive_user_id,send_user_id
0,7,P,2023-04-17 18:29:11,2023-04-17 18:29:11,831962,837521
1,10,A,2023-04-17 18:29:11,2023-04-22 06:02:53,832151,837521
2,11,A,2023-04-17 18:29:11,2023-04-18 19:28:41,832340,837521
3,13,A,2023-04-17 18:29:11,2023-04-19 11:05:04,833041,837521
4,20,P,2023-04-17 18:29:11,2023-04-17 18:29:11,834415,837521


In [64]:
# 중복행 탐색
accounts_friendrequest.duplicated().sum()

np.int64(0)

In [65]:
accounts_group[accounts_group['school_id'] == 4383].sort_values(by  = 'grade', ascending = True)

Unnamed: 0,id,grade,class_num,school_id
40592,40614,1,2,4383
54375,54400,1,3,4383
58686,58711,1,4,4383
70593,70619,1,5,4383
70614,70640,1,6,4383
76437,76464,1,4,4383
71595,71621,2,5,4383
68448,68474,2,1,4383
75729,75756,2,3,4383
75559,75586,2,4,4383


In [66]:
accounts_user.head()

Unnamed: 0,id,is_superuser,is_staff,gender,point,friend_id_list,is_push_on,created_at,block_user_id_list,hide_user_id_list,ban_status,report_count,alarm_count,pending_chat,pending_votes,group_id
0,831956,1,1,,600,"[1292473, 913158, 1488461, 1064695, 1043565, 1...",0,2023-03-29 03:44:14.047130,[],[],N,0,0,0,0,
1,831962,0,0,F,2248,"[833025, 832642, 982531, 879496, 838541, 83752...",1,2023-03-29 05:18:56.162368,[],[],N,253,40878,5499,110,12.0
2,832151,0,0,M,1519,"[838785, 982531, 882567, 879496, 838541, 83649...",0,2023-03-29 12:56:34.989468,[],[],N,0,37,0,47,1.0
3,832340,0,0,F,57,"[841345, 982531, 838785, 963714, 882567, 83252...",1,2023-03-29 12:56:35.020790,[],[],N,0,19,0,21,1.0
4,832520,0,0,M,1039,"[874050, 849763, 874212, 844297, 838541, 84004...",0,2023-03-29 12:56:35.049311,[],[],N,0,29,0,15,12.0


In [67]:
accounts_userquestionrecord.head()

Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times
0,771777,C,2023-04-28 12:27:49,849469,252,849436,998458,0,N,2023-04-28 12:27:49,0,0
1,771800,C,2023-04-28 12:28:02,849446,244,849436,998459,0,N,2023-04-28 12:28:02,0,0
2,771812,C,2023-04-28 12:28:09,849454,183,849436,998460,1,N,2023-04-28 12:28:09,0,0
3,771828,C,2023-04-28 12:28:16,847375,101,849436,998461,0,N,2023-04-28 12:28:16,0,0
4,771851,C,2023-04-28 12:28:26,849477,209,849436,998462,1,N,2023-04-28 12:28:26,0,0


In [68]:
accounts_userquestionrecord.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1217558 entries, 0 to 1217557
Data columns (total 12 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   id                 1217558 non-null  int64         
 1   status             1217558 non-null  object        
 2   created_at         1217558 non-null  datetime64[ns]
 3   chosen_user_id     1217558 non-null  int64         
 4   question_id        1217558 non-null  int64         
 5   user_id            1217558 non-null  int64         
 6   question_piece_id  1217558 non-null  int64         
 7   has_read           1217558 non-null  int64         
 8   answer_status      1217558 non-null  object        
 9   answer_updated_at  1217558 non-null  datetime64[ns]
 10  report_count       1217558 non-null  int64         
 11  opened_times       1217558 non-null  int64         
dtypes: datetime64[ns](2), int64(8), object(2)
memory usage: 111.5+ MB


In [69]:
accounts_userquestionrecord[accounts_userquestionrecord['user_id'] == 849436].head()

Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times
0,771777,C,2023-04-28 12:27:49,849469,252,849436,998458,0,N,2023-04-28 12:27:49,0,0
1,771800,C,2023-04-28 12:28:02,849446,244,849436,998459,0,N,2023-04-28 12:28:02,0,0
2,771812,C,2023-04-28 12:28:09,849454,183,849436,998460,1,N,2023-04-28 12:28:09,0,0
3,771828,C,2023-04-28 12:28:16,847375,101,849436,998461,0,N,2023-04-28 12:28:16,0,0
4,771851,C,2023-04-28 12:28:26,849477,209,849436,998462,1,N,2023-04-28 12:28:26,0,0


In [70]:
polls_questionpiece.head(30)

Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
0,998458,1,2023-04-28 12:27:22,252,0
1,998459,1,2023-04-28 12:27:22,244,0
2,998460,1,2023-04-28 12:27:22,183,0
3,998461,1,2023-04-28 12:27:22,101,0
4,998462,1,2023-04-28 12:27:22,209,0
5,998463,1,2023-04-28 12:27:22,239,0
6,998464,1,2023-04-28 12:27:22,146,0
7,998465,1,2023-04-28 12:27:22,297,0
8,998466,1,2023-04-28 12:27:22,294,0
9,998467,1,2023-04-28 12:27:22,201,0


In [71]:
polls_questionpiece.id.nunique()

1265476

In [72]:
# polls_questionpiece csv 파일로 다운로드
polls_questionpiece.to_csv('polls_questionpiece.csv', index=False)


In [73]:
# created_at을 datetime으로 변환 (만약 아직 문자열이면)
polls_questionpiece['created_at'] = pd.to_datetime(polls_questionpiece['created_at'])

# 조건 필터링
filtered_df = polls_questionpiece[
    (polls_questionpiece['question_id'] == 119) &
    (polls_questionpiece['created_at'] >= '2023-04-28 13:10:00') &
    (polls_questionpiece['created_at'] <= '2023-04-28 14:00:00')
]

# 결과 확인
print(filtered_df)


           id  is_voted          created_at  question_id  is_skipped
709   1009127         1 2023-04-28 13:11:02          119           0
710   1009132         1 2023-04-28 13:11:02          119           0
873   1012290         1 2023-04-28 13:27:23          119           0
940   1013081         1 2023-04-28 13:30:03          119           0
1017  1013856         1 2023-04-28 13:34:00          119           0
1238  1018228         1 2023-04-28 13:54:32          119           0
1324  1019466         1 2023-04-28 13:59:05          119           0


In [74]:
polls_questionpiece.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1265476 entries, 0 to 1265475
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   id           1265476 non-null  int64         
 1   is_voted     1265476 non-null  int64         
 2   created_at   1265476 non-null  datetime64[ns]
 3   question_id  1265476 non-null  int64         
 4   is_skipped   1265476 non-null  int64         
dtypes: datetime64[ns](1), int64(4)
memory usage: 48.3 MB


In [75]:
# 조건에 맞는 행 필터링
unprocessed_df = polls_questionpiece[
    (polls_questionpiece['is_voted'] == 0) & 
    (polls_questionpiece['is_skipped'] == 0)
]

# 결과 확인
unprocessed_df.head()


Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
1084,1015518,0,2023-04-28 13:41:27,268,0
1085,1015519,0,2023-04-28 13:41:27,184,0
1086,1015520,0,2023-04-28 13:41:27,141,0
1087,1015521,0,2023-04-28 13:41:27,172,0
1088,1015522,0,2023-04-28 13:41:27,259,0


In [76]:
unprocessed_df.head(20)

Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
1084,1015518,0,2023-04-28 13:41:27,268,0
1085,1015519,0,2023-04-28 13:41:27,184,0
1086,1015520,0,2023-04-28 13:41:27,141,0
1087,1015521,0,2023-04-28 13:41:27,172,0
1088,1015522,0,2023-04-28 13:41:27,259,0
1089,1015523,0,2023-04-28 13:41:27,321,0
1090,1015524,0,2023-04-28 13:41:27,196,0
1091,1015525,0,2023-04-28 13:41:27,127,0
1092,1015526,0,2023-04-28 13:41:27,293,0
1093,1015527,0,2023-04-28 13:41:27,317,0


In [77]:
# 2023-04-28 14:19:21
unprocessed_df[unprocessed_df['created_at'] == '2023-04-28 14:19:21']

Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
1599,1023569,0,2023-04-28 14:19:21,303,0
1600,1023570,0,2023-04-28 14:19:21,214,0
1601,1023571,0,2023-04-28 14:19:21,234,0
1602,1023572,0,2023-04-28 14:19:21,191,0
1603,1023573,0,2023-04-28 14:19:21,321,0
1604,1023574,0,2023-04-28 14:19:21,144,0
1605,1023575,0,2023-04-28 14:19:21,153,0
1606,1023576,0,2023-04-28 14:19:21,156,0
1607,1023577,0,2023-04-28 14:19:21,112,0


In [78]:
unprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 46789 entries, 1084 to 1265475
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           46789 non-null  int64         
 1   is_voted     46789 non-null  int64         
 2   created_at   46789 non-null  datetime64[ns]
 3   question_id  46789 non-null  int64         
 4   is_skipped   46789 non-null  int64         
dtypes: datetime64[ns](1), int64(4)
memory usage: 2.1 MB


In [79]:
# is_voted == 0 and is_skipped == 0인 행 기준으로 그룹
created_at_counts = unprocessed_df['created_at'].value_counts().sort_index()

# 결과 출력
print(created_at_counts)


created_at
2023-04-28 13:41:27    10
2023-04-28 14:19:21     9
2023-04-29 04:43:25    10
2023-04-29 13:28:26    10
2023-04-29 13:59:53     6
2023-04-29 13:59:54     4
2023-04-29 16:50:41    10
2023-04-30 04:43:54    10
2023-04-30 05:33:39    10
2023-04-30 09:39:13     7
2023-04-30 10:36:28    10
2023-04-30 14:10:34    10
2023-04-30 14:49:42    10
2023-04-30 15:04:16     7
2023-04-30 15:38:28    10
2023-04-30 16:17:37    10
2023-05-01 04:20:12    10
2023-05-01 07:08:59    10
2023-05-01 11:47:20    10
2023-05-01 11:55:03    10
2023-05-02 05:17:11     6
2023-05-02 06:55:40    10
2023-05-02 10:25:07    10
2023-05-02 11:19:30    10
2023-05-02 12:09:44    10
2023-05-02 14:01:02    10
2023-05-02 14:58:00     9
2023-05-02 15:57:09    10
2023-05-02 16:40:58    10
2023-05-02 18:32:29     9
2023-05-02 22:37:05     3
2023-05-02 23:26:30     8
2023-05-03 00:39:04     9
2023-05-03 04:09:44     4
2023-05-03 06:32:19    10
2023-05-03 08:03:53    10
2023-05-03 09:15:31     6
2023-05-03 09:24:14    10
2

In [80]:
# created_at_counts를 DataFrame으로 변환
created_at_counts_df = created_at_counts.reset_index()
created_at_counts_df.columns = ['created_at', 'count']
# created_at을 datetime으로 변환
created_at_counts_df['created_at'] = pd.to_datetime(created_at_counts_df['created_at'])
# created_at을 기준으로 정렬
created_at_counts_df = created_at_counts_df.sort_values(by='created_at')

In [81]:
created_at_counts_df.head()

Unnamed: 0,created_at,count
0,2023-04-28 13:41:27,10
1,2023-04-28 14:19:21,9
2,2023-04-29 04:43:25,10
3,2023-04-29 13:28:26,10
4,2023-04-29 13:59:53,6


In [82]:
created_at_counts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5316 entries, 0 to 5315
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   created_at  5316 non-null   datetime64[ns]
 1   count       5316 non-null   int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 83.2 KB


In [83]:
created_at_counts_df['count'].value_counts()

count
10    3806
9      264
7      206
8      195
6      191
5      156
4      127
3      123
1      123
2      114
20       4
19       2
17       2
11       1
16       1
18       1
Name: count, dtype: int64

In [84]:
unprocessed = polls_questionpiece[
    (polls_questionpiece['is_voted'] == 1) & (polls_questionpiece['is_skipped'] == 1)
]
print(f"총 {len(unprocessed)}개의 이상 행이 존재합니다.")

총 1127개의 이상 행이 존재합니다.


### is_voted, is_skipped가 다 0인 경우?

In [85]:
polls_questionpiece.head()

Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
0,998458,1,2023-04-28 12:27:22,252,0
1,998459,1,2023-04-28 12:27:22,244,0
2,998460,1,2023-04-28 12:27:22,183,0
3,998461,1,2023-04-28 12:27:22,101,0
4,998462,1,2023-04-28 12:27:22,209,0


In [86]:
polls_questionreport.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51424 entries, 0 to 51423
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           51424 non-null  int64         
 1   reason       51424 non-null  object        
 2   created_at   51424 non-null  datetime64[ns]
 3   question_id  51424 non-null  int64         
 4   user_id      51424 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 2.0+ MB


In [87]:
polls_questionreport.head()

Unnamed: 0,id,reason,created_at,question_id,user_id
0,1,이 질문은 재미없어요,2023-04-19 06:20:35,250,837556
1,2,이 질문은 재미없어요,2023-04-19 06:58:09,113,837672
2,3,불쾌한 내용이 포함되어 있음,2023-04-19 06:58:17,113,837672
3,4,어떻게 이런 생각을? 이 질문 최고!,2023-04-19 08:12:42,119,837922
4,5,어떻게 이런 생각을? 이 질문 최고!,2023-04-19 08:12:50,119,837922


In [88]:
polls_question['id'].min(), polls_question['id'].max()

(np.int64(99), np.int64(5133))

In [89]:
polls_questionpiece['question_id'].min(), polls_questionpiece['question_id'].max()

(np.int64(99), np.int64(5133))

In [90]:
polls_questionreport['question_id'].min(), polls_questionreport['question_id'].max()

(np.int64(99), np.int64(5110))

In [91]:
# 조건에 맞는 행 필터링
unprocessed_df = polls_questionpiece[
    (polls_questionpiece['is_voted'] == 0) & 
    (polls_questionpiece['is_skipped'] == 0)
]

# 결과 확인
unprocessed_df.head()

Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
1084,1015518,0,2023-04-28 13:41:27,268,0
1085,1015519,0,2023-04-28 13:41:27,184,0
1086,1015520,0,2023-04-28 13:41:27,141,0
1087,1015521,0,2023-04-28 13:41:27,172,0
1088,1015522,0,2023-04-28 13:41:27,259,0


In [92]:
unprocessed_df.question_id.nunique(), unprocessed_df.id.nunique()

(4645, 46789)

In [93]:
unprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 46789 entries, 1084 to 1265475
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           46789 non-null  int64         
 1   is_voted     46789 non-null  int64         
 2   created_at   46789 non-null  datetime64[ns]
 3   question_id  46789 non-null  int64         
 4   is_skipped   46789 non-null  int64         
dtypes: datetime64[ns](1), int64(4)
memory usage: 2.1 MB


In [94]:
unprocessed_df.id.nunique()

46789

In [95]:
polls_questionreport.head()

Unnamed: 0,id,reason,created_at,question_id,user_id
0,1,이 질문은 재미없어요,2023-04-19 06:20:35,250,837556
1,2,이 질문은 재미없어요,2023-04-19 06:58:09,113,837672
2,3,불쾌한 내용이 포함되어 있음,2023-04-19 06:58:17,113,837672
3,4,어떻게 이런 생각을? 이 질문 최고!,2023-04-19 08:12:42,119,837922
4,5,어떻게 이런 생각을? 이 질문 최고!,2023-04-19 08:12:50,119,837922


In [96]:
# 신고된 질문 ID 목록
reported_q_ids = polls_questionreport['question_id'].unique()

# 신고된 질문 중 응답 없는 것 필터링
unprocessed_reported = unprocessed_df[unprocessed_df['question_id'].isin(reported_q_ids)]

print(f"응답 없이 신고된 질문 수: {len(unprocessed_reported)}")


응답 없이 신고된 질문 수: 40188


In [97]:
unprocessed_reported.head()

Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
1084,1015518,0,2023-04-28 13:41:27,268,0
1085,1015519,0,2023-04-28 13:41:27,184,0
1086,1015520,0,2023-04-28 13:41:27,141,0
1087,1015521,0,2023-04-28 13:41:27,172,0
1088,1015522,0,2023-04-28 13:41:27,259,0


In [98]:
unprocessed_reported.id.nunique()

40188

In [99]:
unprocessed_df['id'].nunique()

46789

In [100]:
# 교집합 개수
common_ids = set(unprocessed_reported['id']) & set(unprocessed_df['id'])

print(f"두 DataFrame 간 id 일치 개수: {len(common_ids)}")

두 DataFrame 간 id 일치 개수: 40188


In [101]:
unprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 46789 entries, 1084 to 1265475
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           46789 non-null  int64         
 1   is_voted     46789 non-null  int64         
 2   created_at   46789 non-null  datetime64[ns]
 3   question_id  46789 non-null  int64         
 4   is_skipped   46789 non-null  int64         
dtypes: datetime64[ns](1), int64(4)
memory usage: 2.1 MB


In [102]:
unprocessed_df.question_id.nunique()

4645

In [103]:
# 1. 신고된 question_id만 따로
reported_q_ids = polls_questionreport['question_id'].unique()

# 2. 신고 안 된 미응답만 추출
unprocessed_unreported = unprocessed_df[~unprocessed_df['question_id'].isin(reported_q_ids)]

print(f"신고도 안 되고 응답도 안 된 질문 수: {len(unprocessed_unreported)}")


신고도 안 되고 응답도 안 된 질문 수: 6601


### 나머지 6601건의 정체는 무엇인가

In [104]:
# 신고된 질문 ID 목록
reported_q_ids = polls_questionreport['question_id'].unique()

# 신고된 question_id를 포함하는 조각 중 미응답 조각
reported_unprocessed = polls_questionpiece[
    (polls_questionpiece['question_id'].isin(reported_q_ids)) &
    (polls_questionpiece['is_voted'] == 0) &
    (polls_questionpiece['is_skipped'] == 0)
]
print(f"신고된 질문 중 응답 없는 조각 수: {len(reported_unprocessed)}")

신고된 질문 중 응답 없는 조각 수: 40188


In [105]:
# questionset에서 questionpiece_id_list explode
polls_questionset["question_piece_id_list"] = polls_questionset["question_piece_id_list"].apply(ast.literal_eval)
qs_exploded = polls_questionset.explode("question_piece_id_list").rename(columns={"question_piece_id_list": "questionpiece_id"})
qs_exploded["questionpiece_id"] = qs_exploded["questionpiece_id"].astype(int)

# 미응답 신고 조각에 해당하는 질문 세트 ID 확보
reported_unprocessed = reported_unprocessed.merge(
    qs_exploded[["id", "user_id", "questionpiece_id"]],
    left_on="id", right_on="questionpiece_id", how="left"
)

In [106]:
reported_unprocessed.head()

Unnamed: 0,id_x,is_voted,created_at,question_id,is_skipped,id_y,user_id,questionpiece_id
0,1015518,0,2023-04-28 13:41:27,268,0,101523,850120,1015518
1,1015519,0,2023-04-28 13:41:27,184,0,101523,850120,1015519
2,1015520,0,2023-04-28 13:41:27,141,0,101523,850120,1015520
3,1015521,0,2023-04-28 13:41:27,172,0,101523,850120,1015521
4,1015522,0,2023-04-28 13:41:27,259,0,101523,850120,1015522


In [107]:
# 해당 세트의 다른 조각들 불러오기
related_sets = qs_exploded[qs_exploded["questionpiece_id"].isin(reported_unprocessed["id_y"])]


# 다른 조각의 응답 여부 확인 (polls_questionpiece와 다시 merge)
other_pieces = related_sets.merge(polls_questionpiece, left_on="questionpiece_id", right_on="id", how="left")

# 신고된 조각 이후 조각 중 정상 응답한 것만 필터
responded_after = other_pieces[
    (other_pieces["is_voted"] == 1) | (other_pieces["is_skipped"] == 1)
]
responded_after.head()

Unnamed: 0,id_x,questionpiece_id,opening_time,status,created_at_x,user_id,id_y,is_voted,created_at_y,question_id,is_skipped
0,100496,1005253,2023-04-28 13:42:21,F,2023-04-28 12:52:21,849443,1005253.0,1.0,2023-04-28 12:52:21,128.0,0.0
1,105265,1052942,2023-04-28 18:23:34,F,2023-04-28 17:33:34,850718,1052942.0,1.0,2023-04-28 17:33:34,312.0,0.0
2,105298,1053269,2023-04-28 18:29:19,F,2023-04-28 17:39:19,850412,1053269.0,1.0,2023-04-28 17:39:19,199.0,0.0
3,108676,1087050,2023-04-29 04:22:13,F,2023-04-29 03:32:13,850610,1087050.0,1.0,2023-04-29 03:32:13,285.0,0.0
4,111467,1114958,2023-04-29 07:12:38,F,2023-04-29 06:22:38,849559,1114958.0,1.0,2023-04-29 06:22:38,210.0,0.0


In [108]:
responded_after.info()

<class 'pandas.core.frame.DataFrame'>
Index: 61 entries, 0 to 74
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   id_x              61 non-null     int64         
 1   questionpiece_id  61 non-null     int64         
 2   opening_time      61 non-null     datetime64[ns]
 3   status            61 non-null     object        
 4   created_at_x      61 non-null     datetime64[ns]
 5   user_id           61 non-null     int64         
 6   id_y              61 non-null     float64       
 7   is_voted          61 non-null     float64       
 8   created_at_y      61 non-null     datetime64[ns]
 9   question_id       61 non-null     float64       
 10  is_skipped        61 non-null     float64       
dtypes: datetime64[ns](3), float64(4), int64(3), object(1)
memory usage: 5.7+ KB


In [109]:
polls_questionset.head()

Unnamed: 0,id,question_piece_id_list,opening_time,status,created_at,user_id
0,99817,"[998458, 998459, 998460, 998461, 998462, 99846...",2023-04-28 12:27:22,F,2023-04-28 12:27:23,849436
1,99830,"[998588, 998589, 998590, 998591, 998592, 99859...",2023-04-28 12:28:07,F,2023-04-28 12:28:07,849438
2,99840,"[998689, 998691, 998693, 998695, 998697, 99869...",2023-04-28 12:28:38,F,2023-04-28 12:28:38,847375
3,99841,"[998688, 998690, 998692, 998694, 998696, 99869...",2023-04-28 12:28:38,F,2023-04-28 12:28:38,849446
4,99848,"[998768, 998769, 998770, 998771, 998772, 99877...",2023-04-28 12:28:57,F,2023-04-28 12:28:57,849477


In [110]:
polls_questionset.to_csv('polls_questionset.csv', index=False)

In [111]:
polls_questionreport.to_csv('polls_questionreport.csv', index=False)

In [117]:
# 2. 문자열로 저장된 리스트를 실제 리스트로 변환 (안전하게)
def safe_eval(val):
    if isinstance(val, str):
        return ast.literal_eval(val)
    return val

polls_questionset['question_piece_id_list'] = polls_questionset['question_piece_id_list'].apply(safe_eval)

# 3. explode로 question_piece_id 리스트를 풀어냄
qs_exploded = polls_questionset.explode("question_piece_id_list").rename(columns={"question_piece_id_list": "questionpiece_id"})
qs_exploded["questionpiece_id"] = qs_exploded["questionpiece_id"].astype(int)

# 4. 조각 정보(polls_questionpiece)와 merge
merged = qs_exploded.merge(
    polls_questionpiece,
    left_on="questionpiece_id",
    right_on="id",
    suffixes=('_set', '_piece')
)

# 5. 응답 여부 계산: is_voted + is_skipped > 0이면 응답한 것
merged["is_answered"] = merged["is_voted"] + merged["is_skipped"]

# 6. 세트-유저 기준으로 응답 / 미응답 개수 집계
set_user_response = merged.groupby(["user_id", "id_set"]).agg(
    total_q=("questionpiece_id", "count"),
    unanswered_q=("is_answered", lambda x: (x == 0).sum()),
    answered_q=("is_answered", lambda x: (x > 0).sum())
).reset_index()

# 7. 중간 이탈로 추정되는 세트 필터링
partial_quits = set_user_response[
    (set_user_response["answered_q"] > 0) & 
    (set_user_response["unanswered_q"] > 0)
]

# 결과 확인
print(partial_quits.head(10))

      user_id    id_set  total_q  unanswered_q  answered_q
30     838642    102328       10             9           1
457    844453  18476506       10             9           1
486    844462  19350951       10             9           1
546    845238  19103167       10             8           2
1112   849444    347715       10             9           1
1190   849446   6655675       10             7           3
1578   849475  16285144       10             4           6
1902   849491  15463625       10             7           3
1973   849497    359574       10             8           2
2303   849528   1005253       10             4           6


In [118]:
partial_quits.info()

<class 'pandas.core.frame.DataFrame'>
Index: 828 entries, 30 to 156532
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   user_id       828 non-null    int64
 1   id_set        828 non-null    int64
 2   total_q       828 non-null    int64
 3   unanswered_q  828 non-null    int64
 4   answered_q    828 non-null    int64
dtypes: int64(5)
memory usage: 38.8 KB


In [119]:
partial_quits.to_csv('partial_quits.csv', index=False)