# Library

In [1]:
import pandas as pd
import numpy as np
from dataset import Dataset
from dataread import Dataread
import seaborn as sns
import matplotlib.pyplot as plt
import os
import fsspec

sns.set_style("whitegrid")
plt.rcParams['font.family'] = 'NanumGothic'
plt.rcParams['axes.unicode_minus'] = False


ds = Dataset()
dr = Dataread()
JSON_KEY_PATH = "./sprintda05-soomin.json"
os.environ["GOOGlE_APPLICATION_CREDENTIALS"] = JSON_KEY_PATH
fs = fsspec.filesystem("gcs")

# 열람건수 비율

In [2]:
# 기본 데이터
record = (
    ds.votes.accounts_userquestionrecord()
    .query('created_at < "2023-08-01"')
    .query('created_at >= "2023-04-28"')
)

master_table = pd.read_csv('./integrated_master_table.csv')

has_read_table = record[['chosen_user_id','has_read','id']].copy()

user_table = master_table[['user_id','retention_day8']].copy()

# 투표 받은 수 카운트
has_read_table['total_voted'] =  (
    has_read_table
    .groupby(['chosen_user_id'])['id']
    .transform('count')
)

# 읽은 수 카운트
has_read_table['total_read'] = (
    has_read_table
    .groupby(['chosen_user_id'])['has_read']
    .transform('sum')
)
# 컬럼 정리
has_read_table = has_read_table[['chosen_user_id','total_voted','total_read']].rename(columns={'chosen_user_id':'user_id'}).drop_duplicates(subset='user_id')

# 유저 마스터 테이블과 합치기
user_table = user_table.merge(
    has_read_table,
    on='user_id',
    how='left'
)

In [None]:
has_read_table.to_csv('./dataset/total_voted')

Unnamed: 0,user_id,total_voted,total_read
0,849469,245,23
1,849446,505,42
2,849454,447,446
3,847375,832,52
4,849477,290,288
...,...,...,...
1211712,1460785,1,0
1211979,1093370,1,0
1212010,1575359,1,0
1212013,1512737,1,1


# 가입 후 n일 이내 결제 유무

In [5]:
ds.votes.file_list()

['accounts_attendance',
 'accounts_blockrecord',
 'accounts_failpaymenthistory',
 'accounts_friendrequest',
 'accounts_group',
 'accounts_nearbyschool',
 'accounts_paymenthistory',
 'accounts_pointhistory',
 'accounts_school',
 'accounts_timelinereport',
 'accounts_user',
 'accounts_user_contacts',
 'accounts_userquestionrecord',
 'accounts_userwithdraw',
 'event_receipts',
 'events',
 'polls_question',
 'polls_questionpiece',
 'polls_questionreport',
 'polls_questionset',
 'polls_usercandidate']

In [3]:
df = (ds.votes.accounts_userquestionrecord()
    .query("created_at < '2023-08-01' and created_at >= '2023-04-28'")
    .assign(day = lambda df: pd.to_datetime(df['created_at']).dt.floor('D'))
    )          
df                                

Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times,day
0,771777,C,2023-04-28 12:27:49,849469,252,849436,998458,0,N,2023-04-28 12:27:49,0,0,2023-04-28
1,771800,C,2023-04-28 12:28:02,849446,244,849436,998459,0,N,2023-04-28 12:28:02,0,0,2023-04-28
2,771812,C,2023-04-28 12:28:09,849454,183,849436,998460,1,N,2023-04-28 12:28:09,0,0,2023-04-28
3,771828,C,2023-04-28 12:28:16,847375,101,849436,998461,0,N,2023-04-28 12:28:16,0,0,2023-04-28
4,771851,C,2023-04-28 12:28:26,849477,209,849436,998462,1,N,2023-04-28 12:28:26,0,0,2023-04-28
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1212211,160262886,C,2023-07-31 16:06:08,859802,1653,866250,187023487,0,N,2023-07-31 16:06:08,0,0,2023-07-31
1212212,160262893,C,2023-07-31 16:06:14,850774,1967,866250,187023488,0,N,2023-07-31 16:06:14,0,0,2023-07-31
1212213,160262898,C,2023-07-31 16:06:19,1205810,2948,866250,187023489,0,N,2023-07-31 16:06:19,0,0,2023-07-31
1212214,160262908,I,2023-07-31 16:06:28,865846,2253,866250,187023490,1,N,2023-07-31 16:06:28,0,2,2023-07-31


In [4]:
# 최초 날짜 로직
df_sorted = df.sort_values(by=['user_id','day'])
df_sorted_one = df_sorted.drop_duplicates(subset='user_id', keep='first')[['user_id','created_at']]
df_sorted_one

Unnamed: 0,user_id,created_at
12411,838023,2023-04-29 16:22:56
55708,838466,2023-05-02 22:42:05
903,838642,2023-04-28 14:18:07
12717,839357,2023-04-29 16:51:01
57398,840293,2023-05-03 02:24:05
...,...,...
1207929,1577436,2023-07-05 06:29:21
1206386,1577437,2023-06-29 08:07:15
1206421,1577440,2023-06-29 09:03:48
1209331,1578095,2023-07-11 13:29:13


In [5]:
user_list = df.user_id.unique()
user_df = pd.DataFrame(user_list, columns=['user_id'])
user_df

Unnamed: 0,user_id
0,849436
1,847375
2,849438
3,849479
4,849441
...,...
4840,1577440
4841,1577436
4842,857659
4843,1578095


In [6]:
user_created = (
    ds.votes.accounts_user()
    .query("id in @user_list")
)

In [7]:
# 포인트 사용 테이블 읽어오기
point_history_df = ds.votes.accounts_pointhistory().loc[ds.votes.accounts_pointhistory()['delta_point'] < 0]
point_history_df

Unnamed: 0,id,delta_point,created_at,user_id,user_question_record_id
1030,808783,-300,2023-04-28 14:31:21,849479,773997.0
1036,808861,-300,2023-04-28 14:31:49,849452,788254.0
1110,810295,-300,2023-04-28 14:41:47,849762,787307.0
1241,812641,-300,2023-04-28 14:55:08,849670,782368.0
1443,815628,-300,2023-04-28 15:15:43,849439,790323.0
...,...,...,...,...,...
2338838,340661110,-500,2024-04-30 10:33:03,1209776,70382111.0
2338839,340661111,-1000,2024-04-30 10:33:13,1209776,70382111.0
2338870,340668595,-200,2024-05-05 14:06:53,1251933,89164250.0
2338906,340671283,-500,2024-05-07 02:26:58,851491,3774736.0


In [8]:
point_history_df = (
    point_history_df
    .sort_values(by=['user_id','created_at'])
    .drop_duplicates(subset='user_id')
)
point_history_df

Unnamed: 0,id,delta_point,created_at,user_id,user_question_record_id
27111,1360127,-300,2023-04-30 14:27:35,838023,1281323.0
170410,5055264,-300,2023-05-05 07:32:34,838466,3524303.0
19162,1182553,-300,2023-04-30 06:49:01,838642,1068384.0
16113,1107427,-300,2023-04-30 02:19:41,839357,1035633.0
210125,5596643,-300,2023-05-05 14:49:11,840293,3798163.0
...,...,...,...,...,...
2272250,318278139,-200,2023-06-08 08:31:10,1571821,151205021.0
2333926,339377675,-200,2023-09-20 13:12:05,1576039,161095445.0
2302626,328376445,-200,2023-06-17 11:04:28,1576107,155931559.0
2314798,332869984,-200,2023-06-30 06:11:37,1576705,158223837.0


In [9]:
# 지표 dataframe 생성

# 회원가입한 날짜와 연결
working_merge = pd.merge(
    user_df,
    user_created[['id','created_at']],
    left_on='user_id',
    right_on='id'
).drop(columns='id')

working_merge = working_merge.rename(columns={'created_at':'sign_up_date'})

# 처음 투표한 날짜와 연결
working_merge = pd.merge(
    working_merge,
    df_sorted_one,
    on = 'user_id'
)
working_merge = working_merge.rename(columns={'created_at':'first_vote_date'})

# 처음 포인트 쓴 날과 양 연결
working_merge = pd.merge(
    working_merge,
    point_history_df[['user_id','created_at','delta_point']],
    on = 'user_id',
    how='left'
)
working_merge = working_merge.rename(columns={'created_at':'first_point_use_date', 'delta_point':'first_point_use_amount'})


# 중간 결과물
working_merge

Unnamed: 0,user_id,sign_up_date,first_vote_date,first_point_use_date,first_point_use_amount
0,849436,2023-04-28 02:54:59.981972,2023-04-28 12:27:49,2023-04-28 17:44:51,-300.0
1,847375,2023-04-26 11:32:36.470049,2023-04-28 12:28:46,2023-04-29 09:09:36,-300.0
2,849438,2023-04-28 02:56:02.571340,2023-04-28 12:29:29,2023-04-30 13:37:52,-300.0
3,849479,2023-04-28 03:27:32.406440,2023-04-28 12:29:52,2023-04-28 14:31:21,-300.0
4,849441,2023-04-28 02:56:56.323854,2023-04-28 12:30:06,NaT,
...,...,...,...,...,...
4840,1577440,2023-06-29 08:36:52.982230,2023-06-29 09:03:48,2023-06-30 05:36:03,-200.0
4841,1577436,2023-06-29 08:03:16.561688,2023-07-05 06:29:21,NaT,
4842,857659,2023-04-30 07:48:39.390053,2023-07-11 09:11:26,2023-04-30 09:50:19,-300.0
4843,1578095,2023-07-11 13:25:58.942235,2023-07-11 13:29:13,NaT,


In [10]:
# 날짜형으로 변환 (혹시 이미 datetime이면 생략 가능)
working_merge['sign_up_date'] = pd.to_datetime(working_merge['sign_up_date'])
working_merge['first_vote_date'] = pd.to_datetime(working_merge['first_vote_date'])
working_merge['first_point_use_date'] = pd.to_datetime(working_merge['first_point_use_date'])

# 가입한 당일 투표했는지 여부
working_merge['vote_sameday_signup'] = (
    (working_merge['first_vote_date'].dt.floor('D') == working_merge['sign_up_date'].dt.floor('D'))
    & working_merge['first_vote_date'].notnull()
).astype(int)

# 가입한지 3일 이내 포인트 사용했는지 여부
working_merge['point_used_within_signup_3days'] = (
    ((working_merge['first_point_use_date'] - working_merge['sign_up_date']).dt.days <= 3)
    & working_merge['first_point_use_date'].notnull()
).astype(int)

In [11]:
working_merge

Unnamed: 0,user_id,sign_up_date,first_vote_date,first_point_use_date,first_point_use_amount,vote_sameday_signup,point_used_within_signup_3days
0,849436,2023-04-28 02:54:59.981972,2023-04-28 12:27:49,2023-04-28 17:44:51,-300.0,1,1
1,847375,2023-04-26 11:32:36.470049,2023-04-28 12:28:46,2023-04-29 09:09:36,-300.0,0,1
2,849438,2023-04-28 02:56:02.571340,2023-04-28 12:29:29,2023-04-30 13:37:52,-300.0,1,1
3,849479,2023-04-28 03:27:32.406440,2023-04-28 12:29:52,2023-04-28 14:31:21,-300.0,1,1
4,849441,2023-04-28 02:56:56.323854,2023-04-28 12:30:06,NaT,,1,0
...,...,...,...,...,...,...,...
4840,1577440,2023-06-29 08:36:52.982230,2023-06-29 09:03:48,2023-06-30 05:36:03,-200.0,1,1
4841,1577436,2023-06-29 08:03:16.561688,2023-07-05 06:29:21,NaT,,0,0
4842,857659,2023-04-30 07:48:39.390053,2023-07-11 09:11:26,2023-04-30 09:50:19,-300.0,0,1
4843,1578095,2023-07-11 13:25:58.942235,2023-07-11 13:29:13,NaT,,1,0


In [12]:
# 저장
working_merge.to_csv('./dataset/use_point_within_signup_N_days.csv', encoding='UTF-8', index=False)

# 투표 등장 대비 선택받은 비율

In [None]:
# 후보군 데이터프레임 불러오기
user_candidate_df = (
    ds.votes.polls_usercandidate()
    .query("created_at < '2023-08-01'")
)

In [55]:
user_candidate_df

Unnamed: 0,id,created_at,question_piece_id,user_id
0,3088872,2023-04-28 12:27:49,998458,849444
1,3088873,2023-04-28 12:27:49,998458,849454
2,3088874,2023-04-28 12:27:49,998458,849460
3,3088875,2023-04-28 12:27:49,998458,849469
4,3088964,2023-04-28 12:28:02,998459,849446
...,...,...,...,...
3973521,450451719,2023-05-25 23:59:51,143571051,1202769
3973522,450452696,2023-05-25 23:59:59,143571052,1059254
3973523,450452697,2023-05-25 23:59:59,143571052,1105625
3973524,450452698,2023-05-25 23:59:59,143571052,1161706


In [28]:
# 어라 왜? 한 질문에 여러개가 있지..?
user_candidate_df.groupby('question_piece_id').size().reset_index(name='count')['count'].unique()

array([ 4,  3,  2,  8,  1,  6, 16, 12])

In [29]:
temp = user_candidate_df.groupby('question_piece_id').size().reset_index(name='count')
verify_list = temp.loc[temp['count'] != 4].question_piece_id.unique()

In [30]:
question = ds.votes.polls_questionpiece()
record = ds.votes.accounts_userquestionrecord()

In [None]:
i = 0
exclude_qusetion = []

for question_piece_id in verify_list:
    temp = user_candidate_df.loc[user_candidate_df['question_piece_id'] == question_piece_id].copy()
    # 후보군이 넷 이상이면 상위 4개만 사용
    if temp.shape[0] > 4:
        temp['rank_in_group'] = temp.groupby('question_piece_id').cumcount()
        temp_top4 = temp[temp['rank_in_group'] < 4].copy()
        temp_top4.drop(columns='rank_in_group', inplace=True)
        
        # 상위 4개의 유저 리스트
        check_user = temp_top4['user_id'].unique()
    # 후보군이 넷 미만인 경우, 그대로 사용
    else:
        check_user = temp['user_id'].unique()

    chosen_user = record.loc[record['question_piece_id']== question_piece_id]['chosen_user_id']

    if chosen_user.isin(check_user).any() == True:
        i+=1
        if i % 1000 == 0:
            print('Running Successfully by 1000')
        continue
    else:
        print('Error')
        print(question_piece_id)
        exclude_qusetion.append(question_piece_id)
        continue

print('Loop End')

In [None]:
# 시각적 검증
for i in exclude_qusetion:
    check_num = i

    display(user_candidate_df.loc[user_candidate_df['question_piece_id'] == check_num])
    display(question.loc[question['id']== check_num])
    display(record.loc[record['question_piece_id']== check_num])
    print('='*50)
    print('='*50)


Unnamed: 0,id,created_at,question_piece_id,user_id
324590,11813124,2023-05-04 02:32:53,3795300,873573
324591,11813127,2023-05-04 02:32:53,3795300,877367


Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
89339,3795300,1,2023-05-04 01:40:18,232,0


Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times




Unnamed: 0,id,created_at,question_piece_id,user_id
370451,12708253,2023-05-04 11:58:24,4098458,865079
370452,12708255,2023-05-04 11:58:24,4098458,869353
370453,12708256,2023-05-04 11:58:24,4098458,878751


Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
102548,4098458,1,2023-05-04 11:58:15,329,0


Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times




Unnamed: 0,id,created_at,question_piece_id,user_id
2454796,219979072,2023-05-16 04:41:27,17269961,873680
2454797,219979074,2023-05-16 04:41:27,17269961,874872
2454798,219979075,2023-05-16 04:41:27,17269961,879918


Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
302269,17269961,1,2023-05-09 10:18:23,548,0


Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times




Unnamed: 0,id,created_at,question_piece_id,user_id
2451597,219721616,2023-05-16 04:18:51,29763011,849490
2451598,219721617,2023-05-16 04:18:51,29763011,849543
2451599,219721618,2023-05-16 04:18:51,29763011,850728


Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
364348,29763011,1,2023-05-11 09:01:32,203,0


Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times




Unnamed: 0,id,created_at,question_piece_id,user_id
2724272,254441953,2023-05-17 08:21:49,52789431,1023048
2724273,254441954,2023-05-17 08:21:49,52789431,1025949
2724274,254441955,2023-05-17 08:21:49,52789431,1037542


Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
510485,52789431,1,2023-05-14 00:24:06,338,0


Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times




Unnamed: 0,id,created_at,question_piece_id,user_id
2503852,226061979,2023-05-16 08:59:48,52930048,1130903
2503853,226061980,2023-05-16 08:59:48,52930048,1167140
2503854,226061982,2023-05-16 08:59:48,52930048,1311277


Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
511447,52930048,1,2023-05-14 00:46:44,115,0


Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times




Unnamed: 0,id,created_at,question_piece_id,user_id
3313598,339845167,2023-05-20 15:42:57,53475295,877667
3313599,339845168,2023-05-20 15:42:57,53475295,907463
3313600,339845169,2023-05-20 15:42:57,53475295,953051


Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
515756,53475295,1,2023-05-14 02:01:57,643,0


Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times




Unnamed: 0,id,created_at,question_piece_id,user_id
2891207,277603879,2023-05-18 05:06:00,64314852,871079
2891208,277603880,2023-05-18 05:06:00,64314852,1143616
2891209,277603881,2023-05-18 05:06:00,64314852,1155868


Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
602961,64314852,1,2023-05-15 06:41:45,264,0


Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times




Unnamed: 0,id,created_at,question_piece_id,user_id
3431996,358129099,2023-05-21 11:29:39,75060144,885305
3431997,358129100,2023-05-21 11:29:39,75060144,891498
3431998,358129101,2023-05-21 11:29:39,75060144,990101


Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
680353,75060144,1,2023-05-16 11:10:34,616,0


Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times




Unnamed: 0,id,created_at,question_piece_id,user_id
3382774,350402902,2023-05-21 06:01:13,77297531,866364
3382775,350402903,2023-05-21 06:01:13,77297531,876170
3382776,350402904,2023-05-21 06:01:13,77297531,908877


Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
693232,77297531,1,2023-05-16 14:06:46,862,0


Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times




Unnamed: 0,id,created_at,question_piece_id,user_id
2981671,290320959,2023-05-18 14:01:09,78976279,1178657
2981672,290320961,2023-05-18 14:01:09,78976279,1185828
2981673,290320962,2023-05-18 14:01:09,78976279,1208621


Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
705293,78976279,1,2023-05-16 21:58:50,1041,0


Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times




Unnamed: 0,id,created_at,question_piece_id,user_id
3545476,375406845,2023-05-22 08:10:21,111324988,1147719
3545477,375406847,2023-05-22 08:10:21,111324988,1284861
3545478,375406848,2023-05-22 08:10:21,111324988,1329106


Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
894412,111324988,1,2023-05-21 03:19:41,782,0


Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times




Unnamed: 0,id,created_at,question_piece_id,user_id
3687720,397608528,2023-05-23 09:26:14,127560840,1365270
3687721,397608530,2023-05-23 09:26:14,127560840,1370443
3687722,397608531,2023-05-23 09:26:14,127560840,1384004


Unnamed: 0,id,is_voted,created_at,question_id,is_skipped
981885,127560840,1,2023-05-23 09:24:53,265,0


Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times




In [None]:
# 유효한 질문 리스트
verify_list = verify_list[~np.isin(verify_list, exclude_qusetion)]

In [None]:
# 질문 목록 저장
with open('./dataset/verify_question.txt', 'w') as f:
    for item in verify_list:
        f.write(f"{item}\n")

In [None]:
# 제외 질문 id 저장
with open('./dataset/error_of_question_piece_id.txt', 'w') as f:
    for item in exclude_qusetion:
        f.write(f"{item}\n")

In [None]:
user_candidate_df.groupby('question_piece_id').cumcount()

0          0
1          1
2          2
3          3
4          0
          ..
3973521    3
3973522    0
3973523    1
3973524    2
3973525    3
Length: 3973526, dtype: int64

In [None]:
piece_counts = user_candidate_df['question_piece_id'].value_counts()
over4_ids = piece_counts[piece_counts > 4].index

# 4개미만인 경우 그대로, 4개 초과인 경우 테이블 적재 기준 상위 4개만 남겨놓기
user_candidate_df['rank_in_group'] = user_candidate_df.groupby('question_piece_id').cumcount()

user_candidate_df_top4 = user_candidate_df[
    (~user_candidate_df['question_piece_id'].isin(over4_ids)) | 
    ((user_candidate_df['question_piece_id'].isin(over4_ids)) & (user_candidate_df['rank_in_group'] < 4))
].copy()

# 유효한 질문 리스트
user_candidate_df_top4 = user_candidate_df_top4.loc[user_candidate_df_top4['question_piece_id'].isin(verify_list)]
user_candidate_df_top4

Unnamed: 0,id,created_at,question_piece_id,user_id,rank_in_group
24,3089341,2023-04-28 12:28:46,998689,849486,0
25,3089342,2023-04-28 12:28:46,998689,849682,1
26,3089343,2023-04-28 12:28:46,998689,849762,2
51,3089640,2023-04-28 12:29:30,998695,849445,0
52,3089641,2023-04-28 12:29:30,998695,849446,1
...,...,...,...,...,...
3973497,450442101,2023-05-25 23:58:39,142402246,1456891,1
3973498,450442103,2023-05-25 23:58:39,142402246,1503838,2
3973507,450447872,2023-05-25 23:59:21,142402269,1449191,0
3973508,450447873,2023-05-25 23:59:21,142402269,1466917,1


In [None]:
grouped_user_candidate_df = user_candidate_df_top4.groupby('user_id')['id'].size().reset_index(name='count')
grouped_user_candidate_df

Unnamed: 0,user_id,count
0,833113,2
1,833154,1
2,833203,4
3,833294,5
4,833424,1
...,...,...
10577,1508511,1
10578,1508709,13
10579,1508952,3
10580,1509934,1


In [45]:
verify_df = pd.read_csv('./dataset/not_included/verify_question.txt')
verify_df.loc[len(verify_df)] = 998595
verify_df = verify_df.rename(columns={'998595':'question_id'})

In [46]:
verify_list = verify_df.question_id.unique()

In [50]:
record = (
    ds.votes.accounts_userquestionrecord()
    .query('created_at <= "2023-05-25"')
)
record

Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times
0,771777,C,2023-04-28 12:27:49,849469,252,849436,998458,0,N,2023-04-28 12:27:49,0,0
1,771800,C,2023-04-28 12:28:02,849446,244,849436,998459,0,N,2023-04-28 12:28:02,0,0
2,771812,C,2023-04-28 12:28:09,849454,183,849436,998460,1,N,2023-04-28 12:28:09,0,0
3,771828,C,2023-04-28 12:28:16,847375,101,849436,998461,0,N,2023-04-28 12:28:16,0,0
4,771851,C,2023-04-28 12:28:26,849477,209,849436,998462,1,N,2023-04-28 12:28:26,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
993402,107819338,C,2023-05-24 23:59:18,1462945,1304,1449592,137740310,0,N,2023-05-24 23:59:17,0,0
993403,107819707,C,2023-05-24 23:59:27,1011262,897,1113651,136446171,1,N,2023-05-24 23:59:27,0,0
993404,107819940,C,2023-05-24 23:59:35,1391972,579,1372985,134636435,0,N,2023-05-24 23:59:35,0,0
993405,107820254,C,2023-05-24 23:59:44,1385215,1644,1372985,134636437,0,N,2023-05-24 23:59:44,0,0


In [56]:
record.loc[record['question_piece_id'].isin(verify_list)]

Unnamed: 0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times
6,771894,C,2023-04-28 12:28:46,849682,247,847375,998689,0,N,2023-04-28 12:28:46,0,0
13,771969,C,2023-04-28 12:29:30,849446,133,847375,998695,0,N,2023-04-28 12:29:30,0,0
15,771981,C,2023-04-28 12:29:38,849864,152,847375,998697,0,N,2023-04-28 12:29:38,0,0
27,772057,C,2023-04-28 12:30:21,849455,138,849479,998941,0,N,2023-04-28 12:30:21,0,0
30,772098,C,2023-04-28 12:30:38,849566,274,849479,998943,0,N,2023-04-28 12:30:38,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
993375,107808504,C,2023-05-24 23:54:09,1446883,904,1223180,137711106,1,N,2023-05-24 23:54:09,0,0
993381,107809386,C,2023-05-24 23:54:32,855230,152,1223180,137711113,1,N,2023-05-24 23:54:32,0,0
993391,107810858,C,2023-05-24 23:55:11,1201068,166,1127911,135756612,1,A,2023-05-25 00:58:32,0,0
993393,107811375,C,2023-05-24 23:55:25,1040582,907,1111941,128649169,1,N,2023-05-24 23:55:25,0,0


In [None]:
grouped_record = record.loc[record['question_piece_id'].isin(verify_list)].groupby('chosen_user_id').size().reset_index(name='chosen_count')
grouped_record

Unnamed: 0,chosen_user_id,chosen_count
0,833203,2
1,833525,6
2,834485,4
3,835872,1
4,837235,4
...,...,...
7898,1504849,1
7899,1505477,1
7900,1505833,1
7901,1508709,8


In [None]:
# 병합
working_merge = pd.merge(
    grouped_user_candidate_df,
    grouped_record,
    left_on='user_id',
    right_on='chosen_user_id',
    how='outer'
)
# 비율 계산
working_merge['ratio'] = working_merge['chosen_count'] / working_merge['count']

# NAN값 중 count만 0으로 저장
working_merge[['count','chosen_count']] = working_merge[['count','chosen_count']].fillna(0)

# 컬럼 이름을 명확하게 구분
working_merge = working_merge.rename(columns={'count':'candidate_count'})
working_merge

Unnamed: 0,user_id,candidate_count,chosen_user_id,chosen_count,ratio
0,833113,2,,0.0,
1,833154,1,,0.0,
2,833203,4,833203.0,2.0,0.500000
3,833294,5,,0.0,
4,833424,1,,0.0,
...,...,...,...,...,...
10577,1508511,1,,0.0,
10578,1508709,13,1508709.0,8.0,0.615385
10579,1508952,3,1508952.0,2.0,0.666667
10580,1509934,1,,0.0,


In [None]:
working_merge.isna().sum()

user_id               0
candidate_count       0
chosen_user_id     2679
chosen_count          0
ratio              2679
dtype: int64

In [None]:
# 저장
working_merge.to_csv('./dataset/candidate_chosen_ratio.csv', index=False, encoding='UTF-8')

# 유저가 처음 투표한 시각