In [2]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.pool import NullPool
import matplotlib as plt
import matplotlib.pylab as plt
import seaborn as sns
from tqdm import tqdm
from PIL import Image
import requests
from io import BytesIO



### 2021년 2월 15일 하루 동안의 키워드별 광고 CTR

In [19]:
# Per-keyword ad metrics for 2021-02-15 on the page whose page_id is '검색결과'.
# The query left-joins ad impressions to ad clicks on imp_id and groups by the
# trimmed search term (ref_term), producing per keyword:
#   imp_pcnt  - number of distinct ad_set_product_id values impressed
#   imp_cnt   - number of impression rows (after the join)
#   click_cnt - number of impression rows that matched a click row
#   ctr       - click_cnt / imp_cnt, with imp_cnt cast to float
# NOTE(review): the ctr expression references the click_cnt/imp_cnt aliases
# defined in the same SELECT list; this relies on the target database allowing
# that - confirm if the query is ever moved to another engine.
# NOTE(review): if one imp_id can appear in several click rows, the join would
# duplicate impression rows and inflate imp_cnt - TODO confirm imp_id is
# unique in the click table.
# NOTE(review): `bun_dw` (the DB connection) is not created in any visible
# cell; it must already exist in the kernel for this cell to run.
q = """
SELECT i.ref_term,
       count(DISTINCT i.ad_set_product_id) AS imp_pcnt,
       count(i.imp_id) AS imp_cnt,
       count(c.imp_id) AS click_cnt,
       click_cnt/imp_cnt::float AS ctr
FROM
  (SELECT imp_id,
          ad_set_product_id,
          trim(ref_term) AS ref_term
   FROM bun_log_db.api_event_type_impression_ad
   WHERE YEAR||MONTH||DAY = '20210215'
     AND page_id = '검색결과') i
LEFT JOIN
  (SELECT imp_id
   FROM bun_log_db.api_event_type_click_ad
   WHERE YEAR||MONTH||DAY = '20210215') c ON i.imp_id = c.imp_id
GROUP BY 1
"""
# Run the query and load the result into a DataFrame (one row per ref_term group).
daily = pd.read_sql(q, con=bun_dw)

In [21]:
# Save the per-keyword ad metrics to a CSV at a relative path, omitting the
# DataFrame index.
daily.to_csv("daily_keyword_ctr.csv", index=False)

In [22]:
# (rows, columns) of the query result; the query groups by ref_term, so each
# row corresponds to one keyword group.
daily.shape

(124677, 5)

In [26]:
# Summary statistics of the per-keyword ad metrics.
daily.describe()

Unnamed: 0,imp_pcnt,imp_cnt,click_cnt,ctr
count,124677.0,124677.0,124677.0,124677.0
mean,7.426157,26.727953,0.651227,0.032035
std,13.100521,306.223441,9.148504,0.133377
min,1.0,1.0,0.0,0.0
25%,1.0,2.0,0.0,0.0
50%,3.0,4.0,0.0,0.0
75%,8.0,11.0,0.0,0.0
max,429.0,40064.0,1264.0,1.0


In [25]:
# Upper-tail quantiles (80th to 99th percentile) of the numeric keyword metrics.
# `daily` also carries the string column `ref_term`; numeric_only=True keeps
# this call working on pandas >= 2.0, where non-numeric columns are no longer
# dropped implicitly, and yields the same table on older versions.
daily.quantile([0.8, 0.9, 0.95, 0.99], numeric_only=True)

Unnamed: 0,imp_pcnt,imp_cnt,click_cnt,ctr
0.8,10.0,15.0,0.0,0.0
0.9,18.0,31.0,1.0,0.047619
0.95,30.0,58.0,2.0,0.166667
0.99,62.0,304.0,9.0,1.0


In [35]:
# Same impression/click join as the per-keyword ad query, but grouped by
# (ref_term, ad_set_product_id): one row per keyword/ad-product pair with
#   imp_cnt   - number of impression rows (after the join)
#   click_cnt - number of impression rows that matched a click row
#   ctr       - click_cnt / imp_cnt, with imp_cnt cast to float
# NOTE(review): as in the per-keyword query, multiple click rows for one
# imp_id would inflate imp_cnt - TODO confirm imp_id is unique in the click
# table.
q = '''
SELECT i.ref_term,
       i.ad_set_product_id,
       count(i.imp_id) AS imp_cnt,
       count(c.imp_id) AS click_cnt,
       click_cnt/imp_cnt::float AS ctr
FROM
  (SELECT imp_id,
          ad_set_product_id,
          trim(ref_term) AS ref_term
   FROM bun_log_db.api_event_type_impression_ad
   WHERE YEAR||MONTH||DAY = '20210215'
     AND page_id = '검색결과') i
LEFT JOIN
  (SELECT imp_id
   FROM bun_log_db.api_event_type_click_ad
   WHERE YEAR||MONTH||DAY = '20210215') c ON i.imp_id = c.imp_id
GROUP BY 1, 2
'''

# Load the keyword/product-level CTR table.
# NOTE(review): `bun_dw` is not created in any visible cell.
keyword_product_ctr = pd.read_sql(q, con=bun_dw)

In [39]:
# Combine two CTR views per keyword:
#   keyword_ctr              - pooled CTR over all impressions of the keyword
#   mean_keyword_product_ctr - unweighted mean of the per-product CTRs
# Both inputs expose a column called `ctr`, so each is renamed explicitly
# before merging instead of relying on the positional order of the suffixed
# `ctr_x` / `ctr_y` columns a plain merge would produce.
mean_product_ctr = (
    keyword_product_ctr
    .groupby('ref_term', as_index=False)['ctr']
    .mean()
    .rename(columns={'ctr': 'mean_keyword_product_ctr'})
)

# Inner join on the keyword. validate='one_to_one' fails loudly if either side
# ever contains a repeated ref_term, which would otherwise silently multiply rows.
# NOTE(review): groupby drops missing keys by default, so a keyword whose
# ref_term is NULL (if any) does not survive this join - presumably the reason
# `df` ends up one row shorter than `daily`; confirm.
df = pd.merge(
    daily.rename(columns={'ctr': 'keyword_ctr'}),
    mean_product_ctr,
    on='ref_term',
    validate='one_to_one',
)

In [44]:
# Save the combined keyword-level CTR table to a CSV at a relative path,
# omitting the DataFrame index.
df.to_csv("keyword_ctr.csv", index=False)

In [41]:
# Keywords in the top 5% by impression count vs. all remaining keywords.
# The cutoff is the 95th percentile of imp_cnt; rows equal to the cutoff go
# into `top`.
imp_cnt_cutoff = df['imp_cnt'].quantile(0.95)
top = df.loc[df['imp_cnt'] >= imp_cnt_cutoff]
no_top = df.loc[df['imp_cnt'] < imp_cnt_cutoff]

In [43]:
# Summary statistics for the current `top` subset.
top.describe()

Unnamed: 0,imp_pcnt,imp_cnt,click_cnt,keyword_ctr,mean_keyword_product_ctr
count,6365.0,6365.0,6365.0,6365.0,6365.0
mean,37.657816,375.935271,9.519246,0.022203,0.022499
std,35.525809,1306.36211,39.364848,0.031464,0.033638
min,1.0,58.0,0.0,0.0,0.0
25%,10.0,75.0,0.0,0.0,0.0
50%,29.0,112.0,2.0,0.0125,0.010417
75%,57.0,233.0,6.0,0.030675,0.031396
max,429.0,40064.0,1264.0,0.391892,0.391892


In [45]:
# Order the current `top` subset by the number of distinct ad products
# impressed (ascending) to inspect both ends of the range.
top.sort_values('imp_pcnt')

Unnamed: 0,ref_term,imp_pcnt,imp_cnt,click_cnt,keyword_ctr,mean_keyword_product_ctr
10569,저스트댄스,1,75,1,0.013333,0.013333
14111,나스,1,83,0,0.000000,0.000000
187,턱걸이,1,176,12,0.068182,0.068182
14234,s1000rr,1,127,2,0.015748,0.015748
1664,수석,1,190,12,0.063158,0.063158
...,...,...,...,...,...,...
864,엔지니어드가먼츠,291,4187,30,0.007165,0.005598
6,아이폰,298,40064,950,0.023712,0.025524
2161,갤럭시,304,8520,267,0.031338,0.025431
999,국내배송,326,2129,57,0.026773,0.032668


- 노출수 기준 상위 5%로 나누면 노출 상품 수가 1개뿐인 키워드도 상위 그룹에 포함되므로 좋은 분류 방법이 아님

In [46]:
# Keywords in the top 25% by number of distinct ad products impressed vs. all
# remaining keywords. The cutoff is the 75th percentile of imp_pcnt; rows
# equal to the cutoff go into `top`.
imp_pcnt_q75 = df['imp_pcnt'].quantile(0.75)
top = df.loc[df['imp_pcnt'] >= imp_pcnt_q75]
no_top = df.loc[df['imp_pcnt'] < imp_pcnt_q75]

In [47]:
# Display the current `top` subset.
top

Unnamed: 0,ref_term,imp_pcnt,imp_cnt,click_cnt,keyword_ctr,mean_keyword_product_ctr
0,샤넬,153,12049,174,0.014441,0.019016
2,갤럭시z플립,65,4288,200,0.046642,0.040344
3,신세계백화점 상품,36,105,7,0.066667,0.035301
6,아이폰,298,40064,950,0.023712,0.025524
7,보이져,12,957,21,0.021944,0.051239
...,...,...,...,...,...,...
115891,투바투 시그 교환,11,11,0,0.000000,0.000000
115997,시나모롤 룸,9,9,0,0.000000,0.000000
116776,여성용 진동,8,8,0,0.000000,0.000000
117083,캐시미어 뷔스티에,8,8,0,0.000000,0.000000


In [48]:
# Keywords in the top 5% by number of distinct ad products impressed vs. all
# remaining keywords. The cutoff is the 95th percentile of imp_pcnt; rows
# equal to the cutoff go into `top`.
imp_pcnt_q95 = df['imp_pcnt'].quantile(0.95)
top = df.loc[df['imp_pcnt'] >= imp_pcnt_q95]
no_top = df.loc[df['imp_pcnt'] < imp_pcnt_q95]

In [50]:
# Order the current `top` subset by the number of distinct ad products
# impressed (ascending).
top.sort_values('imp_pcnt')

Unnamed: 0,ref_term,imp_pcnt,imp_cnt,click_cnt,keyword_ctr,mean_keyword_product_ctr
14538,떨스데이,30,68,0,0.000000,0.000000
49170,미젤로 미션,30,30,0,0.000000,0.000000
36865,투웨이 후드집업,30,46,0,0.000000,0.000000
11203,웨이션 시그 포카,30,30,0,0.000000,0.000000
33478,이레귤러 재현,30,43,0,0.000000,0.000000
...,...,...,...,...,...,...
864,엔지니어드가먼츠,291,4187,30,0.007165,0.005598
6,아이폰,298,40064,950,0.023712,0.025524
2161,갤럭시,304,8520,267,0.031338,0.025431
999,국내배송,326,2129,57,0.026773,0.032668


In [52]:
# Keywords in the top 5% by number of distinct ad products impressed vs. all
# remaining keywords (95th percentile of imp_pcnt; ties go into `top`).
# NOTE(review): this cell repeats the preceding 95th-percentile split verbatim
# and produces no output of its own, so the output shown beneath it looks
# stale - consider removing the cell.
pcnt_threshold = df['imp_pcnt'].quantile(0.95)
top = df.loc[df['imp_pcnt'] >= pcnt_threshold]
no_top = df.loc[df['imp_pcnt'] < pcnt_threshold]

(124676, 6)

In [51]:
# Keywords whose mean per-product CTR is strictly positive.
df[df['mean_keyword_product_ctr'] > 0]

Unnamed: 0,ref_term,imp_pcnt,imp_cnt,click_cnt,keyword_ctr,mean_keyword_product_ctr
0,샤넬,153,12049,174,0.014441,0.019016
1,빅톤,3,454,17,0.037445,0.058893
2,갤럭시z플립,65,4288,200,0.046642,0.040344
3,신세계백화점 상품,36,105,7,0.066667,0.035301
4,본체 i5,3,3,2,0.666667,0.666667
...,...,...,...,...,...,...
124446,g pro x superlight,1,1,1,1.000000,1.000000
124568,공기청정기 번개페이,1,1,1,1.000000,1.000000
124569,유니파잉,1,1,1,1.000000,1.000000
124570,인켈오디오셋트,1,1,1,1.000000,1.000000


In [79]:
# Summary statistics restricted to keywords with at least 20 distinct ad
# products impressed.
# NOTE(review): the threshold 20 is not explained anywhere in the notebook -
# consider naming it as a constant with its rationale.
df[df['imp_pcnt'] >= 20].describe()

Unnamed: 0,imp_pcnt,imp_cnt,click_cnt,keyword_ctr,mean_keyword_product_ctr
count,11422.0,11422.0,11422.0,11422.0,11422.0
mean,39.12581,176.714761,3.833567,0.010845,0.010459
std,24.415314,974.336171,29.086152,0.024473,0.024023
min,20.0,20.0,0.0,0.0,0.0
25%,24.0,29.0,0.0,0.0,0.0
50%,32.0,42.0,0.0,0.0,0.0
75%,46.0,75.0,1.0,0.013875,0.011116
max,429.0,40064.0,1264.0,0.521739,0.5


## 비광고 상품 

In [59]:
# Per-keyword metrics for non-ad impressions on 2021-02-15, same page_id
# filter ('검색결과') as the ad queries, restricted to rows where ref_source
# IS NULL. Impressions are left-joined to view events on imp_id and grouped
# by the trimmed search term (ref_term), producing per keyword:
#   imp_nonad_pcnt   - number of distinct content_id values impressed
#   non_ad_imp_cnt   - number of impression rows (after the join)
#   non_ad_click_cnt - number of impression rows that matched a view row
#   non_ad_ctr       - non_ad_click_cnt / non_ad_imp_cnt (denominator cast to float)
# The HAVING clause drops groups whose imp_id values are all NULL (count()
# ignores NULLs), which also prevents a zero denominator in the ratio.
# NOTE(review): a matching row in the view-event table is treated as a
# "click" here - confirm that is the intended definition.
# NOTE(review): multiple view rows for one imp_id would inflate
# non_ad_imp_cnt - TODO confirm imp_id is unique in the view table.
q = '''
SELECT i.ref_term,
       count(DISTINCT i.content_id) AS imp_nonad_pcnt,
       count(i.imp_id) AS non_ad_imp_cnt,
       count(c.imp_id) AS non_ad_click_cnt,
       non_ad_click_cnt/non_ad_imp_cnt::float AS non_ad_ctr
FROM
  (SELECT imp_id,
          content_id,
          trim(ref_term) AS ref_term
   FROM bun_log_db.app_event_type_impression
   WHERE YEAR||MONTH||DAY = '20210215'
     AND page_id = '검색결과'
     AND ref_source IS NULL) i
LEFT JOIN
  (SELECT imp_id
   FROM bun_log_db.app_event_type_view
   WHERE YEAR||MONTH||DAY = '20210215') c ON i.imp_id = c.imp_id
GROUP BY 1
HAVING count(i.imp_id) > 0
'''
# Load the per-keyword non-ad metrics.
# NOTE(review): `bun_dw` is not created in any visible cell.
daily_nonad = pd.read_sql(q, con=bun_dw)

In [61]:
# Non-ad impressions on 2021-02-15 (page_id '검색결과', ref_source IS NULL)
# left-joined to view events on imp_id, grouped by (ref_term, content_id):
# one row per keyword/content pair with
#   non_ad_imp_cnt   - number of impression rows (after the join)
#   non_ad_click_cnt - number of impression rows that matched a view row
#   non_ad_ctr       - non_ad_click_cnt / non_ad_imp_cnt (denominator cast to float)
# The HAVING clause drops groups whose imp_id values are all NULL (count()
# ignores NULLs), which also prevents a zero denominator in the ratio.
# NOTE(review): multiple view rows for one imp_id would inflate
# non_ad_imp_cnt - TODO confirm imp_id is unique in the view table.
q = '''
SELECT i.ref_term,
       i.content_id,
       count(i.imp_id) AS non_ad_imp_cnt,
       count(c.imp_id) AS non_ad_click_cnt,
       non_ad_click_cnt/non_ad_imp_cnt::float AS non_ad_ctr
FROM
  (SELECT imp_id,
          content_id,
          trim(ref_term) AS ref_term
   FROM bun_log_db.app_event_type_impression
   WHERE YEAR||MONTH||DAY = '20210215'
     AND page_id = '검색결과'
     AND ref_source IS NULL) i
LEFT JOIN
  (SELECT imp_id
   FROM bun_log_db.app_event_type_view
   WHERE YEAR||MONTH||DAY = '20210215') c ON i.imp_id = c.imp_id
GROUP BY 1, 2
HAVING count(i.imp_id) > 0
'''
# Load the keyword/content-level non-ad CTR table.
# NOTE(review): `bun_dw` is not created in any visible cell.
keyword_product_ctr_nonad = pd.read_sql(q, con=bun_dw)

In [65]:
# Save both non-ad query results to CSV files at relative paths, omitting the
# DataFrame index.
daily_nonad.to_csv('daily_nonad.csv', index=False)
keyword_product_ctr_nonad.to_csv('keyword_product_ctr_nonad.csv', index=False)

In [62]:
# Combine two non-ad CTR views per keyword:
#   nonad_keyword_ctr               - pooled CTR over all impressions of the keyword
#   non_ad_mean_keyword_product_ctr - unweighted mean of the per-content CTRs
# Both inputs expose a column called `non_ad_ctr`, so every column is renamed
# explicitly before merging instead of relying on the positional order of the
# suffixed columns a plain merge would produce.
nonad_mean_product_ctr = (
    keyword_product_ctr_nonad
    .groupby('ref_term', as_index=False)['non_ad_ctr']
    .mean()
    .rename(columns={'non_ad_ctr': 'non_ad_mean_keyword_product_ctr'})
)

# errors='raise' makes a missing or renamed query column fail here rather than
# silently leaving an unexpected column name for later cells.
daily_nonad_renamed = daily_nonad.rename(
    columns={
        'non_ad_imp_cnt': 'nonad_imp_cnt',
        'non_ad_click_cnt': 'nonad_click_cnt',
        'non_ad_ctr': 'nonad_keyword_ctr',
    },
    errors='raise',
)

# Inner join on the keyword. validate='one_to_one' fails loudly if either side
# ever contains a repeated ref_term, which would otherwise silently multiply rows.
nonad_df = pd.merge(
    daily_nonad_renamed,
    nonad_mean_product_ctr,
    on='ref_term',
    validate='one_to_one',
)

In [63]:
# Display the combined non-ad keyword table.
nonad_df

Unnamed: 0,ref_term,imp_nonad_pcnt,nonad_imp_cnt,nonad_click_cnt,nonad_keyword_ctr,non_ad_mean_keyword_product_ctr
0,!i5 9400,264,306,2,0.006536,0.007576
1,!국내발송!,48,71,2,0.028169,0.026042
2,!국내발송! 샤넬,53,57,10,0.175439,0.169811
3,#235,59,62,4,0.064516,0.067797
4,#K7,119,122,1,0.008197,0.008403
...,...,...,...,...,...,...
531535,힛스,8,9,0,0.000000,0.000000
531536,힛홉,1,1,1,1.000000,1.000000
531537,힢마,56,217,20,0.092166,0.069460
531538,힢마 만쥬,6,6,3,0.500000,0.500000


In [68]:
# Outer-join the non-ad and ad keyword tables on ref_term, so keywords present
# on only one side are kept (with missing values for the other side), then
# keep just the product-count and CTR columns needed for the comparison.
comparison_cols = [
    'ref_term',
    'imp_nonad_pcnt',
    'imp_pcnt',
    'nonad_keyword_ctr',
    'non_ad_mean_keyword_product_ctr',
    'keyword_ctr',
    'mean_keyword_product_ctr',
]
ad_vs_nonad = nonad_df.merge(df, how='outer', on='ref_term')
data = ad_vs_nonad[comparison_cols]

In [73]:
# Short column names for the comparison table. Prefixes:
#   na_ : no ad
#   a_  : ad
# The names are assigned by position, so they must stay in the same order as
# the columns of `data`.
short_names = [
    'ref_term',
    'na_imp_pcnt',
    'a_imp_pcnt',
    'na_keyword_ctr',
    'na_mean_keyword_product_ctr',
    'a_keyword_ctr',
    'a_mean_keyword_product_ctr',
]
data.columns = short_names

In [75]:
# Summary statistics comparing the non-ad (na_) and ad (a_) columns.
data.describe()

Unnamed: 0,na_imp_pcnt,a_imp_pcnt,na_keyword_ctr,na_mean_keyword_product_ctr,a_keyword_ctr,a_mean_keyword_product_ctr
count,531540.0,124676.0,531540.0,531540.0,124676.0,124676.0
mean,73.2279,7.426193,0.134151,0.128629,0.032035,0.031796
std,356.434502,13.100567,0.217641,0.215998,0.133377,0.133107
min,1.0,1.0,0.0,0.0,0.0,0.0
25%,5.0,1.0,0.0,0.0,0.0,0.0
50%,16.0,3.0,0.05,0.044118,0.0,0.0
75%,54.0,8.0,0.1537,0.142857,0.0,0.0
max,57149.0,429.0,1.0,1.0,1.0,1.0
