In [2]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.pool import NullPool
import matplotlib as plt
import matplotlib.pylab as plt
import seaborn as sns
from tqdm import tqdm
from PIL import Image
import requests
from io import BytesIO


In [4]:
q = '''
select uid
from service1_quicket.product_info
where create_date >= '2021-06-08 00:00:00' and create_date <= '2021-06-08 23:59:59'
group by 1
'''
upload_yesterday = pd.read_sql(q, con=bun_dw)


In [42]:
uid_list = upload_yesterday.uid.astype(str).to_list()

In [43]:
seller = pd.read_excel('powerSeller.xlsx', sheet_name='result', engine='openpyxl').uid.astype(str).to_list()
final_uid_list = []
for uid in uid_list:
    if uid not in seller:
        final_uid_list.append(uid)

In [37]:
len(uid_list)

34208

In [39]:
len(seller)

174929

In [44]:
len(final_uid_list)

31753

In [6]:
uids = ','.join('\'' + str(u) + '\'' for u in uid_list)

In [45]:
f_uids = ','.join('\'' + str(u) + '\'' for u in final_uid_list)

#### 파워셀러 점수 스코어링 피쳐들
- 라이브중인 상품 수
- 상품 등록 수
- 상품 등록 시간
- 업 사용 수

In [46]:
# 2020/06/09 기준
q = f'''
SELECT UID,
       count(CASE
                 WHEN status = 0 THEN 1
             END) AS live_cnt,
       count(DISTINCT id) AS p_cnt
FROM service1_quicket.product_info
WHERE UID IN ({uids})
GROUP BY 1
'''
products = pd.read_sql(q, con=bun_dw)

In [47]:
q = f'''
SELECT UID,
       min(extract(hour
                   FROM create_date)) AS earliest,
       max(extract(hour
                   FROM create_date)) AS latest,
       latest - earliest AS diff
FROM service1_quicket.product_info
WHERE UID IN ({uids})
  AND create_date BETWEEN '2021-06-06 00:00:00' AND '2021-06-08 23:59:59'
GROUP BY 1
'''
upload = pd.read_sql(q, con=bun_dw)

In [48]:
q = f'''
SELECT a.uid,
       a.top_category,
       a.p_cnt_in_top_category,
       b.distinct_category_cnt
FROM
  (SELECT UID,
          category AS top_category,
          p_cnt_in_category AS p_cnt_in_top_category
   FROM
     (SELECT UID,
             left(category_id, 3) AS category,
             count(DISTINCT id) AS p_cnt_in_category,
             row_number() OVER (PARTITION BY UID
                                ORDER BY count(DISTINCT id) DESC) AS ROW
      FROM service1_quicket.product_info
      WHERE UID IN ({uids}) AND status = 0
      GROUP BY 1,
               2)
   WHERE ROW = 1) a
JOIN
  (SELECT UID,
          count(DISTINCT left(category_id, 3)) AS distinct_category_cnt
   FROM service1_quicket.product_info
   WHERE UID IN ({uids})
   GROUP BY 1) b ON a.uid = b.uid
'''
category = pd.read_sql(q, con=bun_dw)

In [49]:
#일주일간의 up plus
q = f'''
SELECT UID,
       count(*) AS up_plus_purchase_cnt,
       sum(qty) AS up_plus_cnt,
       sum(pay_point) AS pay_point,
       sum(pay_free) AS pay_free,
       sum(pay_point) + sum(pay_free) AS total_point
FROM service1_quicket.ad_up_plus
WHERE UID IN ({uids})
  AND created_at >= '2021-06-02'
  AND created_at <= '2021-06-08'
GROUP BY 1'''
up_plus = pd.read_sql(q, con=bun_dw)

In [50]:
# 일주일간 SA
q = f'''
SELECT UID,
       count(DISTINCT id) AS ad_id_cnt,
       sum(total_budget) AS total_budget,
       sum(paid_budget) AS total_paid_budget,
       sum(free_budget) AS total_free_budget
FROM service1_quicket.ad_set
WHERE created_at >= '2021-06-02'
  AND created_at <= '2021-06-08'
  AND UID IN ({uids})
GROUP BY 1
'''
sa = pd.read_sql(q, con=bun_dw)

In [51]:
q = f'''
SELECT content_owner AS UID,
       count(*) AS total_click_cnt,
       count(DISTINCT content_id) AS p_cnt,
       cast(total_click_cnt AS float)/p_cnt AS avg_click_cnt
FROM bun_log_db.app_event_type_view
WHERE content_owner IN ({uids})
  AND YEAR||MONTH||DAY = '20210608'
GROUP BY 1
'''
click = pd.read_sql(q, con=bun_dw)

In [52]:
q = f'''
SELECT content_owner AS UID,
       count(*) AS total_imp_cnt,
       count(DISTINCT content_id) AS p_cnt,
       cast(total_imp_cnt AS float)/p_cnt AS avg_imp_cnt
FROM bun_log_db.app_event_type_impression
WHERE content_owner IN ({uids})
  AND YEAR||MONTH||DAY = '20210608'
GROUP BY 1
'''
imp = pd.read_sql(q, con=bun_dw)

In [53]:
q = f'''
SELECT id AS UID,
       favorite_count AS follower_cnt,
       review_count,
       grade
FROM service1_quicket.user_
WHERE id IN ({uids})
'''
user = pd.read_sql(q, con=bun_dw)

### 모든 테이블 JOIN

In [17]:
tables = [products, upload, category, up_plus, sa, click, imp, user]
for t in tables:
    t['uid'] = t['uid'].astype(str)

In [18]:
df = pd.merge(products, upload, on='uid', how='outer')
df = pd.merge(df, category , on='uid', how='outer')
df = pd.merge(df, up_plus, on='uid', how='outer')
df = pd.merge(df, sa, on='uid', how='outer')
df = pd.merge(df, user, on='uid', how='outer')

In [19]:
click.uid = click.uid.astype(str)
imp.uid = imp.uid.astype(str)

cm = pd.merge(click, imp, on='uid', how='outer')

In [20]:
df = pd.merge(df, cm, on='uid', how='outer')

In [21]:
df

Unnamed: 0,uid,live_cnt,p_cnt,earliest,latest,diff,top_category,p_cnt_in_top_category,distinct_category_cnt,up_plus_purchase_cnt,...,total_free_budget,follower_cnt,review_count,grade,total_click_cnt,p_cnt_x,avg_click_cnt,total_imp_cnt,p_cnt_y,avg_imp_cnt
0,3817615,8,2097,17,17,0,320,5.0,6.0,,...,,177,65,639,88.0,14.0,6.285714,1891.0,90.0,21.011111
1,2624438,19498,57170,15,20,5,400,18590.0,11.0,,...,,9535,2647,25647,7141.0,2716.0,2.629234,222389.0,14182.0,15.681075
2,3983431,35,17231,12,19,7,600,34.0,4.0,,...,,774,143,1375,225.0,51.0,4.411765,5649.0,87.0,64.931034
3,5099144,970,5404,0,0,0,320,856.0,6.0,,...,,3693,196,1911,555.0,243.0,2.283951,15951.0,1041.0,15.322767
4,5514480,1568,15973,20,21,1,320,1512.0,6.0,,...,,14719,702,6820,338.0,156.0,2.166667,16016.0,1481.0,10.814315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34203,12962446,1,1,18,18,0,600,1.0,1.0,,...,,0,0,0,18.0,1.0,18.000000,653.0,1.0,653.000000
34204,9944671,0,1,21,21,0,,,,,...,,0,0,0,4.0,1.0,4.000000,8.0,1.0,8.000000
34205,76738082,0,1,15,15,0,,,,,...,,0,0,0,8.0,1.0,8.000000,46.0,1.0,46.000000
34206,76742270,0,1,9,9,0,,,,,...,,0,0,0,38.0,1.0,38.000000,515.0,1.0,515.000000


In [22]:
df.columns

Index(['uid', 'live_cnt', 'p_cnt', 'earliest', 'latest', 'diff',
       'top_category', 'p_cnt_in_top_category', 'distinct_category_cnt',
       'up_plus_purchase_cnt', 'up_plus_cnt', 'pay_point', 'pay_free',
       'total_point', 'ad_id_cnt', 'total_budget', 'total_paid_budget',
       'total_free_budget', 'follower_cnt', 'review_count', 'grade',
       'total_click_cnt', 'p_cnt_x', 'avg_click_cnt', 'total_imp_cnt',
       'p_cnt_y', 'avg_imp_cnt'],
      dtype='object')

In [23]:
df = df.fillna(0)

### SCORING

In [24]:
from sklearn.preprocessing import MinMaxScaler
cols = ['uid', 'live_cnt', 'p_cnt', 
        'up_plus_purchase_cnt', 'up_plus_cnt', 'total_point', 
        'ad_id_cnt', 'total_budget', 
        'follower_cnt', 'review_count', 'grade', 
        'total_click_cnt', 'avg_click_cnt', 'total_imp_cnt', 'avg_imp_cnt']
s = df[cols]

In [25]:
import numpy as np
s['if_work_hr'] = np.where(((df['diff']>=8) & (df['diff'] <= 10)) | (df['diff'] == 23), 0.5, 0)
s['if_category'] = np.where((df['p_cnt_in_top_category'] >= 10) & (df['distinct_category_cnt'] <= 3), 0.5, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [26]:
cols = ['live_cnt', 'p_cnt', 
        'up_plus_purchase_cnt', 'up_plus_cnt', 'total_point', 
        'ad_id_cnt', 'total_budget', 
        'follower_cnt', 'review_count', 'grade', 
        'total_click_cnt', 'avg_click_cnt', 'total_imp_cnt', 'avg_imp_cnt']
s[cols] = MinMaxScaler().fit_transform(s[cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


In [27]:
s

Unnamed: 0,uid,live_cnt,p_cnt,up_plus_purchase_cnt,up_plus_cnt,total_point,ad_id_cnt,total_budget,follower_cnt,review_count,grade,total_click_cnt,avg_click_cnt,total_imp_cnt,avg_imp_cnt,if_work_hr,if_category
0,3817615,0.000410,0.036663,0.0,0.0,0.0,0.0,0.0,0.006492,0.005275,0.005363,0.006696,0.010459,0.007052,0.001521,0.0,0.0
1,2624438,1.000000,1.000000,0.0,0.0,0.0,0.0,0.0,0.349741,0.214802,0.215248,0.543331,0.004375,0.829315,0.001135,0.0,0.0
2,3983431,0.001795,0.301387,0.0,0.0,0.0,0.0,0.0,0.028390,0.011604,0.011540,0.017119,0.007341,0.021066,0.004702,0.0,0.0
3,5099144,0.049749,0.094509,0.0,0.0,0.0,0.0,0.0,0.135458,0.015905,0.016038,0.042228,0.003800,0.059483,0.001110,0.0,0.0
4,5514480,0.080419,0.279382,0.0,0.0,0.0,0.0,0.0,0.539889,0.056967,0.057238,0.025717,0.003605,0.059726,0.000783,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34203,12962446,0.000051,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.001370,0.029950,0.002435,0.047285,0.0,0.0
34204,9944671,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000304,0.006656,0.000030,0.000579,0.0,0.0
34205,76738082,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000609,0.013311,0.000172,0.003331,0.0,0.0
34206,76742270,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.002891,0.063228,0.001920,0.037292,0.0,0.0


In [28]:
s.columns

Index(['uid', 'live_cnt', 'p_cnt', 'up_plus_purchase_cnt', 'up_plus_cnt',
       'total_point', 'ad_id_cnt', 'total_budget', 'follower_cnt',
       'review_count', 'grade', 'total_click_cnt', 'avg_click_cnt',
       'total_imp_cnt', 'avg_imp_cnt', 'if_work_hr', 'if_category'],
      dtype='object')

In [29]:
def score(live_cnt, p_cnt, up_plus_purchase_cnt, up_plus_cnt, total_point, ad_id_cnt, total_budget, follower_cnt, review_count, grade, avg_click_cnt, avg_imp_cnt, if_work_hr, if_category):
    sc = s['live_cnt'] * live_cnt \
    + s['p_cnt'] * p_cnt \
    + s['up_plus_purchase_cnt'] * up_plus_purchase_cnt \
    + s['up_plus_cnt'] * up_plus_cnt \
    + s['total_point'] * total_point \
    + s['ad_id_cnt'] * ad_id_cnt \
    + s['total_budget'] * total_budget \
    + s['follower_cnt'] * follower_cnt \
    + s['review_count'] * review_count \
    + s['grade'] * grade \
    + s['avg_click_cnt'] * avg_click_cnt \
    + s['avg_imp_cnt'] * avg_imp_cnt \
    + s['if_work_hr'] * if_work_hr \
    + s['if_category'] * if_category
    return sc

In [30]:
s['score'] = score(5, 1, 2, 3, 3, 6, 3, 4, 4, 3, 5, 2, 5, 5)
features = ['live_cnt', 'p_cnt', 'up_plus_purchase_cnt', 'up_plus_cnt', 'total_point', 'ad_id_cnt', 'total_budget', 'follower_cnt', 'review_count', 'grade', 'avg_click_cnt', 'avg_imp_cnt', 'if_work_hr', 'if_category']
s.sort_values('score', ascending=False)[['uid', 'score'] + features][:30]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,uid,score,live_cnt,p_cnt,up_plus_purchase_cnt,up_plus_cnt,total_point,ad_id_cnt,total_budget,follower_cnt,review_count,grade,avg_click_cnt,avg_imp_cnt,if_work_hr,if_category
18573,3947717,15.394753,0.021746,0.087565,0.666667,0.45,0.46125,0.956897,0.244933,0.205297,0.14688,0.142987,0.057098,0.016021,0.5,0.0
18260,7658544,11.870703,0.036773,0.107681,0.666667,1.0,1.0,0.112069,0.217922,0.030114,0.033596,0.03441,0.010078,0.005591,0.5,0.0
168,592959,11.175945,0.006667,0.049327,0.0,0.0,0.0,0.043103,0.026497,0.912959,1.0,1.0,0.019099,0.003919,0.0,0.0
1821,6972746,10.902204,0.004257,0.057794,0.0,0.0,0.0,1.0,0.410401,0.133954,0.05932,0.059454,0.026716,0.003442,0.0,0.5
1764,1393711,10.320827,0.065391,0.247739,0.333333,0.25,0.25,0.155172,0.052558,0.716025,0.146393,0.1472,0.016628,0.008172,0.5,0.0
2569,6310647,9.962528,0.061647,0.044447,0.5,0.75,0.75,0.224138,0.012547,0.018046,0.016717,0.017297,0.005625,0.004153,0.0,0.5
1,2624438,8.92806,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.349741,0.214802,0.215248,0.004375,0.001135,0.0,0.0
14,3674727,7.776388,0.62073,0.281534,0.0,0.0,0.0,0.0,0.0,1.0,0.052017,0.05274,0.004437,0.001367,0.0,0.0
18593,3806667,7.312006,0.017951,0.013889,0.0,0.0,0.0,0.37931,0.327601,0.113047,0.131056,0.134955,0.005152,0.021331,0.5,0.0
25957,1545291,7.127081,0.393938,0.138432,0.0,0.0,0.0,0.0,0.0,0.001577,0.000162,0.000168,0.002219,0.000203,0.5,0.5


In [32]:
s.sort_values('score')[['uid', 'score']][:30]

Unnamed: 0,uid,score
31526,9545097,0.000256
34045,74481925,0.000256
30820,10209127,0.000419
28317,75930121,0.000452
25993,1776827,0.000634
17741,2531820,0.000998
19262,7766024,0.001083
8529,75311369,0.001129
28913,74522314,0.00116
17263,76013177,0.001288


In [34]:
s.sort_values('score', ascending=False)[['uid', 'score']].to_csv("power_seller_score_2.csv", index=False)

In [33]:
len(uid_list)

34208

In [56]:
s[s['uid'].isin(final_uid_list)].sort_values('score', ascending=False)[['uid', 'score']].to_csv("power_seller_score_3.csv", index=False)