In [5]:
import math
import time
import psycopg2
import pandas as pd
import numpy as np
from datetime import datetime
from scipy.sparse import csr_matrix
from fast_pagerank import pagerank
from datetime import datetime, timedelta

In [7]:
# --- Tunable parameters for the related-keyword pipeline ---
n = 0.6  # exponent of the confidence floor: a pair needs confidence > 1 / search_count ** n
min_lift = 1  # a pair is kept only when its lift is strictly greater than this
confidence_weight = 0.7  # weight of confidence / max_confidence in the final score
lift_weight = 0.3  # weight of lift / max_lift in the final score

# Read the clock once so both bounds derive from the same instant; two separate
# datetime.now() calls could straddle midnight and move only one of the bounds.
_now = datetime.now()
initial_date = "{:%Y%m%d}".format(_now - timedelta(days=6))  # lower bound, compared with >= in the SQL
final_date = "{:%Y%m%d}".format(_now - timedelta(days=1))  # upper bound, compared with < in the SQL

top_k = 20  # maximum number of related keywords kept per keyword

*아래 쿼리는 이미 실행했으니 다시 실행하지 않아도 됨 (the query below has already been run, so it does not need to be re-run).*

In [None]:
# SQL script that rebuilds the workspace.* tables consumed by the cells below:
#   search_term_log_modified   normalised search log with previous/next keyword per user
#   keyword_view_count, search_results_count, keyword_search_count   per-keyword aggregates
#   related_keyword_prev_cur, related_keyword_prev_next, related_keyword_combined
#                              pair counts with support, confidence and lift
#   related_keyword_df_temp, related_keyword_df_filtered, related_keyword_df_max,
#   related_keyword_df         filtered pairs and their final score
# initial_date, final_date, n, min_lift, confidence_weight and lift_weight are
# interpolated from the parameter cell.
#
# keyword_no_space is derived from the normalised `keyword` (not the raw search_term)
# so that it is comparable with prev_keyword_no_space / next_keyword_no_space, which
# are derived from normalised values; otherwise a case-only change such as
# "Nike" -> "nike" would be counted as a related pair.
#
# NOTE(review): related_keyword_prev_next filters on cur_prev_duration while the
# computed prev_next_duration column is never used - confirm which gap was intended.
q = f'''
DROP TABLE IF EXISTS workspace.search_term_log_modified;


CREATE TABLE workspace.search_term_log_modified AS
SELECT user_id,
       session_id,
       lower(trim(regexp_replace(regexp_replace(search_term, '\\r|\\n', ' '), '\\s+', ' '))) AS keyword,
       LAG(keyword, 1) OVER (PARTITION BY user_id ORDER BY server_time_kst) AS prev_keyword,
       LEAD(keyword, 1) OVER (PARTITION BY user_id ORDER BY server_time_kst) AS next_keyword,
       REPLACE(keyword, ' ', '') AS keyword_no_space,
       REPLACE(prev_keyword, ' ', '') AS prev_keyword_no_space,
       REPLACE(next_keyword, ' ', '') AS next_keyword_no_space,
       server_time_kst::TIMESTAMP AS new_server_time,
       LAG(server_time_kst, 1) OVER (PARTITION BY user_id ORDER BY server_time_kst)::TIMESTAMP AS prev_server_time,
       LEAD(server_time_kst, 1) OVER (PARTITION BY user_id ORDER BY server_time_kst)::TIMESTAMP AS next_server_time,
       datediff(SECOND, prev_server_time, new_server_time) AS cur_prev_duration,
       datediff(SECOND, prev_server_time, next_server_time) AS prev_next_duration
FROM bun_log_db.app_event_type_search
WHERE YEAR || MONTH || DAY >= {initial_date}
  AND YEAR || MONTH || DAY < {final_date}
  AND search_term IS NOT NULL
  AND search_term != ''
  AND search_term != ' '
  AND user_id > 0
  AND device_type IN ('a',
                      'i');


DROP TABLE IF EXISTS workspace.keyword_view_count;


CREATE TABLE workspace.keyword_view_count AS
SELECT lower(trim(regexp_replace(regexp_replace(ref_term, '\\r|\\n', ' '), '\\s+', ' '))) AS keyword,
       COUNT(*) AS view_count
FROM bun_log_db.app_event_type_view v
WHERE YEAR || MONTH || DAY >= {initial_date}
  AND YEAR || MONTH || DAY < {final_date}
  AND event_action = 'view_content'
  AND content_type = 'product'
  AND v.ref_term IS NOT NULL
  AND v.ref_term != ''
  AND v.ref_term != ' '
  AND device_type IN ('a',
                      'i')
GROUP BY 1;


DROP TABLE IF EXISTS workspace.search_results_count;


CREATE TABLE workspace.search_results_count AS
SELECT lower(trim(regexp_replace(regexp_replace(search_term, '\\r|\\n', ' '), '\\s+', ' '))) AS keyword,
       MAX((search_results_count):: int) AS results_count
FROM bun_log_db.api_event_type_search
WHERE YEAR || MONTH || DAY >= {initial_date}
  AND YEAR || MONTH || DAY < {final_date}
  AND device_type IN ('a',
                      'i')
GROUP BY 1;


DROP TABLE IF EXISTS workspace.keyword_search_count;


CREATE TABLE workspace.keyword_search_count AS
SELECT keyword,
       count(*) AS search_count
FROM workspace.search_term_log_modified
GROUP BY 1;


DROP TABLE IF EXISTS workspace.related_keyword_prev_cur;


CREATE TABLE workspace.related_keyword_prev_cur AS
SELECT prev_keyword AS keyword,
       keyword AS related_keyword,
       COUNT(*) AS COUNT
FROM workspace.search_term_log_modified
WHERE prev_keyword IS NOT NULL
  AND cur_prev_duration <= 600
  AND cur_prev_duration >= 1
  AND keyword_no_space != prev_keyword_no_space
GROUP BY 1,
         2;


DROP TABLE IF EXISTS workspace.related_keyword_prev_next;


CREATE TABLE workspace.related_keyword_prev_next AS
SELECT prev_keyword AS keyword,
       next_keyword AS related_keyword,
       COUNT(*) AS COUNT
FROM workspace.search_term_log_modified
WHERE prev_keyword IS NOT NULL
  AND cur_prev_duration <= 600
  AND cur_prev_duration >= 1
  AND next_keyword_no_space != prev_keyword_no_space
GROUP BY 1,
         2;


DROP TABLE IF EXISTS workspace.related_keyword_combined;


CREATE TABLE workspace.related_keyword_combined AS
SELECT *
FROM
  (SELECT c.keyword,
          c.related_keyword,
          c.count + n.count AS related_count,

     (SELECT COUNT(*)
      FROM workspace.search_term_log_modified) AS total_search_count,
          sc.search_count,
          rsc.search_count AS related_search_count,
          related_count / total_search_count :: float AS support,
          related_count / sc.search_count :: float AS confidence,
          confidence / (rsc.search_count / total_search_count :: float) AS lift
   FROM workspace.related_keyword_prev_cur c
   JOIN workspace.related_keyword_prev_next n ON c.keyword = n.keyword
   AND c.related_keyword = n.related_keyword
   JOIN workspace.keyword_search_count sc ON c.keyword = sc.keyword
   JOIN workspace.keyword_search_count rsc ON c.related_keyword = rsc.keyword) a
WHERE related_count > 2;


DROP TABLE IF EXISTS workspace.related_keyword_df_temp;


CREATE TABLE workspace.related_keyword_df_temp AS
SELECT k.keyword,
       k.related_keyword,
       k.related_count,
       k.search_count,
       k.related_search_count,
       rc.results_count,
       v.view_count AS related_view_count,
       k.total_search_count,
       k.support,
       k.confidence,
       k.lift
FROM workspace.related_keyword_combined k
JOIN workspace.search_results_count rc ON k.related_keyword = rc.keyword
JOIN workspace.keyword_view_count v ON k.related_keyword = v.keyword
WHERE results_count > 1
ORDER BY 4 DESC,
         1,
         10 DESC;


DROP TABLE IF EXISTS workspace.related_keyword_df_filtered;


CREATE TABLE workspace.related_keyword_df_filtered AS
SELECT *
FROM workspace.related_keyword_df_temp
WHERE confidence > 1 / pow(search_count, {n})
  AND lift > {min_lift};


DROP TABLE IF EXISTS workspace.related_keyword_df_max;


CREATE TABLE workspace.related_keyword_df_max AS
SELECT keyword,
       max(confidence) AS max_confidence,
       max(lift) AS max_lift
FROM workspace.related_keyword_df_filtered
GROUP BY 1;


DROP TABLE IF EXISTS workspace.related_keyword_df;


CREATE TABLE workspace.related_keyword_df AS
SELECT a.keyword,
       a.related_keyword,
       a.related_count,
       a.search_count,
       ROUND(({confidence_weight} * a.confidence / b.max_confidence:: float) + ({lift_weight} * a.lift / b.max_lift:: float),
             4) AS score
FROM workspace.related_keyword_df_filtered a
JOIN workspace.related_keyword_df_max b ON a.keyword = b.keyword
ORDER BY a.search_count DESC,
         a.keyword,
         score DESC;'''
# Run the table-building script held in `q` (the original executed `query`, which is
# not the script). The connection is opened here, with the same parameters the
# loading cell uses, so this cell does not depend on a `conn`/`cur` left over from
# an earlier kernel state; both are released even when the script fails.
conn = psycopg2.connect(dbname='bunjang', host='172.31.133.162',
                        port='5439', user=user_id, password=pwd)
try:
    cur = conn.cursor()
    try:
        cur.execute(q)
        conn.commit()
    finally:
        cur.close()
finally:
    conn.close()

In [5]:
# Load the scored keyword pairs from workspace.related_keyword_df into a DataFrame.
conn = psycopg2.connect(dbname='bunjang', host='172.31.133.162',
                        port='5439', user=user_id, password=pwd)

# Plain string: there is nothing to interpolate, so no f-prefix is needed.
query = """
SELECT *
FROM workspace.related_keyword_df;
"""

try:
    df = pd.read_sql(query, conn)
finally:
    # Close the connection even when the read raises.
    conn.close()


In [6]:
# Preview the first five rows of the loaded pairs.
# NOTE(review): the saved output under this cell lists columns (e.g. confidence, lift,
# "Unnamed: 0") that the SELECT list creating workspace.related_keyword_df does not
# produce - the output looks stale; re-run to refresh it.
df.head(5)

Unnamed: 0,keyword,related_keyword,related_count,search_count,related_search_count,results_count,related_view_count,total_search_count,support,confidence,lift,score
0,자전거,로드자전거,368,17308,4815,1070,23292,10242675,3.6e-05,0.021262,45.229109,0.8029
1,자전거,mtb자전거,239,17308,2649,1154,10880,10242675,2.3e-05,0.013809,53.39277,0.5761
2,자전거,중고자전거,78,17308,350,123,1344,10242675,8e-06,0.004507,131.88429,0.4484
3,자전거,전기자전거,174,17308,3947,1185,14181,10242675,1.7e-05,0.010053,26.088471,0.3903
4,자전거,중고전기자전거,111,17308,934,14,989,10242675,1.1e-05,0.006413,70.330322,0.3711


In [101]:
# Summary statistics of the number of distinct related keywords per keyword.
(
    df.groupby('keyword', as_index=False)['related_keyword']
      .nunique()
      .describe()
)

Unnamed: 0,related_keyword
count,72846.0
mean,1.401436
std,1.245839
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,32.0


In [27]:
# Gather, per keyword, every related keyword as [related_keyword, (score, related_count)].
# Plain column iteration replaces df.iterrows(), which builds a Series per row.
rows_by_keyword = {}
for kw, related, score, count in zip(df['keyword'], df['related_keyword'],
                                     df['score'], df['related_count']):
    rows_by_keyword.setdefault(kw, []).append([related, (float(score), float(count))])

# keyword_dict:       keyword -> de-duplicated entries, best score first
# final_keyword_dict: keyword -> {related_keyword: related_count} for the top_k entries
keyword_dict = {}
final_keyword_dict = {}
for kw, entries in rows_by_keyword.items():
    # Rank BEFORE de-duplicating so that, among spellings that differ only by spaces,
    # the highest-scoring one survives regardless of the row order returned by the
    # query (SELECT * carries no ORDER BY). The sort is stable, so already-ranked
    # input yields exactly the same result as before.
    entries.sort(key=lambda entry: entry[1][0], reverse=True)
    seen_no_space = set()
    unique_entries = []
    for entry in entries:
        compact = entry[0].replace(' ', '')
        if compact not in seen_no_space:
            seen_no_space.add(compact)
            unique_entries.append(entry)
    keyword_dict[kw] = unique_entries
    final_keyword_dict[kw] = {entry[0]: entry[1][1] for entry in unique_entries[:top_k]}

In [26]:
def get_related_keyword_candidates(keyword, max_hops=2):
    """Collect the edges of the related-keyword graph around ``keyword``.

    Starting from ``keyword``, the entries of ``final_keyword_dict`` are walked
    breadth-first: the keyword itself is expanded, then the keywords it points to,
    and so on for ``max_hops`` further levels. Only keywords that are themselves
    keys of ``final_keyword_dict`` can be expanded.

    Args:
        keyword: Start keyword; must be a key of ``final_keyword_dict``.
        max_hops: Number of levels expanded after the start keyword (default 2,
            the value that used to be hard-coded).

    Returns:
        A list of ``[source_keyword, related_keyword, related_count]`` edges in
        order of first discovery, each edge appearing once.

    Raises:
        KeyError: If ``keyword`` is not a key of ``final_keyword_dict``.
    """
    ranking_list = []
    # A keyword's edges are all emitted the first time it is expanded, so tracking
    # expanded keywords replaces the linear "edge not in ranking_list" scans and
    # avoids re-walking keywords reached through several paths.
    expanded = set()
    frontier = [keyword]
    for _ in range(max_hops + 1):
        next_frontier = []
        for node in frontier:
            if node in expanded:
                continue
            expanded.add(node)
            for related_keyword, related_count in final_keyword_dict[node].items():
                ranking_list.append([node, related_keyword, related_count])
                if related_keyword in final_keyword_dict:
                    next_frontier.append(related_keyword)
        frontier = next_frontier

    return ranking_list

In [69]:
# Only keywords with fewer than top_k direct related keywords need a candidate
# edge list to be topped up from.
candidate_dict = {
    keyword: get_related_keyword_candidates(keyword)
    for keyword, related_keyword_dict in final_keyword_dict.items()
    if len(related_keyword_dict) < top_k
}

In [None]:
# NOTE(review): this cell only evaluates the literal 0 and has no execution count;
# it looks like a leftover scratch cell and can probably be deleted.
0

In [85]:
# add more keywords after getting related keywords by score function
def get_additional_related_keywords(keyword):
    """Rank extra related-keyword candidates for ``keyword`` with PageRank.

    The edges stored in ``candidate_dict[keyword]`` are turned into a weighted
    adjacency matrix (weight = related_count) and scored with ``pagerank``.
    Nodes are then taken in descending score order, skipping any whose
    space-stripped form equals the keyword itself, a keyword already present in
    ``final_keyword_dict[keyword]``, or a candidate already chosen.

    Args:
        keyword: A key of both ``candidate_dict`` and ``final_keyword_dict``.

    Returns:
        Up to ``top_k`` additional keywords, best first; an empty list when there
        are no candidate edges.
    """
    related_keyword_list = candidate_dict[keyword]
    if not related_keyword_list:
        # No edges means no graph to rank (an empty matrix cannot be indexed).
        return []

    node_list = sorted({node for fr, to, _ in related_keyword_list for node in (fr, to)})
    node_to_idx = {node: idx for idx, node in enumerate(node_list)}

    rows = [node_to_idx[fr] for fr, _, _ in related_keyword_list]
    cols = [node_to_idx[to] for _, to, _ in related_keyword_list]
    weights = [related_count for _, _, related_count in related_keyword_list]

    G = csr_matrix((weights, (rows, cols)), shape=(len(node_list), len(node_list)))
    pr = pagerank(G, p=0.85) # p: damping factor

    # Space-stripped forms that must not be suggested again; built once instead of
    # being rebuilt for every ranked node.
    taken = {k.replace(' ', '') for k in final_keyword_dict[keyword]}
    taken.add(keyword.replace(' ', ''))

    additional_keyword_list = []
    # pr[i] is the score of node_list[i]; the stable sort keeps name order on ties.
    ranked = sorted(zip(node_list, pr), key=lambda item: item[1], reverse=True)
    for related_keyword, _ in ranked:
        compact = related_keyword.replace(' ', '')
        if compact in taken:
            continue
        if len(additional_keyword_list) >= top_k:
            break
        additional_keyword_list.append(related_keyword)
        taken.add(compact)

    return additional_keyword_list

In [79]:
start = time.time()

# For every keyword keep its direct related keywords and, when there are fewer
# than top_k of them, fill the remaining slots with PageRank-ranked extras.
df_final_keyword_dict = {}
for keyword, related_keyword_dict in final_keyword_dict.items():
    chosen = list(related_keyword_dict.keys())
    missing = top_k - len(chosen)
    if missing > 0:
        chosen += get_additional_related_keywords(keyword)[:missing]
    df_final_keyword_dict[keyword] = chosen

print(f'Time taken: {time.time() - start}')

Time taken: 108.65906620025635


In [87]:
# Column headers related_keyword_1 .. related_keyword_<top_k> for the wide output table.
column_name = ['related_keyword_{}'.format(rank) for rank in range(1, top_k + 1)]


In [94]:
# One row per keyword with the mean of its search_count values.
search_count = (
    df.groupby('keyword', as_index=False)['search_count']
      .mean()
)

In [91]:
# Pad every list to len(column_name) entries: DataFrame.from_dict raises when
# `columns` names more columns than the longest row provides, which happens
# whenever no keyword reaches the full number of related keywords.
padded_related_keywords = {
    keyword: related + [None] * (len(column_name) - len(related))
    for keyword, related in df_final_keyword_dict.items()
}
temp_related_keyword_df = pd.DataFrame.from_dict(padded_related_keywords, orient='index', columns=column_name)

# Attach the search volume and list the most-searched keywords first.
related_keyword_df1 = pd.merge(search_count, temp_related_keyword_df, left_on = 'keyword', right_index=True).sort_values('search_count', ascending=False)


In [102]:
# Write the final table to a UTF-8 CSV file, without the DataFrame index.
related_keyword_df1.to_csv(
    'related_keyword_df1.csv',
    index=False,
    encoding='utf-8',
)