In [19]:
from tqdm import tqdm
import pandas as pd
import random
import os
import numpy as np
import joblib
import logging
import coloredlogs
import time
import gc
import sys
import argparse
import itertools
from collections import Counter

In [20]:
# Load the input data and the persisted model artefacts.

# Lower bound on the `dt` column: rows with a smaller `dt` are dropped below.
start_dt = 12

# Shop tags listed as the officially designated categories; predictions are
# restricted to these values.
官方指認欄位 = [
    '2', '6', '10', '12', '13', '15', '18', '19',
    '21', '22', '25', '26', '36', '37', '39', '48',
]

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artefacts produced by a trusted pipeline.
# Presumably a fitted nearest-neighbour model and the PCA feature matrix it is
# queried with (one row per row of df_groupby_chid_preprocessed) -- TODO confirm.
nbrs = joblib.load('../model/nbrs.pkl')
X_pca = joblib.load('../model/X_pca_for_knn.pkl')

df_groupby_chid_preprocessed = pd.read_feather('../data/df_groupby_chid_preprocessed.feather')

# Keep only the recent records (older data is probably of little reference value).
df = (
    pd.read_feather('../data/2021玉山人工智慧公開挑戰賽冬季賽訓練資料集.feather')
    .loc[lambda frame: frame['dt'] >= start_dt]
)

test_data = pd.read_feather('../data/需預測的顧客名單及提交檔案範例.feather')

In [21]:
# Helper functions are collected here
def chid2answer(chid, method='median'):
    """Return up to three officially designated shop tags for one customer.

    The customer's rows in the module-level ``df`` are ranked per ``shop_tag``
    and the best-ranked tags that also appear in ``官方指認欄位`` are returned.

    Parameters
    ----------
    chid
        Customer id, compared against ``df.chid`` with ``==``.
    method : {'sum', 'mean', 'median', 'value_counts'}, default 'median'
        ``'sum'``/``'mean'``/``'median'`` aggregate ``txn_amt`` per shop tag and
        rank by that value, largest first. ``'value_counts'`` ranks shop tags
        by how many rows the customer has for each tag.

    Returns
    -------
    list
        Between zero and three shop tags, best-ranked first. The list is empty
        when the customer has no rows or none of their tags is designated.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    customer_rows = df.chid == chid
    if method in ('sum', 'mean', 'median'):
        ranked = (
            df.loc[customer_rows, ['shop_tag', 'txn_amt']]
            .groupby('shop_tag')
            .agg(method)
            .sort_values(by='txn_amt', ascending=False)
        )
    elif method == 'value_counts':
        # Exact match on purpose: a substring test (`method in 'value_counts'`)
        # would also accept '' or 'value'.
        ranked = df.loc[customer_rows, 'shop_tag'].value_counts().to_frame()
    else:
        raise ValueError(
            "method must be one of 'sum', 'mean', 'median' or 'value_counts', "
            f"got {method!r}"
        )

    # Keep the ranking order, but only for tags that are officially designated.
    designated = ranked.index.isin(官方指認欄位)
    return ranked.index[designated][:3].tolist()

In [22]:
def predict_function(chid):
    """Predict exactly three distinct shop tags for one customer.

    Strategy, in order:

    1. Use the customer's own answer from ``chid2answer``; return it if it
       already holds three tags.
    2. Otherwise look the customer up in ``df_groupby_chid_preprocessed``,
       query ``nbrs`` with the matching row of ``X_pca`` and top the answer up
       with the neighbours' answers, in the order the neighbours are returned.
    3. If tags are still missing, pad with random tags from ``官方指認欄位``
       that are not in the answer yet.

    Parameters
    ----------
    chid
        Customer id.

    Returns
    -------
    list
        Three distinct shop tags. Padded entries are plain ``str`` values.

    Notes
    -----
    The first neighbour returned by ``nbrs.kneighbors`` is skipped --
    presumably it is the queried customer itself; confirm that ``nbrs`` was
    fitted on ``X_pca``. Row labels of ``df_groupby_chid_preprocessed`` are
    used as positions into ``X_pca`` and vice versa, so the frame is assumed
    to carry a default 0..n-1 index.

    A ``chid`` that is missing from ``df_groupby_chid_preprocessed`` skips the
    neighbour step and is padded randomly instead of raising ``IndexError``.
    """
    answer = chid2answer(chid)  # own history; may hold fewer than three tags
    if len(answer) == 3:
        return answer

    matches = df_groupby_chid_preprocessed.loc[
        df_groupby_chid_preprocessed.chid == chid
    ].index
    if len(matches) > 0:
        idx = matches[0]
        _, indices = nbrs.kneighbors(X_pca[[idx]])
        # Drop the first hit. `[1:]` equals the former `[-(n_neighbors-1):]`
        # for n_neighbors >= 2 and, unlike it, yields no neighbours (rather
        # than all of them) when n_neighbors == 1.
        neighbour_rows = indices[0][1:]
        neighbour_chids = df_groupby_chid_preprocessed.loc[neighbour_rows, 'chid'].tolist()
        for nb_chid in neighbour_chids:
            for tag in chid2answer(nb_chid):
                if tag not in answer:
                    answer.append(tag)
            if len(answer) >= 3:
                return answer[:3]

    # Still short: pad from the designated tags. Sorting the pool makes the
    # draw reproducible under a NumPy seed (set order is not stable between
    # runs); replace=False keeps the padded tags distinct.
    pool = sorted(set(官方指認欄位) - set(answer))
    padding = np.random.choice(pool, size=3 - len(answer), replace=False)
    answer.extend(str(tag) for tag in padding)  # plain str, not numpy.str_
    return answer

In [23]:
%%time
# Smoke test: predict for 100 customers drawn at random (with replacement)
# from the transaction rows, so customers with more rows are drawn more often.
chid_pool = df['chid'].values
answer_list = []
for _ in tqdm(range(100)):
    chid = np.random.choice(chid_pool)
    prediction = predict_function(chid)
    answer_list.append(prediction)
answer_list

100%|██████████| 100/100 [00:02<00:00, 35.17it/s]

Wall time: 2.89 s





[['2', '21', '15'],
 ['2', '12', '6'],
 ['36', '18', '26'],
 ['19', '15', '21'],
 ['25', '48', '15'],
 ['10', '18', '13'],
 ['15', '48', '19'],
 ['37', '15', '21'],
 ['48', '13', '2'],
 ['10', '2', '19'],
 ['48', '19', '6'],
 ['48', '19', '39'],
 ['19', '37', '39'],
 ['2', '37', '13'],
 ['18', '37', '10'],
 ['37', '10', '25'],
 ['10', '19', '36'],
 ['21', '37', '2'],
 ['10', '6', '48'],
 ['2', '15', '19'],
 ['19', '15', '39'],
 ['10', '19', '6'],
 ['39', '10', '2'],
 ['39', '10', '21'],
 ['21', '2', '18'],
 ['26', '6', '10'],
 ['2', '37', '15'],
 ['13', '10', '12'],
 ['19', '39', '2'],
 ['19', '48', '2'],
 ['19', '39', '6'],
 ['26', '19', '36'],
 ['15', '6', '2'],
 ['19', '15', '21'],
 ['12', '10', '2'],
 ['39', '2', '21'],
 ['15', '21', '19'],
 ['21', '10', '48'],
 ['39', '19', '15'],
 ['2', '18', '6'],
 ['48', '39', '10'],
 ['18', '26', '36'],
 ['37', '26', '18'],
 ['26', '12', '2'],
 ['10', '19', '36'],
 ['37', '19', '2'],
 ['25', '19', '12'],
 ['26', '12', '10'],
 ['2', '39', '10']

In [24]:
# Sanity check: average number of distinct tags per prediction (3.0 means no duplicates).
np.mean([len(np.unique(prediction)) for prediction in answer_list])

3.0