In [1]:
import os
import bert
import numpy as np
import pandas as pd
import tensorflow as tf 
import tensorflow.keras.backend as K

from ast import literal_eval
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
from sklearn.feature_extraction import text

tqdm.pandas()

  from pandas import Panel


### data load

In [2]:
# Read the training CSV, replace missing values with empty strings and keep
# only the three columns the rest of the notebook works with.
train = (
    pd.read_csv('./data/train.csv')
    .fillna('')
    [['query', 'product_title', 'median_relevance']]
)
# Fixed token-sequence length; the BERT input layers are declared with it.
max_seq_len = 87

### sentences encoding

In [3]:
# Name of the pretrained BERT checkpoint. The checkpoint files are looked up
# in <top dir>/<model_name>/<model_name>/ relative to the working directory.
model_name = 'uncased_L-4_H-256_A-4'
# Build the directory with os.path.join so the separator follows the host OS
# (the previous literal hard-coded Windows backslashes). On Windows the
# resulting string is identical to the old one.
# NOTE(review): the top-level directory is '.models' (no slash), unlike
# './data' and './model_weights' used elsewhere in this notebook - confirm.
model_dir = os.path.join('.models', model_name, model_name)
model_ckpt = os.path.join(model_dir, "bert_model.ckpt")
model_config = os.path.join(model_dir, "bert_config.json")
vocab_file = os.path.join(model_dir, "vocab.txt")

# Validate the lower-casing flag against the checkpoint, then build the
# word-piece tokenizer from the checkpoint's vocabulary with the same flag.
bert.bert_tokenization.validate_case_matches_checkpoint(do_lower_case=True, init_checkpoint=model_ckpt)
tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case=True)

In [4]:
# Tokenize the query and the product title of every row.
train['query_token'] = train['query'].progress_apply(tokenizer.tokenize)
train['product_title_token'] = train['product_title'].progress_apply(tokenizer.tokenize)
# Assemble the paired sequence: [CLS] query tokens [SEP] title tokens [SEP].
train['token'] = train.progress_apply(
    lambda row: ["[CLS]", *row['query_token'], "[SEP]", *row['product_title_token'], "[SEP]"],
    axis=1,
)
# Map the assembled tokens to their vocabulary ids.
train['token_ids'] = train['token'].progress_apply(tokenizer.convert_tokens_to_ids)

HBox(children=(FloatProgress(value=0.0, max=10158.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10158.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10158.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10158.0), HTML(value='')))




### calculate query idf value

In [5]:
# One "document" per row: '[CLS] ' followed by the space-joined tokens of
# "<query> <product_title>".
tf_sentences = list(train.apply(lambda x: '[CLS] ' + ' '.join(tokenizer.tokenize("%s %s"%(x['query'], x['product_title']))),  axis=1))

# Fit a unigram TF-IDF model on those documents; terms seen in fewer than
# 7 documents are dropped (min_df=7).
tfv = text.TfidfVectorizer(min_df=7, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                           ngram_range=(1,1), use_idf=True, smooth_idf=True, sublinear_tf=True).fit(tf_sentences)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back for older releases. Either way the result is a
# plain list of feature strings.
if hasattr(tfv, 'get_feature_names_out'):
    feature_names = tfv.get_feature_names_out().tolist()
else:
    feature_names = tfv.get_feature_names()

In [6]:
def get_ifidf_for_words(text):
    """Return one TF-IDF weight per space-separated token of ``text``.

    The whole string is transformed once with the fitted module-level
    vectorizer ``tfv``. Each token is then transformed on its own to find
    the vocabulary column it maps to (the first non-zero column), and the
    sentence-level weight in that column is collected.

    Note: inside this function the parameter ``text`` shadows the
    ``sklearn.feature_extraction.text`` module imported at the top of the file.

    Args:
        text: A string of tokens separated by single spaces.

    Returns:
        A list with exactly one entry per element of ``text.split(' ')``.
        Tokens that produce no vocabulary feature (empty token, punctuation,
        term filtered out by ``min_df``) get ``0.0`` instead of raising
        ``IndexError``, so the output stays aligned with the tokens.
    """
    # Weights of the complete sentence (a single dense row).
    tfidf_matrix = tfv.transform([text]).todense()

    weights = []
    for token in text.split(' '):
        # Column indices that are non-zero when the token is vectorized alone.
        token_columns = tfv.transform([token]).todense()[0, :].nonzero()[1]
        if len(token_columns) == 0:
            # Token is not represented in the fitted vocabulary.
            weights.append(0.0)
        else:
            weights.append(tfidf_matrix[0, token_columns[0]])
    return weights

In [7]:
# Compute the per-token weights once per distinct query, then broadcast them
# to every row through a dictionary lookup.
query_idf_dict = {
    query_text: get_ifidf_for_words('[CLS] '+' '.join(tokenizer.tokenize(query_text)))
    for query_text in tqdm(train['query'].unique())
}

train['query_idf'] = train['query'].progress_apply(lambda query_text: query_idf_dict[query_text])
# The lookup table is no longer needed once the column has been filled.
del query_idf_dict

HBox(children=(FloatProgress(value=0.0, max=261.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10158.0), HTML(value='')))




### calculate query idf softmax

In [8]:
# lq: longest tokenized query plus one; firstk: mean tokenized title length,
# truncated to an integer.
lq = 1 + train['query_token'].map(len).max()
firstk = int(train['product_title_token'].map(len).mean())
print(lq, firstk)

8 13


In [9]:
def idf_softmax(x, lq=8):
    """Zero-pad ``x`` to length ``lq`` and return the softmax of the result.

    The maximum is subtracted before exponentiating, so large inputs no
    longer overflow to ``inf`` / ``nan``; for ordinary inputs the result is
    the same up to floating-point rounding.

    Note that the padded positions hold 0 *before* the softmax, so they
    receive a non-zero share of the probability mass (exp(0) = 1).

    Args:
        x: Sequence of at most ``lq`` numbers.
        lq: Length of the returned vector.

    Returns:
        A 1-D ``numpy`` array of length ``lq`` that sums to 1.

    Raises:
        ValueError: If ``x`` has more than ``lq`` elements (the slice
            assignment cannot broadcast).
    """
    output = np.zeros(lq)
    output[:len(x)] = x
    if output.size == 0:
        # Nothing to normalize; keep returning an empty array for lq == 0.
        return output
    # Shifting by the max leaves the softmax unchanged but keeps exp() finite.
    exps = np.exp(output - np.max(output))
    return exps / np.sum(exps, axis=0)

In [10]:
# Pass the computed `lq` explicitly instead of relying on idf_softmax's
# hard-coded default of 8, so the vector length follows the data.
train['idf_softmax'] = train.progress_apply(lambda x: idf_softmax(x['query_idf'], lq=lq).tolist(), axis=1)

HBox(children=(FloatProgress(value=0.0, max=10158.0), HTML(value='')))




### bert sim

In [11]:
# Rebuild the BERT layer from the parameters stored with the pretrained
# checkpoint directory.
bert_params = bert.params_from_pretrained_ckpt(model_dir)
bert_layer = bert.BertModelLayer.from_params(bert_params, name="bert")

# Two int32 inputs of fixed length max_seq_len: token ids and token-type ids.
input_ids = tf.keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
token_type_ids = tf.keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="token_type_ids")

output = bert_layer([input_ids, token_type_ids])

# Keep only sequence position 0 (the [CLS] slot of the sequences built in
# this notebook) and feed it to a 4-unit softmax head.
# NOTE(review): 4 units presumably match the relevance levels 1-4 - confirm.
cls_out = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :], name='CLSTokenSliceLayer')(output)
logits = tf.keras.layers.Dense(units=4, activation='softmax')(cls_out)

model = tf.keras.models.Model(inputs=[input_ids, token_type_ids], outputs=logits)
model.build(input_shape=[(None, max_seq_len), (None, max_seq_len)])
# Load previously saved weights into this architecture.
model.load_weights("./model_weights/class_weight_cross_entropy_Mini6.h5") 

In [12]:
# Print a notice when Keras reports a truthy learning phase.
if tf.keras.backend.learning_phase():
    print('change learning_phase')

inp = model.input  # NOTE(review): `inp` is not used in the visible cells.
# For i in 0..3, build a sub-model that maps the full model's inputs to the
# output of the i-th element of an inner layer list, reached through private
# `_layers` attributes.
# NOTE(review): presumably these are the four encoder blocks of the L-4
# checkpoint - confirm; the private attribute path may differ between
# Keras / bert-for-tf2 versions.
layers_output = {}
for i in range(4):
    layers_output[i] = tf.keras.models.Model([model.input], [model._layers[2]._layers[1]._layers[0][i].output])
#     layers_output[i] = K.function([model.input], [model._layers[2]._layers[1]._layers[0][i].output])

In [13]:
def cos_sim(A, B):
    """Return the cosine similarity of two vectors.

    Args:
        A: 1-D array-like.
        B: 1-D array-like of the same length as ``A``.

    Returns:
        ``dot(A, B) / (|A| * |B|)``. When either vector has zero norm the
        similarity is undefined; ``0.0`` is returned instead of the ``nan``
        (plus RuntimeWarning) the plain division would produce.
    """
    denom = np.linalg.norm(A) * np.linalg.norm(B)
    if denom == 0:
        return 0.0
    return np.dot(A, B) / denom

In [14]:
def _pad(x, max_seq_len=87):
    """Right-pad the list ``x`` with zeros and return it as a one-row array.

    Args:
        x: A Python list of numbers.
        max_seq_len: Target length. Lists already at least this long are
            returned unpadded (and untruncated).

    Returns:
        A ``numpy`` array of shape ``(1, max(len(x), max_seq_len))``.
    """
    n_missing = max_seq_len - len(x)
    # A non-positive count yields an empty list, i.e. no padding.
    filler = [0] * n_missing
    return np.array(x + filler).reshape(1, -1)

In [15]:
def each_layer_sim(x, layer):
    """Build the query-by-title cosine-similarity matrix for one layer.

    ``x`` is a token-id list laid out as ``[CLS] query [SEP] title [SEP]``.
    The ids are padded, run through ``layer`` and the resulting per-position
    embeddings are compared pairwise.

    Args:
        x: List of token ids. It must contain the id 102, which is used as
            the separator (assumed to be the ``[SEP]`` id of the vocabulary -
            confirm against vocab.txt).
        layer: Object whose ``predict([ids, type_ids])[0]`` is indexable by
            token position.

    Returns:
        ``numpy`` array of shape ``(lq, firstk)`` (module-level globals).
        Row ``i`` is sequence position ``i`` before the first separator
        (position 0 is ``[CLS]``); column ``k`` is the ``k``-th title token.
        Titles longer than ``firstk`` are truncated; unused cells stay 0.

    Raises:
        ValueError: If 102 does not occur in ``x``.
    """
    sep_idx = x.index(102)
    end_idx = len(x) - 1  # position of the trailing separator
    # Segment ids: 0 up to and including the first separator, 1 afterwards.
    token_ids = [0] * (sep_idx + 1) + [1] * (end_idx - sep_idx)
    # NOTE(review): _pad is called with its default length (87), which only
    # matches the model inputs while max_seq_len is also 87.
    embed = layer.predict([_pad(x), _pad(token_ids)])[0]

    sim_matrix = np.zeros(shape=(lq, firstk))
    n_query_rows = min(lq, sep_idx)
    title_start = sep_idx + 1
    # Stop before the trailing separator and never write past column
    # firstk - 1. The explicit bound replaces the former bare
    # `except: pass`, which relied on IndexError to truncate long titles and
    # also hid any unrelated error.
    title_stop = min(end_idx, title_start + firstk)
    for i in range(n_query_rows):
        for j in range(title_start, title_stop):
            sim_matrix[i][j - title_start] = cos_sim(embed[i], embed[j])
    return sim_matrix

#### PACRR sim matrix

In [16]:
# For every row, stack the similarity matrices of the four layer sub-models
# and store the result as a nested list.
train['sim_matrix'] = train['token_ids'].progress_apply(
    lambda ids: np.stack([each_layer_sim(ids, layers_output[k]) for k in range(4)]).tolist()
)

HBox(children=(FloatProgress(value=0.0, max=10158.0), HTML(value='')))




#### DRMM sim hist

In [17]:
def integer_indexing_nonzero(arr):
    """Return the non-zero entries of the array ``arr`` as a flat array."""
    nonzero_positions = np.nonzero(arr)
    return arr[nonzero_positions]

def _hist(arr, bins=30):
    """Log-count histogram of the non-zero entries of ``arr``.

    The range [0, 1] is cut into ``bins`` equal-width bins with left edges
    ``k / bins``. A value lands in the last bin whose left edge it reaches;
    values below 0 are counted in the first bin. Every bin starts at 1, so an
    empty bin contributes ``log(1) = 0``.

    Args:
        arr: ``numpy`` array of values; exact zeros are ignored.
        bins: Number of bins.

    Returns:
        A list of ``bins`` floats: the natural log of each (1-based) count.
    """
    counts = np.ones((bins))
    width = 1 / (bins)
    for value in integer_indexing_nonzero(arr):
        # Number of left edges that `value` has reached.
        edges_reached = sum(1 for k in range(bins) if value >= width * k)
        # No edge reached (value < 0) falls back to the first bin.
        slot = max(edges_reached - 1, 0)
        counts[slot] += 1
    return np.log(counts).tolist()

def DRMM_hist(x):
    """Compute one similarity histogram per query position of a row.

    Args:
        x: Mapping / row with ``'query_token'`` (list of query tokens) and
            ``'sim_matrix'`` (nested list indexable as [layer][row][col]).

    Returns:
        A list of ``len(x['query_token']) + 1`` histograms, one per row index
        of the similarity matrices; each histogram is computed by ``_hist``
        over that row across all layers.
    """
    # Convert the nested list once instead of on every loop iteration.
    sim = np.array(x['sim_matrix'])
    return [_hist(sim[:, i, :]) for i in range(len(x['query_token']) + 1)]

In [18]:
# Histogram features for every row; DRMM_hist receives the row itself.
train['drmm_hist'] = train.progress_apply(DRMM_hist, axis=1)

HBox(children=(FloatProgress(value=0.0, max=10158.0), HTML(value='')))




### generate_pairwise_dataset

In [19]:
def generate_pairwise_dataset(df):
    """Build (positive, negative) product pairs for every query.

    For each query, rows with a higher ``median_relevance`` are paired with
    rows of every lower level: (4 vs 3, 2, 1), (3 vs 2, 1) and (2 vs 1).
    A pairing is the cross product of the two groups; columns of the
    higher-rated row get the suffix ``_P``, those of the lower-rated row
    ``_N``.

    Args:
        df: DataFrame with the columns ``query``, ``product_title``,
            ``median_relevance``, ``drmm_hist``, ``sim_matrix``,
            ``query_idf`` and ``idf_softmax``.

    Returns:
        DataFrame with the columns listed in ``columns`` below, where
        ``query_idf_P`` / ``idf_softmax_P`` are renamed to ``query_idf`` /
        ``idf_softmax``. The index is the concatenation of the per-pairing
        indices and is therefore not unique. An empty frame with the same
        columns is returned when no pair exists.

    Raises:
        KeyError: If ``df`` lacks one of the required columns. (Previously
            a bare ``except`` swallowed this and returned an empty frame.)
    """
    columns = ['query',
               'product_title_P',
               'product_title_N',
               'median_relevance_P',
               'median_relevance_N',
               'drmm_hist_P',
               'drmm_hist_N',
               'sim_matrix_P',
               'sim_matrix_N',
               'query_idf_P',
               'idf_softmax_P']
    renamed = {'query_idf_P': 'query_idf', 'idf_softmax_P': 'idf_softmax'}

    pair_frames = []
    for query in tqdm(df['query'].unique()):
        # Split this query's rows by relevance level once.
        by_relevance = {
            level: rows
            for level, rows in df[df['query'] == query].groupby('median_relevance')
        }
        # Relevance pairs (4 - 3, 2, 1), (3 - 2, 1), (2 - 1): six in total.
        for positive in [4, 3, 2]:
            P_temp = by_relevance.get(positive)
            if P_temp is None:
                # This query has no row with this relevance level; skip it.
                continue
            for negative in range(positive - 1, 0, -1):
                N_temp = by_relevance.get(negative)
                if N_temp is None:
                    # No row with the lower relevance level; skip it.
                    continue
                merged = pd.merge(P_temp, N_temp, how='inner', on='query', suffixes=('_P', '_N'))
                pair_frames.append(merged[columns])

    # Concatenate once at the end instead of growing a frame inside the loop.
    if pair_frames:
        new_df = pd.concat(pair_frames)
    else:
        new_df = pd.DataFrame(columns=columns)
    return new_df.rename(columns=renamed)

In [20]:
# Build the pairwise frame and give it a fresh 0..n-1 index.
new_df = generate_pairwise_dataset(train).reset_index(drop=True)

HBox(children=(FloatProgress(value=0.0, max=261.0), HTML(value='')))




### Save DataFrame

In [21]:
# Write the pairwise frame to CSV without the index column.
new_df.to_csv(path_or_buf='./data/paccr_drmm_bert_all.csv', index=False)

In [22]:
# Write the per-row feature frame (`train`) to CSV without the index column.
train.to_csv(path_or_buf='./data/paccr_drmm_bert_test_all.csv', index=False)

In [23]:
# new_df.to_csv('./data/paccr_drmm_bert.csv', index=False)

In [24]:
# test_df.to_csv('./data/paccr_drmm_bert_test.csv', index=False)