In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from ast import literal_eval
from sklearn.feature_extraction import text
from tqdm.notebook import tqdm

### data load

In [3]:
# Read the preprocessed train and test CSVs; missing cells are replaced by ''.
train, test = (
    pd.read_csv(csv_path).fillna('')
    for csv_path in ('./data/preprocess_train.csv', './data/preprocess_test.csv')
)

In [4]:
# Independent (deep) copy of the training frame, used below as the evaluation frame.
# NOTE(review): this copies `train`, not `test`, although the result is later saved
# as './data/paccr_drmm_test.csv' — confirm this is intended.
test_df = train.copy(deep=True)

### generate_pairwise_dataset

In [6]:
def generate_pairwise_dataset(df):
    """Build (query, better title, worse title) pairs from labelled rows.

    For every distinct query, each row whose ``median_relevance`` is 4, 3 or 2
    is paired with every row of the same query whose relevance is strictly
    lower (down to 1).

    Args:
        df: DataFrame containing at least the columns ``query_preprocessed``,
            ``product_title_preprocessed`` and ``median_relevance``.

    Returns:
        DataFrame with the columns listed in ``columns`` below, one row per
        (positive, negative) pair. It is empty (but keeps those columns) when
        no pair exists. Row labels come from the individual merges and are
        not reset, so they repeat across pairs.
    """
    columns = ['query_preprocessed',
               'product_title_preprocessed_P',
               'product_title_preprocessed_N',
               'median_relevance_P',
               'median_relevance_N']

    # Collect the per-pair frames and concatenate once at the end instead of
    # growing a DataFrame inside the loop.
    pair_frames = []
    for query in tqdm(df['query_preprocessed'].unique()):
        query_rows = df[df['query_preprocessed'] == query]
        # Split the query's rows by relevance once; levels that do not occur
        # are simply absent from this mapping.
        rows_by_relevance = {
            relevance: rows
            for relevance, rows in query_rows.groupby('median_relevance', sort=False)
        }
        for positive in (4, 3, 2):
            P_temp = rows_by_relevance.get(positive)
            if P_temp is None:
                # No rows with this relevance for the query: nothing to pair.
                continue
            # Every strictly lower relevance, from positive - 1 down to 1.
            for negative in range(positive - 1, 0, -1):
                N_temp = rows_by_relevance.get(negative)
                if N_temp is None:
                    # No rows with this relevance for the query: nothing to pair.
                    continue
                temp = pd.merge(P_temp, N_temp, how='inner',
                                on='query_preprocessed', suffixes=('_P', '_N'))
                pair_frames.append(temp[columns])

    if not pair_frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(pair_frames)

In [7]:
# Pairwise (positive title, negative title) rows built from the training data.
new_df = generate_pairwise_dataset(df=train)

HBox(children=(FloatProgress(value=0.0, max=261.0), HTML(value='')))




### word2vec model train

In [8]:
def _row_to_tokens(row):
    """Join a row's query, title and description with spaces and split on ' '."""
    joined = "%s %s %s" % (row['query_preprocessed'],
                           row['product_title_preprocessed'],
                           row['product_description_preprocessed'])
    return joined.split(' ')


# NOTE(review): `train_eda` is not defined in any cell visible in this notebook;
# it must already exist in the kernel for this cell to run — confirm its source.
sentences_train_eda = train_eda.apply(_row_to_tokens, axis=1)
sentences_test = test.apply(_row_to_tokens, axis=1)
sentences_train = train.apply(_row_to_tokens, axis=1)

# Training corpus: train, then test, then train_eda token lists.
sentences = pd.concat((sentences_train, sentences_test, sentences_train_eda), axis=0)

# Skip-gram (sg=1) Word2Vec over the combined corpus.
model = Word2Vec(
    sentences=sentences,
    size=100,
    window=4,
    min_count=1,
    workers=4,
    iter=100,
    sg=1,
)

In [9]:
# Sanity check of the embeddings with the woman + king - man analogy.
result = model.wv.most_similar(positive=['woman', 'king'], negative=['man'])
top_word, top_score = result[0]
print("{}: {:.4f}".format(top_word, top_score))

queen: 0.5155


## DRMM

### generate sim hist for drmm

In [10]:
def q_D_hist(q, D, bins=30):
    """Log-count histogram of similarities between one query term and a document.

    Each space-separated token of ``D`` is compared with ``q`` through
    ``model.wv.similarity``. The similarity is placed in one of ``bins``
    buckets whose lower edges are ``i / bins`` for ``i = 0 .. bins - 1``;
    similarities below 0 are counted in the first bucket and similarities at
    or above the last edge in the last bucket.

    Args:
        q: A single query term.
        D: Document text; tokens are separated by single spaces.
        bins: Number of histogram buckets.

    Returns:
        List of ``bins`` values: the natural log of each bucket count. Counts
        start at 1, so an empty bucket yields 0.
    """
    hist = np.ones((bins))
    bin_width = 1 / bins
    for d in D.split(' '):
        # The similarity does not depend on the bucket index, so it is
        # computed once per token rather than once per bucket.
        sim = model.wv.similarity(q, d)
        # Number of bucket lower edges that `sim` reaches.
        idx = sum(1 for i in range(bins) if sim >= bin_width * i)
        # idx == 0 means sim is below every edge; such values go to bucket 0.
        hist[max(idx - 1, 0)] += 1
    return list(np.log(hist))

def Q_D_hist(Q, D):
    """Return one 30-bin ``q_D_hist`` histogram per space-separated term of ``Q``.

    Args:
        Q: Query text; terms are separated by single spaces.
        D: Document text, passed unchanged to ``q_D_hist``.

    Returns:
        List with one histogram (itself a list) per query term, in query order.
    """
    return [q_D_hist(term, D, bins=30) for term in Q.split(' ')]

In [11]:
# Cache keyed by query, then by title; starts with an empty inner dict per query.
query_sim = {Q: {} for Q in new_df['query_preprocessed'].unique()}

In [13]:
# Histogram for every distinct (query, positive title) pair, stored in the cache.
positive_pairs = new_df[['query_preprocessed', 'product_title_preprocessed_P']].drop_duplicates()
for _, row in tqdm(positive_pairs.iterrows()):
    query_text = row['query_preprocessed']
    title_text = row['product_title_preprocessed_P']
    query_sim[query_text][title_text] = Q_D_hist(query_text, title_text)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [14]:
# Histogram for every distinct (query, negative title) pair, stored in the cache.
negative_pairs = new_df[['query_preprocessed', 'product_title_preprocessed_N']].drop_duplicates()
for _, row in tqdm(negative_pairs.iterrows()):
    query_text = row['query_preprocessed']
    title_text = row['product_title_preprocessed_N']
    query_sim[query_text][title_text] = Q_D_hist(query_text, title_text)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [15]:
# Hook tqdm into pandas; the cells below rely on `progress_apply`.
tqdm.pandas()

  from pandas import Panel


In [16]:
# Look up the cached histogram of each row's (query, positive title) pair.
new_df['positive_hist'] = new_df.progress_apply(
    lambda pair: query_sim[pair['query_preprocessed']][pair['product_title_preprocessed_P']],
    axis=1,
)

HBox(children=(FloatProgress(value=0.0, max=54985.0), HTML(value='')))




In [17]:
# Look up the cached histogram of each row's (query, negative title) pair.
new_df['negative_hist'] = new_df.progress_apply(
    lambda pair: query_sim[pair['query_preprocessed']][pair['product_title_preprocessed_N']],
    axis=1,
)

HBox(children=(FloatProgress(value=0.0, max=54985.0), HTML(value='')))




In [43]:
# Histograms for the evaluation frame are computed row by row (no cache).
test_df['hist'] = test_df.progress_apply(
    lambda record: Q_D_hist(record['query_preprocessed'], record['product_title_preprocessed']),
    axis=1,
)

HBox(children=(FloatProgress(value=0.0, max=10158.0), HTML(value='')))




### calculate query idf value

In [45]:
# sklearn's English stop words plus markup/unit tokens, the empty string and single digits.
# NOTE(review): the next assignment rebuilds `stop_words` from ENGLISH_STOP_WORDS again,
# so the extra tokens listed here never reach the vectorizer — confirm whether
# `stop_words.union(...)` was intended.
stop_words = text.ENGLISH_STOP_WORDS.union(['http', 'www', 'img', 'border', 'color', 'style', 'padding', 'table', 'font', '',
                                            'thi', 'inch', 'ha', 'width', 'height', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'])
# Effective stop words: sklearn's English list united with NLTK's English list.
stop_words = text.ENGLISH_STOP_WORDS.union(set(stopwords.words('english')))
# One document per training row: query and title joined by a single space.
tf_sentences = list(train.apply(lambda x: ("%s %s"%(x['query_preprocessed'], x['product_title_preprocessed'])),  axis=1))
# Fit TF-IDF over word 1- to 3-grams with min_df=7, smoothed idf and sublinear tf.
tfv = text.TfidfVectorizer(min_df=7, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                           ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
                           stop_words=stop_words).fit(tf_sentences)

# Feature strings indexed by column position; read by get_ifidf_for_words.
feature_names = tfv.get_feature_names()    

In [46]:
def get_ifidf_for_words(text):
    """Map each vectorizer feature present in ``text`` to its TF-IDF weight.

    Args:
        text: A single document string, transformed with the fitted ``tfv``.

    Returns:
        Dict from feature name (taken from ``feature_names``) to the non-zero
        weight that ``tfv.transform`` assigns to it for this document.
    """
    dense_row = tfv.transform([text]).todense()
    # nonzero() yields (row indices, column indices); only the columns matter.
    _, active_columns = dense_row[0, :].nonzero()
    return {feature_names[col]: dense_row[0, col] for col in active_columns}

In [47]:
# For every distinct training query, the TF-IDF weight of each of its terms, in order.
# The lookup is a plain dict index, so a term missing from the weights raises KeyError.
query_idf_dict = {}
for query_sentences in tqdm(train['query_preprocessed'].unique()):
    idf_dict = get_ifidf_for_words(query_sentences)
    query_terms = query_sentences.split(' ')
    query_idf_dict[query_sentences] = list(map(idf_dict.__getitem__, query_terms))

HBox(children=(FloatProgress(value=0.0, max=261.0), HTML(value='')))




In [48]:
# Per-term weights of each row's query, looked up from query_idf_dict.
new_df['query_idf'] = new_df.progress_apply(
    lambda pair: query_idf_dict[pair['query_preprocessed']],
    axis=1,
)
# Number of space-separated terms in each row's query.
new_df['query_len'] = new_df.progress_apply(
    lambda pair: len(pair['query_preprocessed'].split(' ')),
    axis=1,
)

HBox(children=(FloatProgress(value=0.0, max=54985.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=54985.0), HTML(value='')))




In [49]:
# Per-term weights of each row's query, looked up from query_idf_dict.
test_df['query_idf'] = test_df.progress_apply(
    lambda record: query_idf_dict[record['query_preprocessed']],
    axis=1,
)
# Number of space-separated terms in each row's query.
test_df['query_len'] = test_df.progress_apply(
    lambda record: len(record['query_preprocessed'].split(' ')),
    axis=1,
)

HBox(children=(FloatProgress(value=0.0, max=10158.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10158.0), HTML(value='')))




## PACRR

### calculate hyperparameters for PACRR

In [51]:
# Start a fresh cache (query -> title -> value), this time for similarity matrices.
query_sim = {Q: {} for Q in new_df['query_preprocessed'].unique()}

In [52]:
# Largest number of space-separated terms in any distinct query (0 if there are none).
lq = max(
    (len(query_text.split(' ')) for query_text in new_df['query_preprocessed'].unique()),
    default=0,
)
lq

6

In [53]:
# Mean number of space-separated tokens over the distinct positive titles.
positive_titles = new_df['product_title_preprocessed_P'].unique()
len_sum = sum(len(title.split(' ')) for title in positive_titles)
len_mean = len_sum/len(positive_titles)

In [54]:
# firstk = int((mean positive title length + mean negative title length) / 2),
# with lengths counted in space-separated tokens over the distinct titles.
# Both means are computed in this cell so that re-running it gives the same
# value; accumulating onto a `len_mean` left by an earlier cell would grow
# on every re-run.
positive_titles = new_df['product_title_preprocessed_P'].unique()
negative_titles = new_df['product_title_preprocessed_N'].unique()
positive_mean = sum(len(title.split(' ')) for title in positive_titles) / len(positive_titles)
len_sum = sum(len(title.split(' ')) for title in negative_titles)
len_mean = positive_mean + len_sum / len(negative_titles)
firstk = int(len_mean/2)
firstk

8

### generate sim matrix for PACRR

In [55]:
def q_D_matrix(q, D):
    """Similarities between one query term and the first ``firstk`` document tokens.

    Args:
        q: A single query term.
        D: Document text; tokens are separated by single spaces.

    Returns:
        List of length ``firstk``. Position ``i`` holds
        ``model.wv.similarity(q, token_i)`` for the i-th document token, or 0
        when the document has fewer than ``i + 1`` tokens or the lookup raises
        ``KeyError``.
    """
    col = [0] * firstk
    # Use the configured width rather than a fixed count, so every column of
    # the row can be filled and a smaller `firstk` cannot overrun the row.
    for i, d in enumerate(D.split(' ')[:firstk]):
        try:
            col[i] = model.wv.similarity(q, d)
        except KeyError:
            # Term not known to the embedding model: keep the 0 padding.
            pass
    return col

def Q_D_matrix(Q, D):
    """Query-by-document similarity matrix with ``lq`` rows of ``firstk`` values.

    Args:
        Q: Query text; terms are separated by single spaces. Only the first
            ``lq`` terms are used.
        D: Document text, passed unchanged to ``q_D_matrix``.

    Returns:
        List of ``lq`` rows. Row ``i`` is ``q_D_matrix`` of the i-th query
        term; rows beyond the query length are all zeros.
    """
    # Truncate to lq so a query longer than lq cannot index past the matrix.
    matrix = [q_D_matrix(q, D) for q in Q.split(' ')[:lq]]
    # Build each padding row separately so the rows never share one list object.
    matrix.extend([0] * firstk for _ in range(lq - len(matrix)))
    return matrix

In [56]:
# Similarity matrix for every distinct (query, positive title) pair, stored in the cache.
positive_pairs = new_df[['query_preprocessed', 'product_title_preprocessed_P']].drop_duplicates()
for _, row in tqdm(positive_pairs.iterrows()):
    query_text = row['query_preprocessed']
    title_text = row['product_title_preprocessed_P']
    query_sim[query_text][title_text] = Q_D_matrix(query_text, title_text)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [57]:
# Similarity matrix for every distinct (query, negative title) pair, stored in the cache.
negative_pairs = new_df[['query_preprocessed', 'product_title_preprocessed_N']].drop_duplicates()
for _, row in tqdm(negative_pairs.iterrows()):
    query_text = row['query_preprocessed']
    title_text = row['product_title_preprocessed_N']
    query_sim[query_text][title_text] = Q_D_matrix(query_text, title_text)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [58]:
# Look up the cached similarity matrices of each row's negative and positive title.
new_df['negative_sim_matrix'] = new_df.progress_apply(
    lambda pair: query_sim[pair['query_preprocessed']][pair['product_title_preprocessed_N']],
    axis=1,
)
new_df['positive_sim_matrix'] = new_df.progress_apply(
    lambda pair: query_sim[pair['query_preprocessed']][pair['product_title_preprocessed_P']],
    axis=1,
)

HBox(children=(FloatProgress(value=0.0, max=54985.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=54985.0), HTML(value='')))




In [59]:
# Similarity matrices for the evaluation frame are computed row by row (no cache).
test_df['sim_matrix'] = test_df.progress_apply(
    lambda record: Q_D_matrix(record['query_preprocessed'], record['product_title_preprocessed']),
    axis=1,
)

HBox(children=(FloatProgress(value=0.0, max=10158.0), HTML(value='')))




### calculate softmax query idf value

In [61]:
def idf_softmax(x, lq=6):
    """Softmax over ``x`` after right-padding it with zeros to length ``lq``.

    Args:
        x: Sequence of at most ``lq`` numeric weights.
        lq: Length of the padded vector.

    Returns:
        List of ``lq`` values that sum to 1. Padded positions enter the
        softmax with value 0, so they receive a non-zero share as well.
    """
    padded = np.zeros(lq)
    padded[:len(x)] = x
    exp_scores = np.exp(padded)
    return list(exp_scores / np.sum(exp_scores, axis=0))

In [62]:
# Softmax-normalised query term weights for every pair row.
new_df['idf_softmax'] = new_df.progress_apply(
    lambda pair: idf_softmax(pair['query_idf']),
    axis=1,
)

HBox(children=(FloatProgress(value=0.0, max=54985.0), HTML(value='')))




In [63]:
# Softmax-normalised query term weights for every evaluation row.
test_df['idf_softmax'] = test_df.progress_apply(
    lambda record: idf_softmax(record['query_idf']),
    axis=1,
)

HBox(children=(FloatProgress(value=0.0, max=10158.0), HTML(value='')))




### Save DataFrame

In [None]:
# new_df.to_csv('./data/paccr_drmm.csv', index=False)

In [None]:
# new_df.to_csv('./data/paccr_drmm_train.csv', index=False)

In [65]:
# Write the pairwise frame to CSV without the index column.
new_df.to_csv('./data/paccr_drmm_.csv', index=False)

In [66]:
# Write the evaluation frame to CSV without the index column.
test_df.to_csv('./data/paccr_drmm_test.csv', index=False)