# Project NLP features with BERT  
My hypothesis is that the products a customer buys have similar features. To test this, I transform 'detail_desc' into a distributed representation with BERT, reduce its dimensionality with PCA, and project it onto a 2D space. 

I use the Euclidean distance to measure how far each product's features are from the center point.

$$d(\mathbf{x}, \mathbf{y}) = \sqrt{(x_1 - y_1)^2 + \cdots + (x_n - y_n)^2}$$

**Result**  
- I projected the features of the products a customer bought onto a 2D space, but the features were not similar. In this notebook I project all products together; it might be better to split the products by category before projecting.  
- To make a submission, I implemented the center-point calculation for all customers in my local environment. It took a very long time (about eight hundred hours...), possibly just because of my poor implementation.

## Import Library  

In [None]:
from typing import List

import numpy as np
import pandas as pd
from scipy.spatial import distance
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import torch
import transformers
from transformers import BertTokenizer

%matplotlib inline

## Load articles data  

In [None]:
# Read articles.csv into a DataFrame and preview its first rows.
articles = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/articles.csv")
articles.head()

Fill NaN values and check the maximum length of 'detail_desc' before feeding it to BERT.

In [None]:
# Missing descriptions become empty strings, so they count as zero words below.
articles['detail_desc'] = articles['detail_desc'].fillna('')
# Number of whitespace-separated words in each description.
articles['len_desc'] = articles['detail_desc'].str.split().str.len()

display(articles['len_desc'])
longest_desc = articles['len_desc'].max()
shortest_desc = articles['len_desc'].min()
print(f"max describe length: {longest_desc}")
print(f"min describe length: {shortest_desc}")

### Helper functions  
The BERT vectorizer class is based on this article (in Japanese).  
URL: https://zenn.dev/koukyo1994/articles/9b1da2482d8ba1  

In [None]:
class BertSequenceVectorizer:
    """Turn a sentence into a fixed-size vector with a pretrained BERT model.

    The vector returned by :meth:`vectorize` is the last-layer hidden state at
    sequence position 0 (the [CLS] token).
    """

    def __init__(self, model_name="bert-base-uncased", max_len=128):
        """Load tokenizer and model, and move the model to GPU when available.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both the
                tokenizer and the model.
            max_len: Exact number of token ids fed to the model per sentence;
                longer inputs are cut, shorter ones are padded.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        # The model is only used for inference, so make evaluation mode explicit.
        self.bert_model.eval()
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.ndarray:
        """Return the position-0 hidden state of ``sentence`` as a 1-D array.

        Args:
            sentence: Text to encode.

        Returns:
            A NumPy array holding the last hidden state at position 0
            (768 values for BERT-base, per the original author's note).
        """
        # Cut to max_len, then right-pad with id 0 and mask the padding out.
        # NOTE(review): a plain cut presumably drops the trailing [SEP] token
        # of over-long inputs -- confirm whether that matters for your data.
        token_ids = self.tokenizer.encode(sentence)[:self.max_len]
        n_pad = self.max_len - len(token_ids)
        inputs = token_ids + [0] * n_pad
        masks = [1] * len(token_ids) + [0] * n_pad

        inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)

        # No gradients are needed for feature extraction; skipping autograd
        # avoids building a graph for every sentence.
        with torch.no_grad():
            bert_out = self.bert_model(input_ids=inputs_tensor,
                                       attention_mask=masks_tensor)
        seq_out = bert_out['last_hidden_state']

        # Index 0 is the [CLS] token. ``.cpu()`` is a no-op on CPU tensors, so
        # one code path serves both devices.
        return seq_out[0][0].detach().cpu().numpy()

Transform 'detail_desc' into a distributed representation and reduce its dimensionality with PCA.

In [None]:
def tokenize(df: pd.DataFrame, columns: List[str], max_len_list: List[int],
             model_class=BertSequenceVectorizer, num_component=64):
    """Embed text columns with BERT and compress each embedding with PCA.

    Args:
        df: Frame containing ``article_id`` and every column in ``columns``.
        columns: Names of the text columns to embed.
        max_len_list: Token length for each column, paired with ``columns``
            by position.
        model_class: Vectorizer class; it is instantiated once per column with
            ``model_name="bert-base-cased"`` and must offer ``vectorize(text)``.
        num_component: Number of PCA components kept per column.

    Returns:
        A frame with ``article_id`` followed by ``{column}_feature{i}``
        columns, ``i`` running from 0 to ``num_component - 1``.
    """
    source = df[["article_id"] + columns]
    token_df = pd.DataFrame({"article_id": source["article_id"]})

    for max_len, column in zip(max_len_list, columns):
        vectorizer = model_class(model_name="bert-base-cased", max_len=max_len)
        # One embedding row per input row, stacked into a 2-D matrix.
        embeddings = np.stack([vectorizer.vectorize(text) for text in source[column]])

        # A separate PCA is fitted for each column.
        reduced = PCA(n_components=num_component).fit_transform(embeddings)
        names = [f"{column}_feature{i}" for i in range(num_component)]
        token_df[names] = reduced

    return token_df

In [None]:
# Embed every description and keep 2 PCA components (takes a very long time).
tokenize_df = tokenize(articles, ['detail_desc'], [128], num_component=2)

# articles already has article_id, so append only the feature columns.
articles = pd.concat([articles, tokenize_df.drop(columns="article_id")], axis=1)
articles

## Load transactions data  

In [None]:
# Read transactions_train.csv into a DataFrame and preview its first rows.
# NOTE(review): ids are parsed with pandas' default dtype inference, the same
# as for articles.csv above, so the two tables compare like with like.
transactions = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")
transactions.head()

Pick one customer ID to plot the features of the products that customer bought.

In [None]:
# The customer of the row labelled 0 serves as the example.
ex_customer = transactions.at[0, 'customer_id']
# All transactions made by that customer.
ex_transaction = transactions[transactions['customer_id'] == ex_customer]

In [None]:
plt.figure(figsize=(12, 12))

# Flag the example customer's articles with a single membership test instead
# of rescanning the whole articles table once per purchased article.
articles['ex_flag'] = articles['article_id'].isin(ex_transaction['article_id']).astype(int)

# Split once and reuse the two subsets for plotting and for the centroid.
others = articles.query("ex_flag == 0")
bought = articles.query("ex_flag == 1")

plt.scatter(others['detail_desc_feature0'],
            others['detail_desc_feature1'],
            label="Others")
plt.scatter(bought['detail_desc_feature0'],
            bought['detail_desc_feature1'],
            label="ex customer bought article")

# Centroid of the articles bought by the example customer.
center_x = bought['detail_desc_feature0'].mean()
center_y = bought['detail_desc_feature1'].mean()
plt.scatter(center_x, center_y, label="center point")

plt.title("article dist")
plt.xlabel("detail desc feature 0")
plt.ylabel("detail desc feature 1")
plt.legend()

plt.show()

In [None]:
def plot_customer_dist(ex_customer):
    """Scatter-plot all articles and highlight those bought by one customer.

    Reads the module-level ``transactions`` and ``articles`` frames. As a side
    effect it overwrites ``articles['ex_flag']`` (1 for articles this customer
    bought, 0 otherwise) and shows a matplotlib figure.

    Args:
        ex_customer: ``customer_id`` value to look up in ``transactions``.
    """
    plt.figure(figsize=(12, 12))

    ex_transaction = transactions.query("customer_id == @ex_customer")
    # One membership test replaces a full scan of ``articles`` per purchase.
    articles['ex_flag'] = articles['article_id'].isin(ex_transaction['article_id']).astype(int)

    # Split once and reuse the subsets for plotting and for the centroid.
    others = articles.query("ex_flag == 0")
    bought = articles.query("ex_flag == 1")

    plt.scatter(others['detail_desc_feature0'],
                others['detail_desc_feature1'],
                label="Others")
    plt.scatter(bought['detail_desc_feature0'],
                bought['detail_desc_feature1'],
                label="ex customer bought article")

    # calculate features center point
    center_x = bought['detail_desc_feature0'].mean()
    center_y = bought['detail_desc_feature1'].mean()
    plt.scatter(center_x, center_y, label="center point")

    plt.title(f"article dist {ex_customer}")
    plt.xlabel("detail desc feature 0")
    plt.ylabel("detail desc feature 1")
    plt.legend()

    plt.show()

Plot the product features and their center point for several customers.  

In [None]:
# Distinct customer ids found in the transactions table.
customer_id_list = transactions['customer_id'].unique()
# Draw one figure for each of the first ten ids.
for ex_customer in customer_id_list[:10]:
    plot_customer_dist(ex_customer)

In [None]:
def no_pca_tokenize(df: pd.DataFrame, columns: List[str], max_len_list: List[int],
                    model_class=BertSequenceVectorizer):
    """Embed text columns with BERT and keep the embeddings at full width.

    Args:
        df: Frame containing ``article_id`` and every column in ``columns``.
        columns: Names of the text columns to embed.
        max_len_list: Token length for each column, paired with ``columns``
            by position.
        model_class: Vectorizer class; it is instantiated once per column with
            ``model_name="bert-base-cased"`` and must offer ``vectorize(text)``.

    Returns:
        A frame with ``article_id`` followed by ``{column}_feature{i}``
        columns, one per embedding dimension.
    """
    df = df[["article_id"] + columns]
    token_df = pd.DataFrame({"article_id": df["article_id"]})

    for max_len, column in zip(max_len_list, columns):
        model = model_class(model_name="bert-base-cased", max_len=max_len)
        features = np.stack([model.vectorize(text) for text in df[column]])

        # Take the width from the data rather than assuming 768, so that a
        # vectorizer with a different hidden size also works.
        feature_columns = [f"{column}_feature{i}" for i in range(features.shape[1])]

        # Attach all feature columns in one concat; inserting hundreds of
        # columns one at a time fragments the frame.
        feature_df = pd.DataFrame(features, columns=feature_columns, index=token_df.index)
        token_df = pd.concat([token_df, feature_df], axis=1)

    return token_df

# take very long time
# NOTE: this rebinds tokenize_df, replacing the PCA-reduced frame built
# earlier with the un-reduced BERT features.
tokenize_df = no_pca_tokenize(articles, columns=['detail_desc'], max_len_list=[128])

Try to make a prediction for one customer.  

In [None]:
ex_customer = customer_id_list[0]
# Look up this customer's purchases here rather than relying on an
# ``ex_transaction`` left over from an earlier cell.
ex_transaction = transactions.query("customer_id == @ex_customer")
bought_ids = ex_transaction['article_id']

# Flag the purchased articles in both frames, each by its own article_id.
articles['ex_flag'] = articles['article_id'].isin(bought_ids).astype(int)
tokenize_df['ex_flag'] = tokenize_df['article_id'].isin(bought_ids).astype(int)

# Every column other than the id and the flag is an embedding dimension.
feature_columns = [c for c in tokenize_df.columns if c not in ("article_id", "ex_flag")]
feature_matrix = tokenize_df[feature_columns].to_numpy()

# Center point (mean embedding) of the articles the customer bought.
ex_customer_center = feature_matrix[tokenize_df['ex_flag'].to_numpy() == 1].mean(axis=0)

# Euclidean distance from every article to the center, computed in one
# vectorised call instead of a Python loop over all rows. Building the column
# directly as floats also avoids writing floats into an int column.
distance_df = pd.DataFrame({
    "article_id": articles['article_id'].values,
    "distance": np.linalg.norm(feature_matrix - ex_customer_center, axis=1),
})

distance_df

In [None]:
# Ascending order puts the articles nearest to the customer's center first.
distance_df = distance_df.sort_values("distance")
# The first 12 article ids are the example prediction.
ex_pred = distance_df['article_id'].iloc[:12].to_numpy()

print(ex_pred)