In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import matplotlib.pyplot as plt
import os
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
pd.options.display.float_format = '{:.4f}'.format


def get_row(n_total, n_cols) :
    """Return how many grid rows are needed to place ``n_total`` items in ``n_cols`` columns.

    The result is the quotient ``n_total / n_cols`` rounded up, as an ``int``.
    """
    quotient, remainder = divmod(n_total, n_cols)
    if remainder:
        # A partially filled last row still needs a row of its own.
        return int(quotient + 1)
    return int(n_total / n_cols)
def visualize_articles(articles , article_list, n_total , n_cols , figsize=(25,10)) :
    """Show a grid of article panels, one per entry of ``article_list``.

    Parameters
    ----------
    articles : DataFrame passed through to ``visualize_article`` (looked up by
        ``article_id`` there).
    article_list : iterable of article ids to draw, in display order.
    n_total : number of panels the caller wants; the grid is enlarged when
        ``article_list`` holds more entries than this.
    n_cols : number of grid columns.
    figsize : figure size forwarded to ``plt.subplots``.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    article_list = list(article_list)
    # Size the grid for whichever count is larger so every article gets a panel
    # even when the caller under-reports n_total; at least one panel keeps
    # plt.subplots from being asked for a zero-row grid.
    n_panels = max(n_total, len(article_list), 1)
    n_rows = get_row(n_panels , n_cols)
    # squeeze=False always returns a 2-D array of Axes. Without it a 1x1 grid
    # comes back as a bare Axes object, which has no .flatten().
    fig, ax = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    axes = ax.flatten()
    for i, article in enumerate(article_list):
        visualize_article(axes , i , articles , article)
    # Panels beyond the last article would otherwise show an empty frame.
    for unused in axes[len(article_list):]:
        unused.axis("off")
    plt.show()
        
def visualize_article(axes,i, articles , article) :
    """Draw one article (image, id as title, description as x-label) on ``axes[i]``.

    Parameters
    ----------
    axes : indexable collection of matplotlib Axes.
    i : position in ``axes`` to draw on.
    articles : DataFrame with ``article_id`` and ``detail_desc`` columns.
    article : article id; also used to build the image file path.

    A missing description (no matching row, or a non-string value such as NaN)
    is rendered as an empty label. A missing or unreadable image leaves a
    placeholder text in the panel instead of raising.
    """
    matches = articles.loc[articles['article_id'] == article, 'detail_desc']
    desc = matches.iloc[0] if len(matches) else ""
    # read_csv yields NaN (a float) for empty fields, which has no .split().
    if not isinstance(desc, str):
        desc = ""
    # Append a line break to every fifth word so long descriptions wrap under the panel.
    desc = ' '.join(
        word + '\n' if j > 0 and j % 5 == 0 else word
        for j, word in enumerate(desc.split(' '))
    )
    try :
        img = mpimg.imread(f'../input/h-and-m-personalized-fashion-recommendations/images/0{str(article)[:2]}/0{int(article)}.jpg')
    except (OSError, ValueError) :
        # Best effort: not every article is guaranteed to have a readable image
        # file, and a non-numeric id cannot be turned into a path. Say so in the
        # panel rather than failing the whole grid or silently drawing nothing.
        axes[i].text(0.5, 0.5, 'image not available', ha='center', va='center',
                     transform=axes[i].transAxes)
    else :
        axes[i].imshow(img)
    # A single empty list removes the ticks regardless of how the installed
    # matplotlib interprets a second positional argument.
    axes[i].set_xticks([])
    axes[i].set_yticks([])
    axes[i].set_title(article)
    axes[i].grid(False)
    axes[i].set_xlabel(desc, fontsize=10)
    
# Absolute locations of the competition CSV files under /kaggle/input.
# NOTE(review): other cells in this notebook reach input data through the
# relative "../input/" prefix instead; the two only agree while the working
# directory is /kaggle/working — consider using one convention.
data_submission_path = "/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv"
data_article_path = "/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv"
data_transaction_path = "/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv"
data_customer_path = "/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv"


In [None]:
pd.read_csv(data_article_path,nrows=1).T

# Text Preprocessing

In [None]:
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string 
import numpy as np

def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``.

    A value that compares unequal to itself (i.e. NaN, as pandas uses for a
    missing cell) is mapped to the empty string.
    """
    if text != text :
        # Only NaN-like values are unequal to themselves.
        return ""
    return "".join(ch for ch in text if ch not in string.punctuation)

def tokenization(text):
    r"""Split ``text`` with the regular expression ``'W+'`` and return the pieces.

    NOTE(review): the pattern matches runs of the literal capital letter ``W``,
    not the non-word class ``\W+``. Text without a capital W therefore comes
    back as a one-element list holding the whole string. A later cell in this
    notebook unpacks exactly that shape with ``x[0].split(" ")``, so switching
    the pattern to ``r'\W+'`` must be done together with that cell.
    """
    splitter = re.compile('W+')
    return splitter.split(text)

def remove_stopwords(text):
    """Return the elements of ``text`` that are not in the module-level ``stopwords`` list.

    ``text`` is iterated element by element, so it is expected to be a
    sequence of tokens; order is preserved.
    """
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept


def stemming(text):
    """Apply the module-level ``porter_stemmer`` to every element of ``text``.

    Returns a new list with one stemmed entry per input element.
    """
    stemmed = []
    for word in text:
        stemmed.append(porter_stemmer.stem(word))
    return stemmed

def lemmatizer(text):
    """Apply the module-level ``wordnet_lemmatizer`` to every element of ``text``.

    Returns a new list with one lemmatized entry per input element.
    """
    lemmas = []
    for word in text:
        lemmas.append(wordnet_lemmatizer.lemmatize(word))
    return lemmas

# Module-level helpers read by stemming(), lemmatizer() and remove_stopwords().
# NOTE(review): the stopword list (and presumably the WordNet data used by the
# lemmatizer) must already be available to nltk, e.g. via nltk.download —
# confirm on the target environment.
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
# NOTE: this name shadows nothing imported here, but it is a plain list of words,
# not the nltk.corpus.stopwords reader it is built from.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
articles = pd.read_csv(data_article_path)

In [None]:
articles['detail_desc']= articles['detail_desc'].apply(lambda x:remove_punctuation(x))
articles['detail_desc'].head()

In [None]:
articles['detail_desc']= articles['detail_desc'].apply(lambda x: x.lower())
articles['detail_desc'].head()

In [None]:
articles['detail_desc']= articles['detail_desc'].apply(lambda x: tokenization(x))
articles['detail_desc'].head()

In [None]:
articles['detail_desc']= articles['detail_desc'].apply(lambda x: remove_stopwords(x))
articles['detail_desc'].head()

In [None]:
articles['detail_desc']= articles['detail_desc'].apply(lambda x: stemming(x))
articles['detail_desc'].head()

In [None]:
articles['detail_desc']= articles['detail_desc'].apply(lambda x: lemmatizer(x))
articles['detail_desc'].head()

In [None]:
# Split every token on spaces so each description becomes a flat list of words.
# Unlike indexing x[0], this keeps all tokens, tolerates an empty list, and gives
# the same result when the cell is executed more than once.
articles['detail_desc'] = articles['detail_desc'].apply(
    lambda tokens: [word for token in tokens for word in token.split(" ")]
)

In [None]:
articles_name = articles.filter(regex="name$|detail")
articles_name.head()

In [None]:
def get_list(x) :
    """Flatten the values of ``x`` into a single list.

    ``x`` must expose ``.values.tolist()`` (e.g. a DataFrame row). String
    values are appended as they are, list values are spliced in element by
    element, and values of any other type are skipped.
    """
    collected = []
    for value in x.values.tolist() :
        if isinstance(value, str) :
            collected.append(value)
        elif isinstance(value, list) :
            collected.extend(value)
    return collected

articles_name_list = articles_name.apply(lambda x : get_list(x), axis=1)

In [None]:
articles_name_list[0]

# Word2Vec

In [None]:
# from gensim.models import Word2Vec
# model = Word2Vec(sentences=articles_name_list, vector_size=50, window=5, min_count=1, workers=4)
# model.save("word2vec.model")
# sims = model.wv.most_similar('Black', topn=10) 
# sims

# Sentence-Transformer

In [None]:
!pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

## Embedding

In [None]:
sentence_embeddings  = sbert_model.encode(" ".join(articles_name_list[0]))
sentence_embeddings.shape

In [None]:
article_sentences = [" ".join(article) for article in articles_name_list]

## Save

In [None]:
# result = sbert_model.encode(article_sentences)
# np.save("/kaggle/working/embedding",result)

In [None]:
# np.load("/kaggle/working/embedding.npy").shape

Download

In [None]:
# from IPython.display import FileLink
# import os
# os.chdir(r'/kaggle/working')
# FileLink(r'embedding.npy')

MY GOOGLE DRIVE LINK

https://drive.google.com/file/d/1AAI8Bws_9rustIWPoCvz9I3rC7GLsI4J/view?usp=sharing


## Load

In [None]:
import numpy as np

embedding_path = "../input/article-embedding/embedding.npy"
embedding_vector = np.load(embedding_path)

In [None]:
from tqdm import tqdm

# Cosine Similarity

In [None]:
def cosine(u, v):
    """Return the cosine similarity of ``u`` and ``v``.

    Computed as their dot product divided by the product of their norms.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return np.dot(u, v) / norm_product
from sklearn.metrics.pairwise import cosine_similarity

def get_cosine_similarity(embedding_vector,idx) :
    """Return the cosine similarity of row ``idx`` against every row of ``embedding_vector``.

    The entry at position ``idx`` (the row compared with itself) is overwritten
    with 0 before the 1-D score array is returned.
    """
    # The list index keeps the query two-dimensional, as cosine_similarity expects.
    query = embedding_vector[[idx],]
    scores = cosine_similarity(query , embedding_vector)[0]
    scores[idx] = 0
    return scores


def get_best_similiarity(embedding_vector,idx, best_n = 3) :
    """Return the row indices of the ``best_n`` rows most similar to row ``idx``.

    Parameters
    ----------
    embedding_vector : 2-D array passed to ``get_cosine_similarity``.
    idx : row index of the query.
    best_n : how many neighbours to return; values <= 0 give an empty array.

    Returns
    -------
    Array of row indices ordered from most to least similar. The query row
    itself is never included.
    """
    emb_cosine = get_cosine_similarity(embedding_vector , idx)
    ranked = emb_cosine.argsort()[::-1]
    # Drop the query explicitly: its score is only set to 0, which would still
    # outrank any row with a negative similarity.
    query_pos = idx % len(emb_cosine)
    ranked = ranked[ranked != query_pos]
    # Slicing from the front avoids the [-0:] pitfall, where best_n=0 would
    # select the whole array instead of nothing.
    return ranked[:max(best_n, 0)]
    

In [None]:
articles = pd.read_csv(data_article_path)

In [None]:
check_idx = 0
top_articles = get_best_similiarity(embedding_vector , check_idx , best_n=5)

# Cosine Similarity (top 6)

In [None]:
# Query article first, then its neighbours in descending-similarity order.
# Positional selection keeps that order; a boolean mask built with index.isin
# would return the rows in table order and lose the ranking.
best_articles = articles.iloc[[check_idx] + top_articles.tolist()]
visualize_articles(articles, best_articles['article_id'].tolist() ,n_total = len(best_articles) ,n_cols=6 )

In [None]:
# Fixed query rows (positional indices into `articles`) so the figures are reproducible.
# To draw one random query per index group instead, use for example:
#   articles.groupby('index_group_name').sample(1, random_state=0).index.tolist()
article_candidates = [58100, 105333, 40738, 23079, 36520]
for check_idx in article_candidates :
    top_articles = get_best_similiarity(embedding_vector , check_idx , best_n=5)
    # Positional selection keeps the neighbours in descending-similarity order.
    best_articles = articles.iloc[top_articles.tolist()]
    print(best_articles["article_id"].tolist())

    criterion = articles.iloc[check_idx]['article_id']

    # The query article is shown first, so the panel count includes it.
    shown_ids = [criterion] + best_articles['article_id'].tolist()
    visualize_articles(articles, shown_ids ,n_total = len(shown_ids) ,n_cols=6 )

In [None]:
!pip install umap-learn

# UMAP 

In [None]:
import umap

In [None]:
# mapper = umap.UMAP().fit(embedding_vector)

In [None]:
# import joblib
# filename = 'umap_mapper.sav'
# joblib.dump(mapper, filename)

In [None]:
# Offer 'umap_mapper.sav' from the working directory as a download link.
# NOTE(review): the joblib.dump call that would create this file is commented
# out in this notebook, so on a fresh run the link presumably points at a file
# that does not exist — confirm, or comment this cell out as well.
from IPython.display import FileLink
import os
# Changes the working directory for every later cell that uses a relative path.
os.chdir(r'/kaggle/working')
FileLink(r'umap_mapper.sav')

In [None]:
import joblib
# Load a previously saved mapper object (presumably the fitted UMAP model from
# the commented-out fit/dump cells) instead of refitting it here.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load files from a trusted source.
filename = '../input/umapmapper/umap_mapper.sav'
mapper = joblib.load(filename)


In [None]:
articles.filter(regex="name$")

In [None]:
import umap.plot

# UMAP Visualization 2D

In [None]:
umap.plot.points(mapper, color_key_cmap='Paired', background='black')
plt.show()

In [None]:
umap.plot.points(mapper, labels=articles.index_group_name, color_key_cmap='Paired', background='black')
plt.show()

In [None]:
umap.plot.points(mapper, labels=articles.index_name, color_key_cmap='Paired', background='black')
plt.show()