# Read Article Data

In [None]:
import cudf

# Location of the article metadata table shipped with the competition data.
ARTICLES_CSV = "../input/h-and-m-personalized-fashion-recommendations/articles.csv"

# Read the table with cuDF, report its (rows, columns) shape and preview the first rows.
df = cudf.read_csv(ARTICLES_CSV)
print(df.shape)
df.head()

# Find Categorical Columns to Represent The Data

In [None]:
# Select the columns to one-hot encode: int64 columns with at most 500
# distinct values. Every column's dtype and cardinality is printed for review.
ohe_columns = []
total = 0  # sum of distinct-value counts over the selected columns

for col in df.columns:
    column = df[col]
    # Compute the cardinality once per column and reuse it for the test,
    # the running total and the report line.
    n_unique = len(column.unique())

    if column.dtype == "int64" and n_unique <= 500:
        ohe_columns.append(col)
        total += n_unique

    print(col, column.dtype, n_unique)


print("Columns to use:", ohe_columns)

# One-Hot-Encoding

In [None]:
# One-hot encode the selected categorical columns and keep only the resulting
# values as an array (presumably a CuPy array, since it is later passed to
# cupy.hstack -- TODO confirm).
V = cudf.get_dummies(df[ohe_columns], columns=ohe_columns).values
# Notebook display: (number of articles, number of one-hot features).
V.shape

# TF-IDF for Article Description

In [None]:
from cuml.feature_extraction.text import TfidfVectorizer

# Articles without a description get the placeholder token "nodesc" so that
# every row yields a string for the vectorizer.
descriptions = df["detail_desc"].fillna("nodesc")

# min_df=3: ignore terms that appear in fewer than three descriptions.
tfidf = TfidfVectorizer(min_df=3)
V_desc = tfidf.fit_transform(descriptions)
V_desc.shape

# Represent The Articles as Vectors of size 512

In [None]:
from cuml import TruncatedSVD
import cupy


# Width of the final article embedding (number of SVD components kept).
EMB_SIZE = 512

# Concatenate the one-hot features and the densified TF-IDF features
# column-wise into a single feature matrix. The one-hot block is cast to
# float32 first (presumably to match the TF-IDF dtype -- TODO confirm).
# V is rebound in place, so the separate one-hot matrix is no longer referenced.
V = cupy.hstack([V.astype("float32"), V_desc.todense()])

# Fit a truncated SVD with a fixed seed so the projection is reproducible.
svd = TruncatedSVD(n_components=EMB_SIZE, random_state=0)
svd.fit(V)
# Summing the per-component ratios gives the share of variance the
# EMB_SIZE components retain together.
print("Explained variance ratio:", svd.explained_variance_ratio_.sum().item())

# Save The Article Embeddings

In [None]:
# Project the full feature matrix onto the fitted SVD components; V is rebound
# to the reduced embeddings (one row per article).
V = svd.transform(V)
print(V.shape)

# Persist the embeddings to "articles.npy" in the working directory.
cupy.save("articles.npy", V)

# Get similar article examples

In [None]:
from cuml.neighbors import NearestNeighbors


# Ask for two neighbours per article under cosine distance: normally the
# article itself plus its closest other article.
matcher = NearestNeighbors(n_neighbors=2, metric="cosine")
matcher.fit(V)


distances, indices = matcher.kneighbors(V)

# The query article is usually returned in column 0, but when another article
# has a (near-)identical embedding the tie can put the self-match in column 1.
# Blindly taking column 1 would then pair an article with itself. Instead pick,
# per row, the nearest neighbour whose index differs from the row:
#   - self in column 0  -> best other match is column 1
#   - self not in col 0 -> column 0 is already another article (and the closest)
row_ids = cupy.arange(indices.shape[0])
self_is_first = indices[:, 0] == row_ids

d = cupy.where(self_is_first, distances[:, 1], distances[:, 0])
idx = cupy.where(self_is_first, indices[:, 1], indices[:, 0])

In [None]:
# Article row positions ordered by ascending best-match distance, so
# position 0 refers to the article with the closest match.
sorted_examples = cupy.argsort(d)

def get_example(i):
    """Show the article pair at rank ``i`` of the best-match ordering.

    ``i`` indexes ``sorted_examples`` (0 is the smallest match distance).
    Prints the match score, computed as ``1 - distance`` rounded to two
    decimals, and returns a transposed pandas frame holding the article
    and its best match, one article per column.
    """
    query_row = sorted_examples[i]
    match_row = idx[query_row]

    score = cupy.round(1 - d[query_row], 2)
    print("Match score:", score)

    pair = df.iloc[[query_row, match_row]]
    return pair.to_pandas().T

### A perfect match (probably a duplicate article)

In [None]:
# Rank 0: the pair with the smallest match distance.
get_example(0)

### An average match

In [None]:
# Middle of the ranking: a pair whose match distance is the median one.
middle_rank = len(df) // 2
get_example(middle_rank)