# 1) Import libraries

In [None]:
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import torch
# Prefer the GPU when CUDA is usable; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using {device} device")

# 2) Import data files

In [None]:
# Download the cleaned anime table and discard the "Unnamed: 0" column
# in the same expression; the last line previews the result.
data = pd.read_csv(
    'https://anime-recommendation-engine.s3.eu-west-3.amazonaws.com/data/animes_clean.csv'
).drop(columns="Unnamed: 0")
data.head()

In [None]:
# Keep only the two columns needed for the embeddings. The explicit copy makes
# `df` an independent frame, so columns assigned to it later are written to
# `df` itself rather than to a slice of `data` (avoids pandas'
# SettingWithCopyWarning).
df = data[['uid', 'synopsis']].copy()

In [None]:
# Preview the first rows of df.
df.head()

# 3) preprocess

In [None]:
# Strip everything that is not an ASCII letter.
_NON_LETTERS = re.compile("[^A-Za-z]+")


def _letters_only(doc):
    """Replace each run of non-letter characters with one space and lower-case the text.

    The value goes through str() first, so non-string entries are cleaned
    in their text form.
    """
    return _NON_LETTERS.sub(" ", str(doc)).lower()


df['synopsis_clean'] = df['synopsis'].apply(_letters_only)

In [None]:
# Load the pre-trained sentence-embedding model by name.
model = SentenceTransformer(
    model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
)

In [None]:
# Encode every cleaned synopsis and store one embedding vector per row.
# The whole column is passed to the model in a single call so it can batch the
# sentences internally, instead of running one separate encode call per row.
# encode() returns a 2-D array for a list input; list() splits it back into
# one 1-D vector per row, matching the row order of df.
df['synopsis_embedding'] = list(model.encode(df['synopsis_clean'].tolist()))

In [None]:
# Export uid + embedding so the (expensive) encoding step need not be re-run.
# orient='records' writes one JSON object per row and leaves the index out.
# The previous index=False is not supported by pandas together with the
# default 'columns' orient, so the orient is stated explicitly instead.

df[['uid', 'synopsis_embedding']].to_json('../data/synopsis_embedding.json', orient='records')

# 4) app

In [None]:
# Read the precomputed uid/embedding table from its hosted JSON file.
data2 = pd.read_json(
    path_or_buf='https://anime-recommendation-engine.s3.eu-west-3.amazonaws.com/data/synopsis_embedding.json',
)

In [None]:
# Preview the first rows of the loaded table.
data2.head()

In [None]:
def search_closest_by_uid(given_uid, df, filter, top_n=10):
    """Return the rows of ``df`` whose embeddings are closest to ``given_uid``'s.

    Closeness is the cosine similarity between the embedding stored for
    ``given_uid`` and every embedding in the ``filter`` column.

    Args:
        given_uid: Value to look up in ``df['uid']``.
        df: DataFrame with a ``'uid'`` column and an embedding column.
        filter: Name of the column of ``df`` that holds one embedding vector
            per row. (The name shadows the built-in ``filter``; it is kept so
            existing keyword calls keep working.)
        top_n: Maximum number of neighbours to return. Defaults to 10, the
            value that used to be hard-coded.

    Returns:
        DataFrame with columns ``'uid'`` and ``'similarity'``, sorted by
        descending similarity, without the rows whose uid equals
        ``given_uid``.

    Raises:
        IndexError: If ``given_uid`` does not occur in ``df['uid']``.
    """
    # Find the embedding corresponding to the given UID.
    matches = df.loc[df['uid'] == given_uid, filter]
    if matches.empty:
        # Same exception type as the bare `.values[0]` lookup raised before,
        # but with a message that names the actual problem.
        raise IndexError(f"uid {given_uid!r} not found in df['uid']")
    given_embedding = matches.iloc[0]

    # Cosine similarity: given embedding vs. all embeddings (one row of scores).
    similarities = cosine_similarity([given_embedding], list(df[filter]))[0]

    # Pair every uid with its score; the frame keeps df's index.
    similarity_df = pd.DataFrame({'uid': df['uid'], 'similarity': similarities})

    # Drop the query item itself, then keep the best `top_n` matches.
    others = similarity_df[similarity_df['uid'] != given_uid]
    return others.sort_values(by='similarity', ascending=False).head(top_n)

In [None]:

# Query settings: the uid to look up, the table of embeddings, and the
# column of that table that holds the vectors.
given_uid, df, filter = 24415, data2, 'synopsis_embedding'

# Run the search and show the matches as a uid/similarity table.
result_df = pd.DataFrame(
    data=search_closest_by_uid(given_uid=given_uid, df=df, filter=filter),
    columns=['uid', 'similarity'],
)
display(result_df)