# 1) Import library

In [12]:
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from transformers import AutoModel
from sklearn.metrics.pairwise import cosine_similarity

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

Using cuda device


# 2) import files

In [13]:
data = pd.read_csv('https://anime-recommendation-engine.s3.eu-west-3.amazonaws.com/data/animes_clean.csv')
data.head()

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83,https://cdn.myanimelist.net/images/anime/6/867...,https://myanimelist.net/anime/34599/Made_in_Abyss
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084,4,1.0,9.23,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']","Jan 6, 2017",1.0,214621,502,22.0,8.83,https://cdn.myanimelist.net/images/anime/3/815...,https://myanimelist.net/anime/31758/Kizumonoga...


In [14]:
df = data[['uid', 'synopsis']]

In [15]:
df.head()

Unnamed: 0,uid,synopsis
0,28891,Following their participation at the Inter-Hig...
1,23273,Music accompanies the path of the human metron...
2,34599,The Abyss—a gaping chasm stretching down into ...
3,5114,"""In order for something to be obtained, someth..."
4,31758,After helping revive the legendary vampire Kis...


# 3) preprocess

In [16]:
# remove special character
df['synopsis_clean'] = df['synopsis'].apply(lambda doc : re.sub("[^A-Za-z]+", " ", str(doc)).lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['synopsis_clean'] = df['synopsis'].apply(lambda doc : re.sub("[^A-Za-z]+", " ", str(doc)).lower())


# 4) Embedding

# MiniLM-L6-v2  (1min53)

In [17]:
# pre-trained model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# encode and store embeddings in df
df['synopsis_embedding'] = df['synopsis_clean'].apply(lambda x: model.encode(x))

# export for compute saving

df[['uid', 'synopsis_embedding']].to_json('../data/synopsis_embedding_MiniLM.json', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['synopsis_embedding'] = df['synopsis_clean'].apply(lambda x: model.encode(x))


# jina emb = 1024    (12min45 with cuda)

In [None]:
# Initialize the model
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True).to('cuda')

# Function to get embeddings
def get_embeddings(text):
    embeddings = model.encode([text], task="text-matching")
    return embeddings[0]

# Apply the function to each row and store the embeddings in a new column
df['synopsis_embedding'] = df['synopsis_clean'].apply(get_embeddings)

# export for compute saving
df[['uid', 'synopsis_embedding']].to_json('../data/synopsis_embedding_jina_1024.json', index=False)

# jina emb = 512  (13min24 with cuda)

In [None]:
# Initialize the model
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True, truncate_dim=512).to('cuda')

# Function to get embeddings
def get_embeddings(text):
    embeddings = model.encode([text], task="text-matching")
    return embeddings[0]

# Apply the function to each row and store the embeddings in a new column
df['synopsis_embedding'] = df['synopsis_clean'].apply(get_embeddings)

# export for compute saving
df[['uid', 'synopsis_embedding']].to_json('../data/synopsis_embedding_jina_512.json', index=False)

# 4) app

In [None]:
# load MiniLM
df_MiniLM = pd.read_json('https://anime-recommendation-engine.s3.eu-west-3.amazonaws.com/data/synopsis_embedding_MiniLM.json')

# load jina_1024
df_jina_1024= pd.read_json('https://anime-recommendation-engine.s3.eu-west-3.amazonaws.com/data/synopsis_embedding_jina_1024.json')

# load jina_512
df_jina_512 = pd.read_json('https://anime-recommendation-engine.s3.eu-west-3.amazonaws.com/data/synopsis_embedding_jina_512.json')

In [20]:
def search_closest_by_uid(given_uid, df, filter):

        # Find the embedding corresponding to the given UID
        given_embedding = df.loc[df['uid'] == given_uid, filter].values[0]

        # cosine similarity : given embedding VS all embeddings
        similarities = cosine_similarity([given_embedding], list(df[filter]))[0]

        # Store similarity
        similarity_df = pd.DataFrame({'uid': df['uid'], 'similarity': similarities})

        # filter by similarity. given_uid exclude
        closest = similarity_df[similarity_df['uid'] != given_uid].sort_values(by='similarity', ascending=False).head(5)

        return closest

In [None]:

given_uid = 28891
filter = 'synopsis_embedding'

# call the function and store in DF
result_df = pd.DataFrame(search_closest_by_uid(given_uid, df_MiniLM, filter), columns=['uid','similarity'])
print('MiniLM:')
display(result_df)


# call the function and store in DF
result_df = pd.DataFrame(search_closest_by_uid(given_uid, df_jina_1024, filter), columns=['uid','similarity'])
print('jina_1024:')
display(result_df)

# call the function and store in DF
result_df = pd.DataFrame(search_closest_by_uid(given_uid, df_jina_512, filter), columns=['uid','similarity'])
print('jina_512:')
display(result_df)


MiniLM:


Unnamed: 0,uid,similarity
10,32935,0.798315
4134,33031,0.723976
13914,40679,0.7051
735,20583,0.670796
15449,37379,0.642352


jina_512:


Unnamed: 0,uid,similarity
10,32935,0.753521
16149,16051,0.684005
648,16894,0.675362
11122,29603,0.645879
13914,40679,0.64423


jina_1024:


Unnamed: 0,uid,similarity
10,32935,0.74979
16149,16051,0.682818
648,16894,0.670774
11122,29603,0.638269
13914,40679,0.636505
