## 1) Import library

In [127]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_mistralai import ChatMistralAI
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel
from typing import Optional

import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

Load files

In [2]:
# load Animes
df_animes = pd.read_csv("https://anime-recommendation-engine.s3.eu-west-3.amazonaws.com/data/animes_clean.csv")

In [102]:
# load synopsis
df_synopsis_embedding = pd.read_json('https://anime-recommendation-engine.s3.eu-west-3.amazonaws.com/data/synopsis_embedding.json')

In [151]:
df_synopsis_embedding_with_genre = df_synopsis_embedding.merge(df_animes[["uid", "genre"]], on="uid", how="inner")
df_synopsis_embedding_with_genre

Unnamed: 0,uid,synopsis_embedding,genre
0,28891,"[-0.0668233037, 0.0767305419, 0.0433372557, -0...","['Comedy', 'Sports', 'Drama', 'School', 'Shoun..."
1,23273,"[-0.0701287091, -0.034392152, 0.0321356915, -0...","['Drama', 'Music', 'Romance', 'School', 'Shoun..."
2,34599,"[-0.0709408671, 0.035861913100000004, 0.012324...","['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F..."
3,5114,"[-0.1376934946, 0.07189235840000001, -0.014974...","['Action', 'Military', 'Adventure', 'Comedy', ..."
4,31758,"[-0.10980290170000001, 0.0310311895, -0.072967...","['Action', 'Mystery', 'Supernatural', 'Vampire']"
...,...,...,...
16211,10075,"[-0.0983884782, 0.0313754827, 0.0304597318, -0...","['Action', 'Comedy', 'Super Power', 'Martial A..."
16212,35828,"[-0.1292088181, 0.0889061615, 0.03028058450000...","['Slice of Life', 'Comedy', 'Supernatural']"
16213,10378,"[-0.0585749559, -0.0027970744000000002, 0.1407...","['Slice of Life', 'Comedy', 'Shounen']"
16214,33082,"[-0.0837064683, 0.0939229131, -0.0272170231, -...",['Action']


## 2) LLM - ChatPromptTemplate - Positive and Negative element separator

In [137]:
# Output format
class OutputSchema(BaseModel):
    positive: str
    negative: str
    title: Optional[str]

parser = PydanticOutputParser(pydantic_object=OutputSchema)
format_instructions = parser.get_format_instructions()


# Prompt string 
sys_prompt="""
You are a positive and negative element extractor.

Analyze the user's sentence and extract:
- what the user wants (positive),
- what the user explicitly wants to avoid (negative).

If the user mentions a well-known title (such as an anime, movie, game, etc.) in what they want to avoid, extract it separately.

Return your response as a JSON object with three fields:
- positive: a single string summarizing with key-words what the user wants.
- negative: a single string summarizing with key-words what the user wants to avoid.
- title: the name of the title the user wants to avoid, if any (e.g., an anime, show, movie); return `null` if none is found.

{{format_instructions}}
"""

# Define system prompt
start_prompt = ChatPromptTemplate.from_messages([
    ("system", sys_prompt),
    ("user", "{text}")
])

In [None]:
# Don't forget your API Key
%env MISTRAL_API_KEY=

In [138]:
# Let's instanciate a model 
llm = ChatMistralAI(model="mistral-medium-latest")

In [139]:
model_llm = start_prompt | llm 

## 3) Input

In [153]:
input = "I'm looking for a pirate story with crazy adventures, but not One Piece"
#input = "I want a sport anime. I love Basketball but I hate Volleyball."
#input = "i want a  story centers around dogs but not romance"
#input = "I'm looking a pirate story with crazy adventures, but not a perverted pirate"
#input = "I'm in the mood for a silly, over-the-top comedy with ridiculous characters. I don't want anything serious or emotional"
#input = "I don't want anime with romance, but I like battles"
#input = "Looking for sci-fi anime without horror"
#input = "I want a sport anime."
#input = "I hate Volleyball."

In [154]:
input_clean = re.sub("[^A-Za-z]+", " ", str(input)).lower()

In [155]:
# Get the response 
response = model_llm.invoke({"text": input_clean, "format_instructions": parser.get_format_instructions()})
input_positive_clean = parser.parse(response.content).positive
input_negative_clean = parser.parse(response.content).negative
input_title_clean = parser.parse(response.content).title

if input_positive_clean:
    print(input_positive_clean)
    print(input_negative_clean)
    print(input_title_clean)
else:
    print("Please try again with a different phrasing.")

pirate story with crazy adventures
not one piece
One Piece


## 4) Search similarity

In [110]:
def search_closest_by_content(content, df, filter):

        # cosine similarity : given embedding VS all embeddings
        similarities = cosine_similarity([content], list(df[filter]))[0]

        # Store similarity
        similarity_df = pd.DataFrame({'uid': df['uid'], 'similarity': similarities})

        # filter by similarity. given_uid exclude
        closest = similarity_df.sort_values(by='similarity', ascending=False).head(20)

        return closest

In [161]:
def search_closest_by_content_exclude_category(content, df, filter, col_category, category):

        # mask category
        if col_category:
            mask = df[col_category].apply(lambda lst: category in lst)
            df = df[~mask]

        # cosine similarity : given embedding VS all embeddings
        similarities = cosine_similarity([content], list(df[filter]))[0]

        # Store similarity
        similarity_df = pd.DataFrame({'uid': df['uid'], 'similarity': similarities})

        # filter by similarity. given_uid exclude
        closest = similarity_df.sort_values(by='similarity', ascending=False).head(20)

        return closest

In [111]:
# pre-trained model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [163]:
filter = 'synopsis_embedding'
result_df_negative = pd.DataFrame(columns=["uid", "similarity"])
 
# Positive
input_positive_embedding = model.encode(input_positive_clean)
#result_df_positive = pd.DataFrame(search_closest_by_content(input_positive_embedding, df_synopsis_embedding, filter), columns=['uid','similarity'])
result_df_positive = pd.DataFrame(search_closest_by_content_exclude_category(input_positive_embedding, df_synopsis_embedding_with_genre, filter, "genre", "Hentai"), columns=['uid','similarity'])

# Negative
if input_negative_clean:
    input_negative_embedding = model.encode(input_negative_clean)
    #result_df_negative = pd.DataFrame(search_closest_by_content(input_negative_embedding, df_synopsis_embedding, filter), columns=['uid','similarity'])
    result_df_negative = pd.DataFrame(search_closest_by_content_exclude_category(input_negative_embedding, df_synopsis_embedding_with_genre, filter, "genre", "Hentai"), columns=['uid','similarity'])
    
# Titre
if input_title_clean:
    mask = df_animes["title"].str.lower().apply(lambda title: any(mot in title for mot in input_negative_clean.lower().split()))
    result_df_title_negative = df_animes[mask]
    result_df_negative = pd.concat([result_df_negative, result_df_title_negative], ignore_index=True)

## 5) Results

In [165]:
#  Exclusion des négatifs

# We exclude the anime the user doesn't want from those they do want
result_df_final = result_df_positive[~result_df_positive['uid'].isin(result_df_negative['uid'])]
result_df_final = result_df_final.sort_values(by='similarity', ascending=False)

In [166]:
display(result_df_final.head(5))
print(len(result_df_final))

Unnamed: 0,uid,similarity
16175,8917,0.560577
3070,2664,0.553575
14242,3842,0.520571
10193,19505,0.511247
6748,7786,0.492574


12


In [167]:
with pd.option_context('display.max_colwidth', 150):
    display(df_animes[df_animes['uid'].isin(result_df_final['uid'])])

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
3070,2664,Doraemon Movie 19: Nobita no Nankai Daibouken,"Finding a treasure is always been so hard! But, nothing is ever impossible for Doraemon and his magic tool. And so, with the help of Doraemon, Nob...","['Adventure', 'Comedy', 'Fantasy', 'Kids', 'Sci-Fi', 'Shounen']","Mar 7, 1998",1.0,2708,7228,3125.0,7.16,https://cdn.myanimelist.net/images/anime/2/72424.jpg,https://myanimelist.net/anime/2664/Doraemon_Movie_19__Nobita_no_Nankai_Daibouken
5312,11009,Pokemon 3D Adventure 2: Pikachu no Kaitei Daibouken,"When Pikachu and friends decide to stop by a tropical island for some rest and relaxation, they got more than they had bargained for after Chatot ...","['Adventure', 'Comedy', 'Fantasy', 'Kids']","May 20, 2006",1.0,7358,4910,3610.0,7.05,https://cdn.myanimelist.net/images/anime/4/30543.jpg,https://myanimelist.net/anime/11009/Pokemon_3D_Adventure_2__Pikachu_no_Kaitei_Daibouken
5420,4646,Cobra The Animation: The Psycho-Gun,"When Utopia More discovers an ancient record that holds the key to unlocking the secrets of the universe, she becomes the target of the ruthless G...","['Action', 'Adventure', 'Mecha', 'Sci-Fi', 'Space']","Aug 29, 2008 to Feb 27, 2009",4.0,4046,6249,3752.0,7.01,https://cdn.myanimelist.net/images/anime/1592/96818.jpg,https://myanimelist.net/anime/4646/Cobra_The_Animation__The_Psycho-Gun
6312,1638,Peter Pan no Bouken,"Wendy and her two little brothers are brought to the land of adventures, Neverland, by Peter pan, a boy who will never grow up. In Neverland they ...","['Adventure', 'Fantasy']","Jan 15, 1989 to Dec 24, 1989",41.0,3110,6927,4386.0,6.85,https://cdn.myanimelist.net/images/anime/13/40391.jpg,https://myanimelist.net/anime/1638/Peter_Pan_no_Bouken
6748,7786,Arabian Nights: Sindbad no Bouken,"This is a work by Toei Animation Co., Ltd. with screenplay by Tezuka Osamu in cooperation with novelist Kita Morio. Sinbad and a boy, Ali, are sto...","['Action', 'Adventure', 'Fantasy']","Jun 16, 1962",1.0,1007,9488,9709.0,5.56,https://cdn.myanimelist.net/images/anime/1176/100506.jpg,https://myanimelist.net/anime/7786/Arabian_Nights__Sindbad_no_Bouken
9138,29836,Akuei to Gacchinpo,Surreal gag-packed story about curious kid and his companions.,"['Kids', 'Comedy']","Oct 2, 2004 to Dec 25, 2004",13.0,191,13544,11514.0,5.18,https://cdn.myanimelist.net/images/anime/8/71645.jpg,https://myanimelist.net/anime/29836/Akuei_to_Gacchinpo
10193,19505,Kaizoku Ouji,"Kid was brought up on a small island which floats on the Carribean Sea. He lives a pleasant life with many animals as his companions. However, thi...","['Adventure', 'Shounen']","May 2, 1966 to Nov 28, 1966",31.0,287,12272,12555.0,6.13,https://cdn.myanimelist.net/images/anime/1677/100901.jpg,https://myanimelist.net/anime/19505/Kaizoku_Ouji
13350,1443,Sol Bianca,"Five female pirates pilot the Sol Bianca, a starship with a higher level of technology than any other known. With it, they seek out riches, such a...","['Action', 'Sci-Fi', 'Adventure', 'Space']","Mar 21, 1990 to Jul 21, 1991",2.0,4130,6199,6619.0,6.39,https://cdn.myanimelist.net/images/anime/9/49503.jpg,https://myanimelist.net/anime/1443/Sol_Bianca
14242,3842,Doubutsu Takarajima,"Jim is working as an innkeeper, together with his friend, the mouse Gran. One day, a mysterious man visits the inn, leaving a map behind. The map ...",['Adventure'],"Mar 20, 1971",1.0,1438,8719,6020.0,6.51,https://cdn.myanimelist.net/images/anime/4/6860.jpg,https://myanimelist.net/anime/3842/Doubutsu_Takarajima
14799,2825,Arabian Nights: Sindbad no Bouken (TV),"Based on the famous tale from The Thousand and One Arabian Nights, this story takes place more than 1200 years ago. The story's hero is the impish...","['Adventure', 'Fantasy', 'Magic', 'Romance']","Oct 1, 1975 to Sep 29, 1976",52.0,4355,6073,2729.0,7.25,https://cdn.myanimelist.net/images/anime/2/31935.jpg,https://myanimelist.net/anime/2825/Arabian_Nights__Sindbad_no_Bouken_TV
