In [None]:
# %pip install -Uq chromadb tqdm

In [1]:
import sys
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pandas as pd
import pyspark
    
# Spark settings applied to the SparkConf before the context is created.
spark_settings = [
    ("spark.submit.deployMode", "client"),
    ('spark.sql.repl.eagerEval.enabled', True),
    ('spark.driver.memory', '16g'),
    ("spark.executor.instances", "8"),
]

cf = SparkConf()
for key, value in spark_settings:
    cf.set(key, value)

# Reuse a running SparkContext if there is one, otherwise start it with `cf`.
sc = SparkContext.getOrCreate(cf)

# Obtain the SparkSession used by every cell below.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/11 21:46:48 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
24/05/11 21:46:49 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
24/05/11 21:46:49 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
24/05/11 21:46:49 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [2]:
# Display the SparkSession created above as this cell's output.
spark

In [3]:
# Location of the anime dataset CSV, relative to the notebook's working directory.
anime_data_path = 'animedata/anime-dataset-2023_new.csv'

# Load the CSV into a Spark DataFrame.
#   header      - the first record holds the column names
#   escape '"'  - a quote inside a quoted field is escaped with another quote
#   multiLine   - a quoted field may span several physical lines
#                 (presumably needed for the free-text Synopsis column)
#   inferSchema - let Spark derive column types from the data
# The former `.option("headers", "true")` was dropped: "headers" is not a CSV
# reader option (only "header" is), so it was silently ignored.
animedata = (
    spark.read.format("csv")
    .option("header", "true")
    .option("escape", '"')
    .option("multiLine", "true")
    .option("inferSchema", "true")
    .load(anime_data_path, sep=',')
)

                                                                                

In [4]:
import os
from getpass import getpass

import chromadb.utils.embedding_functions as embedding_functions

# SECURITY: the API token must never be stored in the notebook. It is read from
# the HUGGINGFACE_API_KEY environment variable, falling back to an interactive
# prompt. The token that was previously hard-coded here is exposed in the
# file's history and should be revoked.
hf_api_key = os.environ.get("HUGGINGFACE_API_KEY") or getpass("Hugging Face API key: ")

# Embedding function that sends texts to the Hugging Face API.
# NOTE(review): the configured model is an instruction-tuned text-generation
# model - confirm it actually serves embeddings through this API, or switch to
# a dedicated sentence-embedding model.
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
    api_key=hf_api_key,
    model_name="mistralai/Mistral-7B-Instruct-v0.2"
)

In [5]:
import chromadb

# In-memory Chroma client: the collection lives only as long as this kernel.
client = chromadb.Client()

# Create the collection, or fetch it if this cell has already been run
# (`create_collection` alone raises when the name already exists).
# "hnsw:space": "cosine" selects cosine distance; l2 is the default.
#
# The previous follow-up `client.get_collection(name="Anime_Reco")` was removed:
# it passed no embedding_function, so in Chroma releases where that argument
# defaults to the built-in embedding function the returned handle no longer
# used `huggingface_ef` for add/query calls.
collection = client.get_or_create_collection(
    name="Anime_Reco",
    metadata={"hnsw:space": "cosine"},
    embedding_function=huggingface_ef,
)

In [6]:
# Number of documents sent to Chroma per `add` call.
ADD_BATCH_SIZE = 100

# Only the two columns that are needed are brought to the driver.
doc_ids = []
doc_texts = []
seen_ids = set()
for row in animedata.select('anime_id', 'Synopsis').collect():
    synopsis = row['Synopsis']
    if row['anime_id'] is None or synopsis is None:
        # A document cannot be None; rows without an id or synopsis are skipped.
        continue
    anime_id = str(row['anime_id'])
    if anime_id in seen_ids:
        # Keep the first occurrence only: ids must be unique within one add call.
        continue
    seen_ids.add(anime_id)
    doc_ids.append(anime_id)
    doc_texts.append(synopsis)

# Add the documents in batches instead of issuing one call per row.
for start in range(0, len(doc_ids), ADD_BATCH_SIZE):
    stop = start + ADD_BATCH_SIZE
    collection.add(ids=doc_ids[start:stop], documents=doc_texts[start:stop])

                                                                                

In [7]:

# Look up the nearest stored synopses for a sample of titles.
from pyspark.sql.functions import col

# Sample of 20 rows whose synopsis is used as query text.
query_texts = animedata.select('Synopsis').limit(20).collect()

# Unpack the Row objects into plain strings, one per query.
query_texts_list = []
for row in query_texts:
    query_texts_list.append(row['Synopsis'])

# For every query text, request the 3 closest documents from the collection.
results = collection.query(n_results=3, query_texts=query_texts_list)

                                                                                

In [8]:
# Display the raw query response (ids and distances per query) for inspection.
results

{'ids': [['1', '400', '5'],
  ['5', '1', '44516'],
  ['6', '52093', '4106'],
  ['7', '933', '7419'],
  ['8', '1123', '1821'],
  ['15', '36934', '2116'],
  ['16', '1142', '11321'],
  ['17', '25341', '2454'],
  ['18', '15059', '187'],
  ['19', '3941', '3859'],
  ['20', '55453', '34566'],
  ['21', '36215', '38234'],
  ['22', '995', '2752'],
  ['23', '979', '7793'],
  ['24', '846', '23033'],
  ['25', '45207', '5005'],
  ['26', '32836', '1915'],
  ['27', '270', '777'],
  ['28', '32828', '580'],
  ['29', '3287', '11021']],
 'distances': [[5.960464477539063e-08, 0.4772677421569824, 0.4967891573905945],
  [-2.384185791015625e-07, 0.4967891573905945, 0.5648226141929626],
  [-3.5762786865234375e-07, 0.18922197818756104, 0.22198247909545898],
  [-2.384185791015625e-07, 0.5211420059204102, 0.5507012605667114],
  [1.7881393432617188e-07, 1.7881393432617188e-07, 0.4966338276863098],
  [1.7881393432617188e-07, 0.5436348915100098, 0.5442937016487122],
  [5.960464477539063e-08, 0.2098509669303894, 0.30

In [9]:
from pyspark.sql.functions import col

# Titles the queries were issued for.
# NOTE(review): this relies on limit(20) returning the same rows, in the same
# order, as the limit(20) used to build the query texts - confirm, or select
# Name/anime_id/Synopsis together in a single collect.
anime_names = animedata.select('Name', 'anime_id').limit(20)
source_rows = anime_names.collect()

# Resolve every returned id to its name with ONE Spark job, instead of running
# two filter+collect jobs per printed title.
hit_ids = sorted({int(hit) for hits in results['ids'] for hit in hits})
name_by_id = {}
for hit_row in (
    animedata.filter(col("anime_id").isin(hit_ids))
    .select("anime_id", "Name")
    .collect()
):
    # setdefault keeps the first name seen if an id occurs more than once.
    name_by_id.setdefault(str(hit_row['anime_id']), hit_row['Name'])

# zip() stops at the shorter sequence, so a length mismatch cannot raise.
for row, hits in zip(source_rows, results['ids']):
    print(f"Anime Name: {row['Name']}")

    # Drop the query title itself by id rather than assuming it is always the
    # first hit (ties in distance can reorder hits), then keep the top two.
    own_id = str(row['anime_id'])
    recommended = [hit for hit in hits if hit != own_id][:2]

    for hit in recommended:
        # Fall back to a placeholder instead of failing on an unknown id.
        print(f"Recommendation: {name_by_id.get(hit, f'<unknown anime_id {hit}>')}")
    print()

                                                                                

Anime Name: Cowboy Bebop


                                                                                

Recommendation: Seihou Bukyou Outlaw Star
Recommendation: Cowboy Bebop: Tengoku no Tobira

Anime Name: Cowboy Bebop: Tengoku no Tobira


                                                                                

Recommendation: Cowboy Bebop
Recommendation: Koroshi Ai

Anime Name: Trigun
Recommendation: Trigun Stampede
Recommendation: Trigun: Badlands Rumble

Anime Name: Witch Hunter Robin


                                                                                

Recommendation: Dai Mahou Touge
Recommendation: Wrestler Gundan Seisenshi Robin Jr.

Anime Name: Bouken Ou Beet
Recommendation: Bouken Ou Beet Excellion
Recommendation: Guren

Anime Name: Eyeshield 21
Recommendation: Captain Tsubasa (2018)
Recommendation: Captain Tsubasa

Anime Name: Hachimitsu to Clover
Recommendation: Hachimitsu to Clover II
Recommendation: Nee Summer!

Anime Name: Hungry Heart: Wild Striker
Recommendation: Diamond no Ace OVA
Recommendation: Kimagure Orange☆Road: Ano Hi ni Kaeritai

Anime Name: Initial D Fourth Stage
Recommendation: Initial D Fifth Stage
Recommendation: Initial D Third Stage

Anime Name: Monster
Recommendation: Rensa Byoutou
Recommendation: Top Secret: The Revelation

Anime Name: Naruto
Recommendation: Naruto (2023)
Recommendation: Boruto: Naruto Next Generations

Anime Name: One Piece
Recommendation: One Piece: Episode of East Blue - Luffy to 4-nin no Nakama no Daibouken
Recommendation: One Piece Movie 14: Stampede

Anime Name: Tennis no Ouji-sama
R