In [1]:
pip install neo4j

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
# Load the raw anime table and drop the free-text / media columns that are not
# used as features further down.
df = pd.read_csv("AnimeList.csv")
df = df.drop(columns=["title_english", "title_japanese", "title_synonyms", "image_url", "status", "aired", "background", "related", "licensor", "opening_theme", "ending_theme"])

# Keep only the rows that have a value in every remaining column.
df = df.dropna()

# Expand the comma-separated genre string into its first three genres.
# Splitting on the comma *and* its surrounding whitespace keeps the values
# clean ("Adventure" rather than " Adventure"). Rows with fewer than three
# genres get NaN in the missing positions.
genre_parts = df['genre'].str.strip().str.split(r'\s*,\s*')
for position in range(3):
    df[f'genre{position + 1}'] = genre_parts.str[position]
df = df.drop(columns=['genre'])

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [3]:
import random
# reformat the features
def length_of_eposide(length):
    """Convert a textual episode duration into whole minutes.

    Recognises any combination of hour, minute and second parts, e.g.
    ``"24 min. per ep."``, ``"1 hr. 30 min."``, ``"2 hr."`` or
    ``"45 sec. per ep."``.

    Args:
        length: Duration string as found in the ``duration`` column.

    Returns:
        int: Total duration in minutes, truncated towards zero. Strings with
        no recognisable unit (such as ``"Unknown"``) yield ``0``.

    Raises:
        TypeError: If ``length`` is not a string.
    """
    import re  # local import: `re` is not imported by the notebook's import cells

    seconds_per_unit = {'hr': 3600, 'min': 60, 'sec': 1}
    # Each match is a (number, unit) pair such as ('1', 'hr') or ('30', 'min').
    parts = re.findall(r'(\d+)\s*(hr|min|sec)', length)
    total_seconds = sum(int(value) * seconds_per_unit[unit] for value, unit in parts)
    # Sub-minute remainders are discarded, so "30 sec." becomes 0.
    return total_seconds // 60

# Replace the textual durations with whole minutes, then snapshot the cleaned
# table (index included) to disk.
df['duration'] = df['duration'].map(length_of_eposide)
df.to_csv("anime.csv")

In [4]:

# Report the size of the cleaned table before it is down-sampled.
print("Dataset size after being cleaned: ", df.shape)

# Seed Python's RNG and keep a fixed 150-row sample; `random_state` is what
# makes the pandas draw itself repeatable.
random.seed(1000)
df = df.sample(150, random_state=100)

Dataset size after being cleaned:  (2672, 22)


In [5]:
# Preview the first five sampled rows (five is the default for `head`).
df.head()

Unnamed: 0,anime_id,title,type,source,episodes,airing,aired_string,duration,rating,score,...,popularity,members,favorites,premiered,broadcast,producer,studio,genre1,genre2,genre3
5006,12119,Natsu-iro Kiseki,TV,Original,12,False,"Apr 6, 2012 to Jun 29, 2012",24,PG-13 - Teens 13 or older,7.04,...,2043,31880,72,Spring 2012,Unknown,"Aniplex, Lantis, Mainichi Broadcasting System,...",Sunrise,School,Slice of Life,Supernatural
1191,527,Pokemon,TV,Game,276,False,"Apr 1, 1997 to Nov 14, 2002",24,PG - Children,7.42,...,214,298374,3737,Spring 1997,Thursdays at 19:00 (JST),"TV Tokyo, TV Tokyo Music, Studio Jack",OLM,Action,Adventure,Comedy
7211,1175,Metal Fighter Miku,TV,Original,13,False,"Jul 8, 1994 to Sep 30, 1994",25,PG-13 - Teens 13 or older,6.1,...,6905,1953,5,Summer 1994,Unknown,"Victor Entertainment, Studio Jack",J.C.Staff,Sci-Fi,Comedy,Sports
7236,1915,Mahou Shoujo Lyrical Nanoha StrikerS,TV,Original,26,False,"Apr 2, 2007 to Sep 24, 2007",25,PG-13 - Teens 13 or older,7.72,...,1766,40381,572,Spring 2007,Unknown,"Geneon Universal Entertainment, A-Line, King R...",Seven Arcs,Action,Magic,Comedy
13540,12883,Tsuritama,TV,Original,12,False,"Apr 13, 2012 to Jun 29, 2012",23,PG-13 - Teens 13 or older,7.79,...,788,109377,1986,Spring 2012,Fridays at 01:15 (JST),"Aniplex, Dentsu, Fuji TV, tsuritama partners, ...",A-1 Pictures,Comedy,Sci-Fi,Slice of Life


In [6]:
# Columns compared through category frequencies in the distance function.
list_of_categorical_features = [
    'source',
    'type',
    'rating',
    'premiered',
    'studio',
    'genre1',
    'genre2',
    'genre3',
]

# Columns compared with the Euclidean distance after scaling.
list_of_numeric_features = [
    'episodes',
    'score',
    'rank',
    'popularity',
    'scored_by',
    'members',
    'favorites',
    'duration',
]

In [7]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every numeric feature to the [0, 1] range so that no single column
# dominates the Euclidean part of the distance.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(df[list_of_numeric_features])
df[list_of_numeric_features] = min_max_scaler.transform(df[list_of_numeric_features])

In [8]:
import numpy as np

def combined_distance(anime_a, anime_b, numeric_features, categorical_features, vdm_data):
    """Return the mean of a numeric and a categorical distance between two rows.

    Args:
        anime_a, anime_b: Mappings (e.g. DataFrame rows or dicts) indexed by
            feature name.
        numeric_features: Names of the features compared with the Euclidean
            distance.
        categorical_features: Names of the features compared through
            ``vdm_data``.
        vdm_data: ``{feature: {category: {other_category: value}}}`` lookup
            table. A category missing from an inner table contributes ``0``.

    Returns:
        ``(euclidean + vdm_distance) / 2``, where ``euclidean`` is the root of
        the summed squared numeric differences and ``vdm_distance`` is the root
        of the summed squared table differences over all categorical features.
    """
    # Euclidean distance over the numeric features.
    squared_numeric = sum(
        (anime_a[feature] - anime_b[feature]) ** 2 for feature in numeric_features
    )
    euclidean = np.sqrt(squared_numeric)

    # Categorical part: for every feature, compare the table entries of the two
    # rows' categories against each reference category and add up the squared
    # differences. (The per-feature sum is used directly; taking its square
    # root only to square it again would merely add rounding noise.)
    squared_categorical = 0
    for feature in categorical_features:
        category_a = anime_a[feature]
        category_b = anime_b[feature]
        feature_total = 0
        for reference_row in vdm_data[feature].values():
            difference = reference_row.get(category_a, 0) - reference_row.get(category_b, 0)
            feature_total += difference ** 2
        squared_categorical += feature_total
    vdm_distance = np.sqrt(squared_categorical)

    # Both parts are weighted equally.
    return (euclidean + vdm_distance) / 2

In [9]:
# Per-feature lookup tables used by `combined_distance`:
# vdm_data[feature][c1][c2] is the absolute difference between the relative
# frequencies of categories c1 and c2 in the sampled data (0 when c1 == c2).
vdm_data = {}

for feature in list_of_categorical_features:
    feature_data = df[feature].value_counts(normalize=True).to_dict()
    vdm_data[feature] = {
        category1: {
            category2: abs(frequency1 - frequency2)
            for category2, frequency2 in feature_data.items()
        }
        for category1, frequency1 in feature_data.items()
    }

# Pairwise distance matrix (despite the name, smaller values mean "more
# similar"). The distance is symmetric, so each unordered pair is computed once
# and mirrored; the diagonal stays 0.
n_animes = len(df)
similarity_matrix = np.zeros((n_animes, n_animes))
for i in range(n_animes):
    anime_i = df.iloc[i]  # hoisted: the row is reused for the whole inner loop
    for j in range(i + 1, n_animes):
        distance = combined_distance(anime_i, df.iloc[j], list_of_numeric_features,
                                     list_of_categorical_features, vdm_data)
        similarity_matrix[i, j] = distance
        similarity_matrix[j, i] = distance

# Sum of all off-diagonal distances and the number of ordered pairs (i != j);
# `similarity / count` is the mean pairwise distance used as the edge threshold.
similarity = similarity_matrix.sum()
count = n_animes * (n_animes - 1)

In [10]:
import tkinter as tk
from tkinter import ttk
import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from neo4j.exceptions import ServiceUnavailable
from py2neo import Relationship


def get_recommendations():
    """Button callback: forward the text typed in the entry box.

    Reads the notebook-level ``user_input_entry`` widget. When it holds any
    text, passes that text to ``process_recommendations`` and prints a
    confirmation line; an empty entry does nothing.
    """
    user_input = user_input_entry.get()

    if user_input:
        # Hand the raw text over; it is split into names downstream.
        process_recommendations(user_input)
        print(f"Processed user input: {user_input}")

Before running the next cell, make sure the Neo4j username and password in its `auth` setting are the credentials of your own Neo4j instance.

In [11]:
# clear the existing database and graph in neo4j
import os

# Connection settings. NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD override the
# notebook defaults so credentials do not have to be edited into the code.
# NOTE(review): a real-looking password is committed here as the fallback;
# rotate it and drop the literal once the environment variables are in use.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
auth = (os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "Shenyuxi76!"))

# Delete all nodes (and their relationships) in batches of 1000 through APOC.
cypher_query = """
    CALL apoc.periodic.commit("
    MATCH (n)
    WITH n LIMIT $limit
    DETACH DELETE n
    RETURN count(*)
    ", {limit: 1000})
            """

# clear the existing database in neo4j before start
# The context managers close the session and the driver once the wipe is done,
# so no connection is left open for the rest of the notebook.
with GraphDatabase.driver(uri, auth=auth) as driver, driver.session() as graph:
    results = graph.run(cypher_query)
    # Drain the result so the deletion has completed before the session closes.
    results.consume()

Before running the next cell, make sure the Neo4j username and password in its `auth` setting are the credentials of your own Neo4j instance.

In [12]:
def initialize_graph():
    """Create one ``Anime`` node per row of ``df`` and link similar pairs.

    Reads the notebook globals ``df``, ``similarity_matrix``, ``similarity``
    and ``count``. For every pair of rows ``i > j`` whose distance is below the
    mean pairwise distance (``similarity / count``), a ``SIMILAR_TO``
    relationship carrying that distance is created from anime ``i`` to
    anime ``j``.

    A ``ServiceUnavailable`` error from the driver is printed and not
    re-raised. Returns ``None``.
    """
    import os  # local import: the notebook's import cells do not provide `os`

    # NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD override the notebook defaults.
    # NOTE(review): a real-looking password is committed here as the fallback;
    # rotate it and drop the literal once the environment variables are in use.
    uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
    auth = (os.environ.get("NEO4J_USER", "neo4j"),
            os.environ.get("NEO4J_PASSWORD", "Shenyuxi76!"))

    # DataFrame columns sent as query parameters under the same names.
    node_columns = [
        'anime_id', 'title', 'source', 'type', 'rating', 'premiered', 'studio',
        'genre1', 'genre2', 'genre3', 'episodes', 'score', 'rank', 'popularity',
        'scored_by', 'members', 'favorites', 'duration',
    ]
    create_node_query = """
    CREATE (a:Anime {
        id: $anime_id, name: $title, source: $source, type: $type,
        rating: $rating, premiered: $premiered, studio: $studio,
        genre1: $genre1, genre2: $genre2, genre3: $genre3, episodes: $episodes,
        score: $score, rank: $rank, popularity: $popularity,
        scored_by: $scored_by, members: $members, favorites: $favorites,
        duration: $duration
    })
    """
    create_edge_query = """
    MATCH (a1:Anime {name: $anime1_name}), (a2:Anime {name: $anime2_name})
    CREATE (a1)-[:SIMILAR_TO {distance: $distance}]->(a2)
    """

    try:
        # Both the driver and the session are closed when the block exits,
        # including when a query raises.
        with GraphDatabase.driver(uri, auth=auth) as driver, driver.session() as graph:
            # One node per anime.
            for _, anime in df.iterrows():
                graph.run(create_node_query, {column: anime[column] for column in node_columns})

            # Row i of `similarity_matrix` belongs to row i of `df`, so the
            # names are taken from `df` itself rather than from the result
            # order of a MATCH query, which carries no ordering guarantee.
            titles = df['title'].tolist()
            # Mean pairwise distance; guarded because `count` is 0 when there
            # are fewer than two rows (no pair is visited in that case).
            threshold = similarity / count if count else 0.0

            for i in range(len(titles)):
                for j in range(i):
                    distance = float(similarity_matrix[i][j])
                    if distance < threshold:
                        graph.run(create_edge_query, {
                            'anime1_name': titles[i],
                            'anime2_name': titles[j],
                            'distance': distance,
                        })

    except ServiceUnavailable as e:
        print(f"Neo4j service unavailable: {e}")

Before running the next cell, make sure the Neo4j username and password in its `auth` setting are the credentials of your own Neo4j instance.

In [13]:
def process_recommendations(animes):
    """Look up the closest neighbours of the given anime and display them.

    Args:
        animes: Comma-separated anime names as typed by the user. Whitespace
            around each name is ignored and empty entries are skipped. A title
            that itself contains a comma cannot be entered this way.

    Queries Neo4j for up to five distinct anime connected to any of the input
    names by a ``SIMILAR_TO`` relationship (in either direction), ordered by
    their smallest distance, and writes them into the notebook-level
    ``recommendation_text`` widget. A ``ServiceUnavailable`` error is printed
    and leaves the widget untouched. Returns ``None``.
    """
    import os  # local import: the notebook's import cells do not provide `os`

    # NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD override the notebook defaults.
    # NOTE(review): a real-looking password is committed here as the fallback;
    # rotate it and drop the literal once the environment variables are in use.
    uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
    auth = (os.environ.get("NEO4J_USER", "neo4j"),
            os.environ.get("NEO4J_PASSWORD", "Shenyuxi76!"))

    # "A, B" must match the stored names "A" and "B", not " B".
    input_anime_names = [name.strip() for name in animes.split(',') if name.strip()]

    # The relationship is matched without a direction because each similar pair
    # is stored once, in one direction only. Aggregating per candidate (a2)
    # keeps an anime that is close to several inputs from being listed twice.
    cypher_query = """
    MATCH (a1:Anime)-[r:SIMILAR_TO]-(a2:Anime)
    WHERE a1.name IN $input_anime_names
    AND NOT a2.name IN $input_anime_names
    RETURN a2, MIN(r.distance) AS min_distance
    ORDER BY min_distance ASC
    LIMIT 5
    """

    try:
        # Fetch everything inside the block; session and driver are closed on
        # exit, including when the query raises.
        with GraphDatabase.driver(uri, auth=auth) as driver, driver.session() as graph:
            results_list = list(graph.run(cypher_query, input_anime_names=input_anime_names))
    except ServiceUnavailable as e:
        print(f"Neo4j service unavailable: {e}")
        return

    # Clear the existing recommendations
    recommendation_text.delete(1.0, tk.END)

    # Insert the recommendations into the text box
    if len(results_list) == 0:
        recommendation_text.insert(tk.END, "Sorry, there are no recommended animes for you. We will improve our system to find out your recommendation!")
    else:
        for record in results_list:
            recommendation_text.insert(tk.END, f"Anime: {record['a2']['name']}, Studio: {record['a2']['studio']}\n")

In [None]:
# Top-level application window.
root = tk.Tk()
root.title("Anime Recommendation System")

# Load the anime nodes and similarity edges into Neo4j before the UI starts.
initialize_graph()

# Padded container that holds every widget.
frame = ttk.Frame(root, padding="10")
frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))

# Row 0: prompt shown above the entry box.
user_input_label = ttk.Label(frame, text="Enter comma-separated anime names:")
user_input_label.grid(row=0, column=0, sticky=tk.W, pady=(0, 5))

# Row 1: free-text entry read by `get_recommendations`.
user_input_entry = ttk.Entry(frame, width=80)
user_input_entry.grid(row=1, column=0, pady=(0, 10))

# Row 2: button that triggers the lookup.
get_recommendations_button = ttk.Button(
    frame,
    text="Get Recommendations",
    command=get_recommendations,
)
get_recommendations_button.grid(row=2, column=0, pady=(0, 10))

# Row 3: output area written to by `process_recommendations` ...
recommendation_text = tk.Text(frame, wrap=tk.WORD, width=80, height=20)
recommendation_text.grid(row=3, column=0)

# ... with a vertical scrollbar wired to it in both directions.
scrollbar = ttk.Scrollbar(frame, orient=tk.VERTICAL, command=recommendation_text.yview)
scrollbar.grid(row=3, column=1, sticky=(tk.N, tk.S))
recommendation_text.configure(yscrollcommand=scrollbar.set)

# Hand control to Tk; this call blocks until the window is closed.
root.mainloop()

In [None]:
#python anime_recommendation_gui.py