In [17]:
from neo4j import GraphDatabase

# Set your connection URI and authentication based on your Docker Compose setup
URI = "bolt://localhost:7687"
# AUTH = ("neo4j", "neo4j")

# Create the driver object to connect to Neo4j
# driver = GraphDatabase.driver(URI, auth=AUTH)
driver = GraphDatabase.driver(URI)

# You can add a check to make sure the connection is working
try:
    driver.verify_connectivity()
    print("Connection to Neo4j successful!")
except Exception as e:
    print(f"Connection failed: {e}")

Connection to Neo4j successful!


In [23]:
import pandas as pd
from neo4j import GraphDatabase

# --- Configuration for Neo4j Connection ---
URI = "bolt://localhost:7687"
# AUTH = ("neo4j", "your_new_password") # Use your actual new password

# --- Step 1: Define functions for each operation type ---

def create_constraints(tx):
    """
    Creates uniqueness constraints in a separate transaction.
    """
    print("Creating uniqueness constraints...")
    tx.run("CREATE CONSTRAINT IF NOT EXISTS FOR (a:Anime) REQUIRE a.anime_id IS UNIQUE")
    tx.run("CREATE CONSTRAINT IF NOT EXISTS FOR (g:Genre) REQUIRE g.name IS UNIQUE")
    tx.run("CREATE CONSTRAINT IF NOT EXISTS FOR (u:User) REQUIRE u.user_id IS UNIQUE")
    print("Constraints created.")
    print("-" * 20)

def ingest_anime_and_genres(tx, anime_df):
    """
    Ingests Anime and Genre nodes and relationships in batches.
    """
    print("Ingesting Anime and Genre nodes and relationships...")

    # Vectorize and batch the anime data
    anime_records = anime_df[['anime_id', 'name', 'type', 'episodes', 'rating']].to_dict('records')
    tx.run("""
    UNWIND $records AS record
    MERGE (a:Anime {anime_id: record.anime_id})
    ON CREATE SET a.title = record.name, a.type = record.type, a.episodes = record.episodes, a.rating = toFloat(record.rating)
    """, records=anime_records)

    # Batch genre data
    genre_data = []
    for _, row in anime_df.iterrows():
        if isinstance(row['genre'], str):
            genres = [genre.strip() for genre in row['genre'].split(',')]
            for genre_name in genres:
                genre_data.append({'anime_id': row['anime_id'], 'genre_name': genre_name})

    tx.run("""
    UNWIND $genre_data AS data
    MERGE (g:Genre {name: data.genre_name})
    WITH g, data
    MATCH (a:Anime {anime_id: data.anime_id})
    MERGE (a)-[:HAS_GENRE]->(g)
    """, genre_data=genre_data)

    print("Anime and Genre ingestion complete.")
    print("-" * 20)

def ingest_ratings_batched(tx, ratings_df, batch_size=2000):
    """
    Ingests User and Rating relationships using batched transactions.
    """
    print("Ingesting User and Rating relationships in batches...")

    # Filter for valid ratings
    valid_ratings = ratings_df[ratings_df['rating'] != -1]
    # Get a list of all unique users
    unique_users = valid_ratings['user_id'].unique()
    print(f"Total number of unique users: {len(unique_users)}")

    # Randomly select 75% of the unique users
    sampled_users = pd.Series(unique_users).sample(frac=0.75)

    # Filter the original DataFrame to keep only the selected users' records
    reduced_rating_df = valid_ratings[valid_ratings['user_id'].isin(sampled_users)]
    
    # Convert DataFrame to a list of dictionaries for efficient processing
    rating_records = reduced_rating_df.to_dict('records')
    
    rating_query = """
    UNWIND $batch AS record
    MERGE (u:User {user_id: record.user_id})
    WITH u, record
    MATCH (a:Anime {anime_id: record.anime_id})
    MERGE (u)-[r:RATED]->(a)
    ON CREATE SET r.score = toInteger(record.rating)
    """

    for i in range(0, len(rating_records), batch_size):
        batch = rating_records[i:i + batch_size]
        tx.run(rating_query, batch=batch)
        print(f"  Ingested batch {i // batch_size + 1} of {len(rating_records) // batch_size + 1}")
    
    print("User and Rating ingestion complete.")
    print("-" * 20)




# --- Main execution block ---
if __name__ == '__main__':
    # Load and preprocess the data
    print("Loading and preprocessing data...")
    anime_df = pd.read_csv('data/anime.csv')
    rating_df = pd.read_csv('data/rating.csv')
    anime_df.dropna(subset=['genre'], inplace=True)
    print("Data loaded successfully.")
    print("-" * 20)

    # Create the Neo4j driver
    driver = GraphDatabase.driver(URI) #, auth=AUTH)

    # Execute all ingestion steps in separate transactions
    with driver.session() as session:
        session.execute_write(create_constraints)
        session.execute_write(ingest_anime_and_genres, anime_df)
        session.execute_write(ingest_ratings_batched, rating_df)

    # Close the driver connection
    driver.close()
    print("Driver connection closed.")



Loading and preprocessing data...
Data loaded successfully.
--------------------
Creating uniqueness constraints...
Constraints created.
--------------------
Ingesting Anime and Genre nodes and relationships...
Anime and Genre ingestion complete.
--------------------
Ingesting User and Rating relationships in batches...
Total number of unique users: 69600
  Ingested batch 1 of 2387
  Ingested batch 2 of 2387
  Ingested batch 3 of 2387
  Ingested batch 4 of 2387
  Ingested batch 5 of 2387
  Ingested batch 6 of 2387
  Ingested batch 7 of 2387
  Ingested batch 8 of 2387
  Ingested batch 9 of 2387
  Ingested batch 10 of 2387
  Ingested batch 11 of 2387
  Ingested batch 12 of 2387
  Ingested batch 13 of 2387
  Ingested batch 14 of 2387
  Ingested batch 15 of 2387
  Ingested batch 16 of 2387
  Ingested batch 17 of 2387
  Ingested batch 18 of 2387
  Ingested batch 19 of 2387
  Ingested batch 20 of 2387
  Ingested batch 21 of 2387
  Ingested batch 22 of 2387
  Ingested batch 23 of 2387
  Inges

In [26]:

# driver = GraphDatabase.driver(URI, auth=AUTH)

def run_query(query, params=None):
    """Executes a read query and returns the results."""
    with driver.session() as session:
        result = session.run(query, params)
        return [record for record in result]

# --- Test Example 1: Verify an anime and its genres ---
anime_title = "Naruto"
query1 = """
MATCH (a:Anime {title: $title})-[:HAS_GENRE]->(g:Genre)
RETURN a.title, COLLECT(g.name) AS Genres
"""
results1 = run_query(query1, {"title": anime_title})
print(f"--- Test 1: Verify {anime_title} and its genres ---")
if results1:
    record = results1[0]
    print(f"Anime: {record['a.title']}")
    print(f"Genres: {', '.join(record['Genres'])}")
else:
    print(f"No results found for {anime_title}. Check your data or query.")
print("-" * 30)


# --- Test Example 2: Find similar animes by genre (Optimized) ---
target_anime = "Naruto "
query2 = """
MATCH (target:Anime {title: $title})-[:HAS_GENRE]->(g:Genre)<-[:HAS_GENRE]-(similar:Anime)
WHERE target <> similar
RETURN similar.title, g.name AS SharedGenre, similar.rating
ORDER BY similar.rating DESC
LIMIT 5
"""
results2 = run_query(query2, {"title": target_anime})
print(f"--- Test 2: Similar to {target_anime} by genre ---")
if results2:
    for record in results2:
        print(f"Title: {record['similar.title']}, Shared Genre: {record['SharedGenre']}, Rating: {record['similar.rating']}")
else:
    print(f"No similar animes found for {target_anime}.")
print("-" * 30)



# --- Test Example 3: Get average rating for an anime ---
anime_title_rating = "Code Geass: Hangyaku no Lelouch"
query3 = """
MATCH (a:Anime {title: $title})<-[r:RATED]-(u:User)
RETURN a.title, avg(r.score) AS AverageRating, count(u) AS TotalRatings
"""
results3 = run_query(query3, {"title": anime_title_rating})
print(f"--- Test 3: Average rating for {anime_title_rating} ---")
if results3 and results3[0]["TotalRatings"] > 0:
    record = results3[0]
    print(f"Anime: {record['a.title']}")
    print(f"Average Rating: {record['AverageRating']:.2f}")
    print(f"Total Ratings: {record['TotalRatings']}")
else:
    print(f"No ratings found for {anime_title_rating}.")
print("-" * 30)


driver.close()


  with driver.session() as session:


--- Test 1: Verify Naruto and its genres ---
Anime: Naruto
Genres: Action, Shounen, Martial Arts, Super Power, Comedy
------------------------------
--- Test 2: Similar to Naruto  by genre ---
No similar animes found for Naruto .
------------------------------
--- Test 3: Average rating for Code Geass: Hangyaku no Lelouch ---
Anime: Code Geass: Hangyaku no Lelouch
Average Rating: 8.93
Total Ratings: 18205
------------------------------
