In [1]:
import random
import pandas as pd
from faker import Faker

In [2]:
data_path = "./data/"

In [3]:
fake = Faker()

In [4]:
# Predefined partial genre list for variety, add more if you like
GENRES = [
    {"name": "Action", "description": "High-intensity, fast-paced movies"},
    {"name": "Drama", "description": "Serious, narrative-driven stories"},
    {"name": "Comedy", "description": "Light-hearted, humorous content"},
    {"name": "Thriller", "description": "Suspenseful, tension-building plots"},
    {"name": "Romance", "description": "Love-focused, emotional tales"},
    {"name": "Horror", "description": "Scary and fear-inducing themes"},
    {"name": "Sci-Fi", "description": "Futuristic or science-based concepts"},
    {"name": "Documentary", "description": "Non-fiction, informational film"},
    {"name": "Animation", "description": "Animated content for all ages"},
    {"name": "Fantasy", "description": "Magical and imaginative storytelling"},
    {"name": "Crime", "description": "Focus on criminal acts and detection"},
    {"name": "Musical", "description": "Song and dance-driven narratives"},
    {"name": "Adventure", "description": "Exploration and quests"},
    {"name": "Mystery", "description": "Investigation and revealing secrets"},
    {"name": "Western", "description": "American Frontier, cowboy themes"},
    {"name": "Film Noir", "description": "Dark, cynical crime dramas"},
    {"name": "Biopic", "description": "Based on a real person’s life"},
    {"name": "Family", "description": "Suitable for all ages"},
    {"name": "History", "description": "Historical events and figures"},
    {"name": "War", "description": "Stories focused on warfare and conflict"}
]

In [5]:
# We decide how many of each entity to create:
num_movies = 500
num_actors = 75
num_directors = 35
num_characters = 750  # Some extra to let multiple actors share or have unique characters

In [6]:
# 1. Generate Directors
directors_list = []
for i in range(num_directors):
    directors_list.append({
        "id": f"director_{i}",
        "name": fake.name(),
        "birthDate": fake.date_of_birth().isoformat()
    })
directors = pd.DataFrame(directors_list)

In [7]:
# 2. Generate Actors
actors_list = []
for i in range(num_actors):
    actors_list.append({
        "id": f"actor_{i}",
        "name": fake.name(),
        "birthDate": fake.date_of_birth().isoformat()
    })
actors = pd.DataFrame(actors_list)

In [8]:
# 3. Generate Characters
characters_list = []
for i in range(num_characters):
    characters_list.append({
        "id": f"char_{i}",
        "name": fake.first_name()  # or full name, but let's just do first name for "character"
    })
characters = pd.DataFrame(characters_list)

In [9]:
# 4. Generate Movies
movies_list = []
for i in range(num_movies):
    # Randomly choose a director for the movie
    director = random.choice(directors_list)
    # Randomly choose some actors (2 to 7 actors)
    movie_actors = random.sample(actors_list, k=random.randint(2, 7))
    # For each actor in that set, randomly choose or create a character
    # We'll pick from the characters list for simplicity
    assigned_characters = random.sample(characters_list, k=len(movie_actors))

    # Create relationships: (actor -> character)
    actor_character_pairs = []
    for actor_obj, char_obj in zip(movie_actors, assigned_characters):
        actor_character_pairs.append({
            "actorID": actor_obj["id"],
            "characterID": char_obj["id"]
        })

    # Randomly assign 1 or 2 or 3 genres
    movie_genres = random.sample(GENRES, k=random.randint(1, 3))

    movies_list.append({
        "id": f"movie_{i}",
        "title": f"Movie {i} - {fake.catch_phrase()}",
        "releaseYear": random.randint(1980, 2023),
        "duration": random.randint(80, 180),
        "rating": round(random.uniform(4.0, 9.9), 1),
        "directorID": director["id"],
        "actorCharacterPairs": actor_character_pairs,
        "genres": movie_genres
    })
movies = pd.DataFrame(movies_list)

In [10]:
# Now we have four lists: directors, actors, characters, movies
# This is our 'fake' dataset that hydrates the ontology.

In [11]:
# For demonstration, let's print out a few example entries
print("Sample Director:", directors_list[0])
print("Sample Actor:", actors_list[0])
print("Sample Character:", characters_list[0])
print("Sample Movie:", movies_list[0])

Sample Director: {'id': 'director_0', 'name': 'Grant Kennedy', 'birthDate': '2007-09-18'}
Sample Actor: {'id': 'actor_0', 'name': 'Mrs. Lisa Donovan', 'birthDate': '1979-09-27'}
Sample Character: {'id': 'char_0', 'name': 'Kayla'}
Sample Movie: {'id': 'movie_0', 'title': 'Movie 0 - Diverse asymmetric adapter', 'releaseYear': 2014, 'duration': 127, 'rating': 8.9, 'directorID': 'director_3', 'actorCharacterPairs': [{'actorID': 'actor_19', 'characterID': 'char_646'}, {'actorID': 'actor_63', 'characterID': 'char_523'}, {'actorID': 'actor_56', 'characterID': 'char_583'}, {'actorID': 'actor_51', 'characterID': 'char_250'}], 'genres': [{'name': 'Drama', 'description': 'Serious, narrative-driven stories'}, {'name': 'Fantasy', 'description': 'Magical and imaginative storytelling'}, {'name': 'Musical', 'description': 'Song and dance-driven narratives'}]}


In [12]:
directors.head()

Unnamed: 0,id,name,birthDate
0,director_0,Grant Kennedy,2007-09-18
1,director_1,Brittany Hutchinson,1988-04-27
2,director_2,Chad Dawson,1969-08-18
3,director_3,Michael Pittman,2020-08-29
4,director_4,Mary Maxwell,2018-12-17


In [13]:
actors.head()

Unnamed: 0,id,name,birthDate
0,actor_0,Mrs. Lisa Donovan,1979-09-27
1,actor_1,Heidi White,1980-02-28
2,actor_2,Shelby Huber,1997-03-29
3,actor_3,Michelle Lin DDS,2019-11-05
4,actor_4,Amanda Howard,1925-09-22


In [14]:
characters.head()

Unnamed: 0,id,name
0,char_0,Kayla
1,char_1,Ashley
2,char_2,Mckenzie
3,char_3,Rachel
4,char_4,Ana


In [15]:
movies.head()

Unnamed: 0,id,title,releaseYear,duration,rating,directorID,actorCharacterPairs,genres
0,movie_0,Movie 0 - Diverse asymmetric adapter,2014,127,8.9,director_3,"[{'actorID': 'actor_19', 'characterID': 'char_...","[{'name': 'Drama', 'description': 'Serious, na..."
1,movie_1,Movie 1 - Polarized client-driven capacity,2018,126,9.7,director_29,"[{'actorID': 'actor_0', 'characterID': 'char_1...","[{'name': 'Documentary', 'description': 'Non-f..."
2,movie_2,Movie 2 - Balanced 5thgeneration extranet,2004,165,8.6,director_22,"[{'actorID': 'actor_19', 'characterID': 'char_...","[{'name': 'Thriller', 'description': 'Suspense..."
3,movie_3,Movie 3 - Stand-alone heuristic service-desk,2018,113,5.2,director_23,"[{'actorID': 'actor_64', 'characterID': 'char_...","[{'name': 'Action', 'description': 'High-inten..."
4,movie_4,Movie 4 - Horizontal 5thgeneration access,2016,167,5.5,director_30,"[{'actorID': 'actor_46', 'characterID': 'char_...","[{'name': 'Animation', 'description': 'Animate..."


In [16]:
# persist the data
directors.to_csv(data_path+"directors.csv", encoding = "utf-8", escapechar = "\\", index=False)
actors.to_csv(data_path+"actors.csv", encoding = "utf-8", escapechar = "\\", index=False)
characters.to_csv(data_path+"characters.csv", encoding = "utf-8", escapechar = "\\", index=False)
movies.to_csv(data_path+"movies.csv", encoding = "utf-8", escapechar = "\\", index=False)