In [1]:
import os
import random

import pandas as pd
from faker import Faker

In [2]:
# Directory (relative to the working directory) where the generated CSV files are written.
data_path = "./data/"

In [3]:
# Seed both sources of randomness so that Restart Kernel -> Run All regenerates
# the same dataset every time:
#   - random.seed() covers the stdlib `random` calls used when assembling movies
#   - Faker.seed() seeds the generator shared by Faker instances (names, dates, phrases)
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

fake = Faker()

In [4]:
# Fixed pool of genre labels that movies draw from
GENRES = [
    "Action",
    "Comedy",
    "Drama",
    "Thriller",
    "Sci-Fi",
    "Horror",
    "Romance",
    "Documentary",
]

In [5]:
# Dataset size: number of records to generate for each entity type
num_movies = 500       # movies
num_actors = 75        # actors available for casting
num_directors = 35     # directors available for assignment
num_characters = 750   # more characters than movies, so roles can be shared or unique

In [6]:
# 1. Generate Directors
# One record per director: sequential id, fake full name, ISO-formatted birth date.
# NOTE(review): date_of_birth() is called without age bounds, so birth dates are
# not tied to the release years of the movies a director is later assigned to.
directors_list = [
    {
        "id": f"director_{i}",
        "name": fake.name(),
        "birthDate": fake.date_of_birth().isoformat(),
    }
    for i in range(num_directors)
]
directors = pd.DataFrame(directors_list)

In [7]:
# 2. Generate Actors
# One record per actor: sequential id, fake full name, ISO-formatted birth date.
# NOTE(review): date_of_birth() is called without age bounds, so birth dates are
# not tied to the release years of the movies an actor is later cast in.
actors_list = [
    {
        "id": f"actor_{i}",
        "name": fake.name(),
        "birthDate": fake.date_of_birth().isoformat(),
    }
    for i in range(num_actors)
]
actors = pd.DataFrame(actors_list)

In [8]:
# 3. Generate Characters
# Characters carry only a sequential id and a first name (a full name would also
# work, but a first name is enough for a "character").
characters_list = [
    {"id": f"char_{i}", "name": fake.first_name()}
    for i in range(num_characters)
]
characters = pd.DataFrame(characters_list)

In [9]:
# 4. Generate Movies
# Each movie gets one randomly chosen director, a cast of 2-7 actors sampled
# without replacement, one character record per cast member (also sampled
# without replacement), and 1-3 genres.
movies_list = []
for i in range(num_movies):
    director = random.choice(directors_list)

    # Draw the cast size first, then the cast itself
    cast_size = random.randint(2, 7)
    movie_actors = random.sample(actors_list, k=cast_size)

    # One character per cast member, taken from the shared character pool
    assigned_characters = random.sample(characters_list, k=len(movie_actors))

    # Relationship records linking each actor to the character they play
    actor_character_pairs = [
        {"actorID": actor_obj["id"], "characterID": char_obj["id"]}
        for actor_obj, char_obj in zip(movie_actors, assigned_characters)
    ]

    genre_count = random.randint(1, 3)
    movie_genres = random.sample(GENRES, k=genre_count)

    movie = {
        "id": f"movie_{i}",
        "title": f"Movie {i} - {fake.catch_phrase()}",
        "releaseYear": random.randint(1980, 2023),
        "duration": random.randint(80, 180),
        "rating": round(random.uniform(4.0, 9.9), 1),
        "directorID": director["id"],
        "actorCharacterPairs": actor_character_pairs,
        "genres": movie_genres,
    }
    movies_list.append(movie)

movies = pd.DataFrame(movies_list)

In [10]:
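# We now have four lists of records (directors_list, actors_list, characters_list, movies_list)
# and a DataFrame built from each (directors, actors, characters, movies).
# Together they form the 'fake' dataset that hydrates the ontology.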

In [11]:
# Quick sanity check: show the first record of each entity list
for label, records in (
    ("Director", directors_list),
    ("Actor", actors_list),
    ("Character", characters_list),
    ("Movie", movies_list),
):
    print(f"Sample {label}:", records[0])

Sample Director: {'id': 'director_0', 'name': 'Carl Reyes', 'birthDate': '2005-04-04'}
Sample Actor: {'id': 'actor_0', 'name': 'Richard Patterson', 'birthDate': '1947-05-11'}
Sample Character: {'id': 'char_0', 'name': 'Mary'}
Sample Movie: {'id': 'movie_0', 'title': 'Movie 0 - Progressive bifurcated strategy', 'releaseYear': 1982, 'duration': 112, 'rating': 7.6, 'directorID': 'director_4', 'actorCharacterPairs': [{'actorID': 'actor_4', 'characterID': 'char_174'}, {'actorID': 'actor_36', 'characterID': 'char_57'}, {'actorID': 'actor_70', 'characterID': 'char_490'}], 'genres': ['Horror', 'Action', 'Documentary']}


In [12]:
# Preview the first five director rows
directors.head(n=5)

Unnamed: 0,id,name,birthDate
0,director_0,Carl Reyes,2005-04-04
1,director_1,Jamie Daugherty,1976-12-13
2,director_2,Douglas Ballard,1941-09-13
3,director_3,Lisa Howard,1989-09-13
4,director_4,Shelby Miller,1948-05-29


In [13]:
# Preview the first five actor rows
actors.head(n=5)

Unnamed: 0,id,name,birthDate
0,actor_0,Richard Patterson,1947-05-11
1,actor_1,Cynthia Tucker,2020-02-21
2,actor_2,Kevin Miller,1949-06-08
3,actor_3,David Olson,1913-04-12
4,actor_4,Claire Schultz,1983-10-08


In [14]:
# Preview the first five character rows
characters.head(n=5)

Unnamed: 0,id,name
0,char_0,Mary
1,char_1,Aaron
2,char_2,Elijah
3,char_3,Robin
4,char_4,Anthony


In [15]:
# Preview the first five movie rows
movies.head(n=5)

Unnamed: 0,id,title,releaseYear,duration,rating,directorID,actorCharacterPairs,genres
0,movie_0,Movie 0 - Progressive bifurcated strategy,1982,112,7.6,director_4,"[{'actorID': 'actor_4', 'characterID': 'char_1...","[Horror, Action, Documentary]"
1,movie_1,Movie 1 - Inverse mission-critical middleware,1994,177,8.3,director_14,"[{'actorID': 'actor_37', 'characterID': 'char_...","[Comedy, Horror]"
2,movie_2,Movie 2 - Business-focused secondary portal,2004,129,9.8,director_22,"[{'actorID': 'actor_11', 'characterID': 'char_...","[Documentary, Romance]"
3,movie_3,Movie 3 - Streamlined asymmetric productivity,2017,171,5.1,director_29,"[{'actorID': 'actor_25', 'characterID': 'char_...",[Comedy]
4,movie_4,Movie 4 - Virtual exuding product,2023,172,7.1,director_12,"[{'actorID': 'actor_62', 'characterID': 'char_...","[Drama, Romance, Sci-Fi]"


In [16]:
# Persist the data: one CSV file per entity table, written under data_path.
# Create the output directory first: to_csv() does not create missing
# directories, so a fresh checkout without ./data/ would otherwise fail here.
os.makedirs(data_path, exist_ok=True)

# NOTE(review): the list-valued movie columns (actorCharacterPairs, genres) are
# written as their Python string representation; confirm downstream readers
# expect that format.
for file_name, frame in (
    ("directors.csv", directors),
    ("actors.csv", actors),
    ("characters.csv", characters),
    ("movies.csv", movies),
):
    frame.to_csv(
        os.path.join(data_path, file_name),
        encoding="utf-8",
        escapechar="\\",
        index=False,
    )