In [175]:
import pandas as pd
from json import loads

# Read the TMDB credits table and decode the JSON text held in the "cast" and
# "crew" columns into Python objects, keeping the original column order.
df = pd.read_csv("tmdb_5000_credits.csv").assign(
    cast=lambda frame: frame["cast"].apply(loads),
    crew=lambda frame: frame["crew"].apply(loads),
)

In [176]:
# Preview the first row to check which columns were loaded.
df.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de..."


In [177]:
# Keep only the rows whose crew list and cast list are both non-empty,
# i.e. drop a row as soon as either list has length 0.
df = df[(df["crew"].str.len() != 0) & (df["cast"].str.len() != 0)]

In [178]:
# Show the cast entry stored under index label 0.
df.loc[0, "cast"]

[{'cast_id': 242,
  'character': 'Jake Sully',
  'credit_id': '5602a8a7c3a3685532001c9a',
  'gender': 2,
  'id': 65731,
  'name': 'Sam Worthington',
  'order': 0},
 {'cast_id': 3,
  'character': 'Neytiri',
  'credit_id': '52fe48009251416c750ac9cb',
  'gender': 1,
  'id': 8691,
  'name': 'Zoe Saldana',
  'order': 1},
 {'cast_id': 25,
  'character': 'Dr. Grace Augustine',
  'credit_id': '52fe48009251416c750aca39',
  'gender': 1,
  'id': 10205,
  'name': 'Sigourney Weaver',
  'order': 2},
 {'cast_id': 4,
  'character': 'Col. Quaritch',
  'credit_id': '52fe48009251416c750ac9cf',
  'gender': 2,
  'id': 32747,
  'name': 'Stephen Lang',
  'order': 3},
 {'cast_id': 5,
  'character': 'Trudy Chacon',
  'credit_id': '52fe48009251416c750ac9d3',
  'gender': 1,
  'id': 17647,
  'name': 'Michelle Rodriguez',
  'order': 4},
 {'cast_id': 8,
  'character': 'Selfridge',
  'credit_id': '52fe48009251416c750ac9e1',
  'gender': 2,
  'id': 1771,
  'name': 'Giovanni Ribisi',
  'order': 5},
 {'cast_id': 7,
  'c

In [179]:
# Show the crew entry stored under index label 0.
df.loc[0, "crew"]

[{'credit_id': '52fe48009251416c750aca23',
  'department': 'Editing',
  'gender': 0,
  'id': 1721,
  'job': 'Editor',
  'name': 'Stephen E. Rivkin'},
 {'credit_id': '539c47ecc3a36810e3001f87',
  'department': 'Art',
  'gender': 2,
  'id': 496,
  'job': 'Production Design',
  'name': 'Rick Carter'},
 {'credit_id': '54491c89c3a3680fb4001cf7',
  'department': 'Sound',
  'gender': 0,
  'id': 900,
  'job': 'Sound Designer',
  'name': 'Christopher Boyes'},
 {'credit_id': '54491cb70e0a267480001bd0',
  'department': 'Sound',
  'gender': 0,
  'id': 900,
  'job': 'Supervising Sound Editor',
  'name': 'Christopher Boyes'},
 {'credit_id': '539c4a4cc3a36810c9002101',
  'department': 'Production',
  'gender': 1,
  'id': 1262,
  'job': 'Casting',
  'name': 'Mali Finn'},
 {'credit_id': '5544ee3b925141499f0008fc',
  'department': 'Sound',
  'gender': 2,
  'id': 1729,
  'job': 'Original Music Composer',
  'name': 'James Horner'},
 {'credit_id': '52fe48009251416c750ac9c3',
  'department': 'Directing',
  

## Data preprocessing

In [180]:
# Distinct (id, name) pairs for every crew entry whose job is "Director",
# gathered across all rows; the set stores each pair only once.
directors = {
    (member["id"], member["name"])
    for crew_list in df["crew"]
    for member in crew_list
    if member["job"] == "Director"
}
list(directors)[:10]

[(15868, 'Uli Edel'),
 (76100, 'Ben Stassen'),
 (2042, 'Stephen Hopkins'),
 (65314, 'Giuseppe Tornatore'),
 (72197, 'Ferzan Ozpetek'),
 (586066, 'Doug Block'),
 (78160, 'Luca Guadagnino'),
 (11558, 'Eugène Lourié'),
 (62556, 'Fred Walton'),
 (71042, 'Jay Levey')]

In [181]:
# One (movie_id, director_id) pair per movie: the id of the first crew entry
# whose job is "Director". Rows without any director produce no pair.
# Going through a dict keyed by movie_id means a repeated movie_id keeps its
# first position and takes the value from the last row that has a director.
director_relations = list({
    movie_id: director_ids[0]
    for movie_id, director_ids in zip(
        df["movie_id"],
        (
            [member["id"] for member in crew_list if member["job"] == "Director"]
            for crew_list in df["crew"]
        ),
    )
    if director_ids
}.items())
director_relations[:10]

[(19995, 2710),
 (285, 1704),
 (206647, 39),
 (49026, 525),
 (49529, 7),
 (559, 7623),
 (38757, 76595),
 (99861, 12891),
 (767, 11343),
 (209112, 15217)]

In [182]:
# (id, name) pairs taken from the first three cast entries of every movie.
# The same person can be among the first three in several movies, so the pairs
# are de-duplicated; otherwise actors.csv would contain one row per appearance
# instead of one row per actor. dict.fromkeys keeps first-seen order, so the
# result is still a list in a stable order.
actors = list(dict.fromkeys(
    (member["id"], member["name"])
    for cast_list in df["cast"]
    for member in cast_list[:3]
))
actors[:10]

[(65731, 'Sam Worthington'),
 (8691, 'Zoe Saldana'),
 (10205, 'Sigourney Weaver'),
 (85, 'Johnny Depp'),
 (114, 'Orlando Bloom'),
 (116, 'Keira Knightley'),
 (8784, 'Daniel Craig'),
 (27319, 'Christoph Waltz'),
 (121529, 'Léa Seydoux'),
 (3894, 'Christian Bale')]

In [195]:
# Distinct (movie_id, actor_id) pairs linking each movie to the ids of its
# first three cast entries; built as a set to drop repeats, then made a list.
actor_relations = list({
    (movie_id, member["id"])
    for movie_id, cast_list in zip(df["movie_id"], df["cast"])
    for member in cast_list[:3]
})
# Preview the last ten pairs.
actor_relations[-10:]

[(2069, 14408),
 (4379, 6352),
 (90, 776),
 (11888, 5563),
 (807, 287),
 (16353, 49609),
 (10152, 29020),
 (14758, 50095),
 (37028, 116619),
 (165864, 56676)]

## Output to CSV

In [184]:
import csv

In [185]:
def output(name, data, header):
    """Write ``header`` followed by every row in ``data`` to ``<name>.csv``.

    Args:
        name: Output path without the ``.csv`` suffix; the suffix is appended.
        data: Iterable of rows, each row being a sequence of field values.
        header: Sequence of column names, written as the first line.

    The file is always UTF-8 encoded, comma-delimited, and uses ``\\n`` as the
    line terminator regardless of platform. An existing file is overwritten.
    """
    # newline='' lets the csv writer control line endings itself; without it,
    # text mode on Windows would turn the '\n' terminator into '\r\n'.
    # An explicit encoding keeps non-ASCII names from depending on the locale.
    with open(name + '.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter=',', lineterminator='\n')
        writer.writerow(header)
        writer.writerows(data)

In [198]:
# Write the node tables (directors, actors) and the relationship tables to CSV
# files in the working directory.
# `directors` is a set of tuples that contain strings, so its iteration order
# can change from one interpreter session to the next; sorting it gives
# directors.csv the same row order on every run.
output("directors", sorted(directors), ('id', 'name'))
output("actors", actors, ('id', 'name'))
output("director_relations", director_relations, ('movie_id', 'director_id'))
output("actor_relations", actor_relations, ('movie_id', 'actor_id'))