# Shortest path between actors

What's the shortest path between two actors, via films they've acted together?

In [5]:
# Download the data

# curl -C - -O https://datasets.imdbws.com/name.basics.tsv.gz
# curl -C - -O https://datasets.imdbws.com/title.principals.tsv.gz
# curl -C - -O https://datasets.imdbws.com/title.basics.tsv.gz
# curl -C - -O https://datasets.imdbws.com/title.akas.tsv.gz

# Unzip the files, since the IMDB files have a bit of garbage at the end
# and Python's Pandas / gzip libraries can't read them.

In [6]:
# Load the titles
import pandas as pd
title = pd.read_csv('title.basics.tsv', sep='\t', low_memory=False).set_index('tconst')[['titleType', 'primaryTitle', 'startYear']]
title.head()

Unnamed: 0_level_0,titleType,primaryTitle,startYear
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0000001,short,Carmencita,1894
tt0000002,short,Le clown et ses chiens,1892
tt0000003,short,Pauvre Pierrot,1892
tt0000004,short,Un bon bock,1892
tt0000005,short,Blacksmith Scene,1893


In [7]:
# Load the actors in each film
actors = pd.read_csv('title.principals.tsv', sep='\t')[['tconst', 'nconst', 'category']]
# Only consider actors, not directors, composers, etc. Shrinks data to about 40%
actors = actors[actors.category.isin({'actor', 'actress'})]
actors.head()

Unnamed: 0,tconst,nconst,category
11,tt0000005,nm0443482,actor
12,tt0000005,nm0653042,actor
16,tt0000007,nm0179163,actor
17,tt0000007,nm0183947,actor
21,tt0000008,nm0653028,actor


In [8]:
# Only consider movies, not TV series, etc. Shrinks data to ~5%
movies = title[title['titleType'] == 'movie']
actors = actors[actors['tconst'].isin(movies.index)]
# This is what the network looks like
actors.head()

Unnamed: 0,tconst,nconst,category
24,tt0000009,nm0063086,actress
25,tt0000009,nm0183823,actor
26,tt0000009,nm1309758,actor
531,tt0000335,nm1010955,actress
532,tt0000335,nm1012612,actor


In [9]:
# Load the name data
name = pd.read_csv('name.basics.tsv', sep='\t').set_index('nconst')[['primaryName', 'birthYear']]

In [10]:
# Create a networkx graph from this
import networkx as nx
G = nx.from_pandas_edgelist(actors, 'tconst', 'nconst')

Here are some actors whose network we could explore.

Bollywood Actors

- Rajinikanth: nm0707425
- Kamal Haasan
- Chiranjeevi
- Govinda
- Jitendra
- Brahmanandam

Hollywood Actresses

- Anjelina Jolie: nm0001401
- Scarlett Johansson
- Jessica Alba
- Emma Watson
- Julia Roberts

Crossovers

- Priyanka Chopra (Baywatch)
- Deepika Padukone (Return of Xander Cage)
- Irrfan Khan (Inferno, Jurassic World, The Amazing Spider-Man, A Mighty Heart)
- Anil Kapoor (Mission Impossible)
- Amrish Puri (Indiana Jones)
- Anupam Kher (Bend it Like Beckham, Silver Linings Playbook, Lust)
- Om Puri (City of Joy, Wolf, Ghost and the Darkness, Charlie Wilson's War)

In [11]:
def path(source, target):
    '''Returns the shortest path between two actors'''
    source = name[name['primaryName'] == source].index[0]
    target = name[name['primaryName'] == target].index[0]
    return names(nx.shortest_path(G, source, target))

def names(path):
    '''Converts IDs to movie titles or actor names'''
    return [
        (title['primaryTitle'][p] if p.startswith('tt') else name['primaryName'][p])
        for p in path
    ]

In [12]:
def paths(source, target):
    '''Returns all the shortest paths between two actors'''
    source = name[name['primaryName'] == source].index[0]
    target = name[name['primaryName'] == target].index[0]
    return [names(p) for p in nx.all_shortest_paths(G, source, target)]

In [13]:
# How can we connect Manorama and Angelina Jolie?
# This lists the actor - movie - actor - movie - ... - actor
path('Aachi Manorama', 'Angelina Jolie')

['Aachi Manorama',
 'Rikshavodu',
 'Paresh Rawal',
 'What If?',
 'Irrfan Khan',
 'A Mighty Heart',
 'Angelina Jolie']

In [14]:
# Rajinikanth is 2 people away from Angelina
path('Rajinikanth', 'Angelina Jolie')

['Rajinikanth',
 'Andhaa Kaanoon',
 'Amitabh Bachchan',
 'Piku',
 'Irrfan Khan',
 'A Mighty Heart',
 'Angelina Jolie']

In [15]:
# Brahmanandam is 3 people away from Katherine Hepburn
path('Brahmanandam', 'Katharine Hepburn')

['Brahmanandam',
 'Little Soldiers',
 'Rohini Hattangadi',
 'Gandhi',
 'John Gielgud',
 'Secret Agent',
 'Robert Young',
 'Spitfire',
 'Katharine Hepburn']

In [16]:
# Govinda is 2 people away from Angelina Jolie
path('Govinda', 'Angelina Jolie')

['Govinda',
 'Awaargi',
 'Anil Kapoor',
 'Besan',
 'Irrfan Khan',
 'A Mighty Heart',
 'Angelina Jolie']

In [17]:
# These are the paths from Gerard Butler and Priyanka Chopra
# (He wants to act with her)
len(paths('Gerard Butler', 'Priyanka Chopra')), path('Gerard Butler', 'Priyanka Chopra')

(39,
 ['Gerard Butler',
  "Na Nai'a: Legend of the Dolphins",
  'Whoopi Goldberg',
  'Incredible Love',
  'Akshay Kumar',
  'Style',
  'Priyanka Chopra'])

In [18]:
# Stallone wants to act with Salman
paths('Sylvester Stallone', 'Salman Khan')

[['Sylvester Stallone',
  'Incredible Love',
  'Akshay Kumar',
  'Mujhse Shaadi Karogi',
  'Salman Khan'],
 ['Sylvester Stallone',
  'Incredible Love',
  'Akshay Kumar',
  "Jaan-E-Mann: Let's Fall in Love... Again",
  'Salman Khan']]

In [19]:
# Kristen wants to act with Hrithik
paths('Kristen Stewart', 'Hrithik Roshan')

[['Kristen Stewart',
  'The Messengers',
  'Dylan McDermott',
  'The Mistress of Spices',
  'Aishwarya Rai Bachchan',
  'Dhoom 2',
  'Hrithik Roshan'],
 ['Kristen Stewart',
  'The Messengers',
  'Dylan McDermott',
  'The Mistress of Spices',
  'Aishwarya Rai Bachchan',
  'Jodhaa Akbar',
  'Hrithik Roshan'],
 ['Kristen Stewart',
  'The Messengers',
  'Dylan McDermott',
  'The Mistress of Spices',
  'Aishwarya Rai Bachchan',
  'Guzaarish',
  'Hrithik Roshan']]

In [20]:
# Daniel Radcliffe wants to act with Shah Rukh Khan
len(paths('Daniel Radcliffe', 'Shah Rukh Khan')), path('Daniel Radcliffe', 'Shah Rukh Khan')

(254,
 ['Daniel Radcliffe',
  "Harry Potter and the Sorcerer's Stone",
  'Richard Harris',
  'The Molly Maguires',
  'Sean Connery',
  'The Man Who Would Be King',
  'Saeed Jaffrey',
  'English Babu Desi Mem',
  'Shah Rukh Khan'])

In [21]:
# Let's explore the co-stars of Angelina (nm0001401)
from collections import Counter
c = Counter()
for movie in G['nm0001401']:
    for n in G[movie]:
        c[name.primaryName[n]] += 1

# These are the top co-stars across her 40 films
c.most_common(10)

[('Angelina Jolie', 40),
 ('Jack Black', 3),
 ('Dustin Hoffman', 3),
 ('Giovanni Ribisi', 2),
 ('Robert De Niro', 2),
 ('Brad Pitt', 2),
 ('Elle Fanning', 2),
 ('Bryan Cranston', 2),
 ('Jonny Lee Miller', 1),
 ('Jesse Bradford', 1)]

In [22]:
# Let's explore the co-stars of Sridevi (nm0004437)
c = Counter()
for movie in G['nm0004437']:
    for n in G[movie]:
        c[name.primaryName[n]] += 1
c.most_common(10)

[('Sridevi', 246),
 ('Krishna Ghattamaneni', 27),
 ('Rajinikanth', 24),
 ('Kamal Haasan', 23),
 ('Gummadi', 21),
 ('Satyanarayana Kaikala', 19),
 ('Akkineni Nageshwara Rao', 16),
 ('Jeetendra', 14),
 ('Taraka Rama Rao Nandamuri', 14),
 ('Jaya Prada', 13)]

In [23]:
# Let's explore the co-stars of Sridevi (nm0000821)
c = Counter()
for movie in G['nm0000821']:
    for n in G[movie]:
        c[name.primaryName[n]] += 1
c.most_common(10)

[('Amitabh Bachchan', 184),
 ('Hema Malini', 14),
 ('Shashi Kapoor', 12),
 ('Jaya Bhaduri', 10),
 ('Rekha', 10),
 ('Rakhee Gulzar', 10),
 ('Shatrughan Sinha', 8),
 ('Pran', 8),
 ('Parveen Babi', 8),
 ('Ajay Devgn', 8)]

# Analyzing the social network of actors

This script creates the network of actors to upload into [Kumu](https://kumu.io/sanand0/actor-pairs).

In [28]:
# Optionally, restrict data to just a single region (e.g. IN, US, etc)
# This loads the region for each title
region = pd.read_csv('title.akas.tsv', sep='\t').set_index('titleId')['region']
region.value_counts().head(10)

  interactivity=interactivity, compiler=compiler, result=result)


FR    2355333
DE    2316863
JP    2305757
ES    2305604
IT    2294063
IN    2258834
PT    2257105
US     865708
\N     456434
GB     185940
Name: region, dtype: int64

In [29]:
def get_pairs(lang=None, min_acted=25, min_pairs=4):
    '''
    Returns a DataFrame of actor pairs where:
    - Each actor has acted in at least min_acted films
    - The two actors have acted together in at least min_pairs films
    - And (optionally), belong to a region `lang` (IN, UN, etc)
    '''
    graph = actors
    if lang is not None:
        graph = graph[graph['tconst'].isin(region[region == lang].index)]
    name_freq = graph['nconst'].value_counts()
    top_names = name_freq[name_freq > min_acted]
    top_actors = graph[graph['nconst'].isin(top_names.index)]

    p = top_actors.copy()
    p['title'] = p['tconst'].astype('category')
    p['name'] = p['nconst'].astype('category')

    row = p['title'].cat.codes.values
    col = p['name'].cat.codes.values
    data = pd.np.ones(len(p), dtype='int')

    from scipy.sparse import csr_matrix
    matrix = csr_matrix((data, (row, col)))
    square = matrix.T * matrix
    square.setdiag(0)
    square = square.tocoo()

    pairs = pd.DataFrame({
        'row': square.row,
        'col': square.col,
        'n': square.data
    })
    pairs = pairs[pairs.n > min_pairs]

    cat = p['name'].cat.categories
    pairs = pd.concat([
        pairs,
        pairs.row.apply(lambda v: name.loc[cat[v]]),
        pairs.col.apply(lambda v: name.loc[cat[v]]),
    ],
                      axis=1)
    pairs = pairs.drop(columns=['row', 'col'])
    pairs.columns = ['count', 'name1', 'year1', 'name2', 'year2']
    return pairs.sort_values('count', ascending=False)

In [30]:
# Fix renames in lang=US
name.loc['nm0408868'] = name.loc['nm0408867']  # Milton Ingley

# Fix renames in lang=IN
name.loc['nm9347849'] = name.loc['nm8055229']  # K.V. Shanthi
name.loc['nm1275349'] = name.loc['nm1025717']  # Rajlakshmi Devi
name.loc['nm7940084'] = name.loc['nm1679381']  # Suruli Rajan
name.loc['nm6093386'] = name.loc['nm1153351']  # Sumithra

In [31]:
pairs = get_pairs(lang='IN', min_acted=10)

In [34]:
pairs[(pairs.year1 > '1980') & (pairs.year1 < '2020')].head(10)

Unnamed: 0,count,name1,year1,name2,year2
40476,17,Kavya Madhavan,1984,Dileep,1968
172579,12,Koyel Mallick,1982,Jeet,1978
51285,12,Prithviraj Sukumaran,1982,Jagathi Sreekumar,1951
98249,10,Trisha Krishnan,1983,Prakash Raj,1965
180911,9,Monalisa,1982,Pawan Singh,\N
68049,8,Monalisa,1982,Ravi Kishan,1971
98163,8,Tamannaah Bhatia,1989,Prakash Raj,1965
27578,8,Aarti Agarwal,1984,Brahmanandam,1956
132281,8,Prithviraj Sukumaran,1982,Indrajith Sukumaran,1980
149438,7,Kavya Madhavan,1984,Jayasurya,\N
