# Analyzing the social network of actors

This notebook builds the network of actor pairs from the [IMDb datasets](https://datasets.imdbws.com/) for upload into [Kumu](https://kumu.io/sanand0/actor-pairs).

In [1]:
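# Download the data: run these commands once in a shell (`-C -` resumes an interrupted download, `-O` keeps the remote file name)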

# curl -C - -O https://datasets.imdbws.com/name.basics.tsv.gz
# curl -C - -O https://datasets.imdbws.com/title.principals.tsv.gz
# curl -C - -O https://datasets.imdbws.com/title.basics.tsv.gz
# curl -C - -O https://datasets.imdbws.com/title.akas.tsv.gz

In [1]:
import gc
import pandas as pd
from scipy.sparse import csr_matrix

In [2]:
# Read relevant data from the titles.
# usecols restricts parsing to the index column plus the three columns that are kept,
# so the unused columns of this large file are never loaded into memory.
title = pd.read_csv(
    'title.basics.tsv.gz',
    sep='\t',
    usecols=['tconst', 'titleType', 'primaryTitle', 'startYear'],
    low_memory=False,
).set_index('tconst')[['titleType', 'primaryTitle', 'startYear']]
title.head()

Unnamed: 0_level_0,titleType,primaryTitle,startYear
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0000001,short,Carmencita,1894
tt0000002,short,Le clown et ses chiens,1892
tt0000003,short,Pauvre Pierrot,1892
tt0000004,short,Un bon bock,1892
tt0000005,short,Blacksmith Scene,1893


In [3]:
# Read relevant data from the cast of the films.
# usecols restricts parsing to the three columns that are kept; the trailing selection
# pins the column order regardless of the order in the file.
principals = pd.read_csv(
    'title.principals.tsv.gz',
    sep='\t',
    usecols=['tconst', 'nconst', 'category'],
)[['tconst', 'nconst', 'category']]
principals.head()

Unnamed: 0,tconst,nconst,category
0,tt0000001,nm1588970,self
1,tt0000001,nm0005690,director
2,tt0000001,nm0374658,cinematographer
3,tt0000002,nm0721526,director
4,tt0000002,nm1335271,composer


In [4]:
# Keep only acting credits (category 'actor' or 'actress'); directors, composers, etc.
# are dropped. Shrinks data to about 40%
actors = principals.loc[principals['category'].isin(['actor', 'actress'])]

In [5]:
# Restrict to titles whose titleType is 'movie' (not TV series, etc.), then keep only the
# acting credits that belong to one of those titles. Shrinks data to ~5%
movies = title.loc[title.titleType.eq('movie')]
actors = actors.loc[actors.tconst.isin(movies.index)]

In [6]:
# Delete the original data to save memory.
# Only the name `principals` is removed; `actors`, the filtered subset built from it,
# stays available.
del principals

In [7]:
# Optionally, restrict data to just a single region (e.g. IN, US, etc)
# This loads the region for each title.
# - usecols / dtype=str: only the two columns used are parsed, both as strings, so the
#   remaining columns are never loaded and cannot raise a mixed-type DtypeWarning.
# - keep_default_na=False with na_values=['']: pandas' default NA strings include 'NA',
#   which would turn the region code 'NA' (Namibia's ISO code) into NaN. Only empty
#   fields are treated as missing here.
region = pd.read_csv(
    'title.akas.tsv.gz',
    sep='\t',
    usecols=['titleId', 'region'],
    dtype=str,
    keep_default_na=False,
    na_values=[''],
).set_index('titleId')['region']
region.value_counts().head(20)

  interactivity=interactivity, compiler=compiler, result=result)


FR     2355333
DE     2316863
JP     2305757
ES     2305604
IT     2294063
IN     2258834
PT     2257105
US      865708
\N      456434
GB      185940
XWW     114957
CA       94768
BR       73494
GR       69467
FI       61782
HU       55978
AU       53826
RU       52953
PL       47398
SE       45541
Name: region, dtype: int64

In [8]:
# Read name data. This maps the person ID (nconst) to their name and year of birth.
# usecols restricts parsing to the index column plus the two columns that are kept,
# so the unused columns are never loaded into memory.
name = pd.read_csv(
    'name.basics.tsv.gz',
    sep='\t',
    usecols=['nconst', 'primaryName', 'birthYear'],
).set_index('nconst')[['primaryName', 'birthYear']]

In [13]:
# Fix renames in lang=US
# Copies the primaryName/birthYear row of nm0408867 onto nm0408868 so both IDs carry the
# same details. NOTE(review): presumably two IMDb IDs for the same person — confirm.
name.loc['nm0408868'] = name.loc['nm0408867']  # Milton Ingley

In [27]:
# Fix renames in lang=IN
# Each entry is (ID to overwrite, ID to copy from): the primaryName/birthYear row of the
# second ID is assigned to the first, in the order listed.
for renamed_id, source_id in [
    ('nm9347849', 'nm8055229'),  # K.V. Shanthi
    ('nm1275349', 'nm1025717'),  # Rajlakshmi Devi
    ('nm7940084', 'nm1679381'),  # Suruli Rajan
    ('nm6093386', 'nm1153351'),  # Sumithra
]:
    name.loc[renamed_id] = name.loc[source_id]

In [28]:
# Optional: filter by region
# NOTE(review): get_pairs is not defined in the cells shown here — it must already exist
# in the kernel before this cell runs. Judging by their names, min_acted and min_pairs
# are presumably minimum-count thresholds — confirm against the definition.
pairs = get_pairs(lang='IN', min_acted=10, min_pairs=3)

In [29]:
# Show pairs whose first actor was born after 1960 and before 2020 (i.e. 1961-2019).
# Years are compared as numbers rather than as strings: values that are not numeric,
# such as the '\N' placeholder, become NaN and are therefore excluded.
pairs[pd.to_numeric(pairs['year1'], errors='coerce').between(1961, 2019)]

Unnamed: 0,count,name1,year1,name2,year2
172971,32,Siddhanta Mahapatra,1966,Bijoy Mohanty,\N
173124,32,Siddhanta Mahapatra,1966,Mihir Das,\N
31578,28,Prasenjit Chatterjee,1962,Subhendu Chatterjee,1936
43315,28,Jaya Prada,1962,Krishna Ghattamaneni,1944
27718,27,Ramya Krishnan,1970,Brahmanandam,1956
43325,26,Sridevi,1963,Krishna Ghattamaneni,1944
32380,25,Vijayshanti,1966,Chiranjeevi,1955
31453,25,Prasenjit Chatterjee,1962,Soumitra Chatterjee,1935
83199,24,Jaya Prada,1962,Mohan Babu,1950
27654,24,Soundarya,1972,Brahmanandam,1956


In [30]:
# Save the pairs to an Excel workbook; index=False leaves the row index out of the sheet
pairs.to_excel('pairs.IN.xlsx', index=False)