In [None]:
!pip install scikit-network
import pandas as pd
import numpy as np
import sknetwork.clustering
import sknetwork.utils
from scipy.sparse import csr_matrix



In [None]:
# Download the data
# Fetches four gzipped TSV dumps from datasets.imdbws.com into the current
# working directory. Any *.tsv.gz already present is deleted first, so every
# run re-downloads all four files.
!rm -f *.tsv.gz
# -q silences wget's progress output.
!wget -q https://datasets.imdbws.com/name.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.principals.tsv.gz
!wget -q https://datasets.imdbws.com/title.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.akas.tsv.gz
# List the directory so the downloaded files and their sizes can be checked.
!ls -la

total 977300
drwxr-xr-x 1 root root      4096 Oct 26 17:33 .
drwxr-xr-x 1 root root      4096 Oct 26 17:24 ..
drwxr-xr-x 4 root root      4096 Oct  8 13:44 .config
-rw-r--r-- 1 root root 220718368 Oct 26 13:31 name.basics.tsv.gz
drwxr-xr-x 1 root root      4096 Oct  8 13:45 sample_data
-rw-r--r-- 1 root root 253120904 Oct 26 13:31 title.akas.tsv.gz
-rw-r--r-- 1 root root 147415282 Oct 25 13:21 title.basics.tsv.gz
-rw-r--r-- 1 root root 379458664 Oct 26 13:31 title.principals.tsv.gz


In [None]:
# Load the titles, indexed by tconst.
# Only the columns used later are parsed (usecols), so the remaining
# title.basics columns never have to be held in memory.
title = pd.read_csv(
    'title.basics.tsv.gz',
    sep='\t',
    usecols=['tconst', 'titleType', 'primaryTitle', 'startYear'],
    low_memory=False,
).set_index('tconst')[['titleType', 'primaryTitle', 'startYear']]
title.head()

Unnamed: 0_level_0,titleType,primaryTitle,startYear
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0000001,short,Carmencita,1894
tt0000002,short,Le clown et ses chiens,1892
tt0000003,short,Pauvre Pierrot,1892
tt0000004,short,Un bon bock,1892
tt0000005,short,Blacksmith Scene,1893


In [None]:
# Load the cast of each film.
# title.principals is the largest download; parse only the three columns that
# are kept instead of reading every column and discarding most of them.
cast = pd.read_csv(
    'title.principals.tsv.gz',
    sep='\t',
    usecols=['tconst', 'nconst', 'category'],
)[['tconst', 'nconst', 'category']]
# Only consider actors, not directors, composers, etc. Shrinks data to about 40%
cast = cast[cast['category'].isin({'actor', 'actress'})]
cast.head()

Unnamed: 0,tconst,nconst,category
11,tt0000005,nm0443482,actor
12,tt0000005,nm0653042,actor
16,tt0000007,nm0179163,actor
17,tt0000007,nm0183947,actor
21,tt0000008,nm0653028,actor


In [None]:
# Keep feature films only (titleType == 'movie'); TV series, shorts, etc. are
# dropped. Shrinks data to ~5%
movies = title.loc[title['titleType'].eq('movie')]
cast = cast.loc[cast['tconst'].isin(movies.index)]
# Preview of the film -> actor edge list that the network is built from
cast.head()

Unnamed: 0,tconst,nconst,category
848,tt0000502,nm0215752,actor
849,tt0000502,nm0252720,actor
1099,tt0000591,nm0906197,actor
1100,tt0000591,nm0332182,actor
1101,tt0000591,nm1323543,actress


In [None]:
# Explore the regions we have data for (e.g. IN, US, etc)
# `region` is a Series of region codes indexed by titleId; a title appears once
# per akas row, so the index is not unique.
# Only the two columns that are used get parsed (usecols).
# NOTE(review): pandas' default NA parsing turns a literal 'NA' field into NaN,
# so a region code spelled 'NA' would be lost here — confirm whether it matters.
region = pd.read_csv(
    'title.akas.tsv.gz',
    sep='\t',
    usecols=['titleId', 'region'],
    low_memory=False,
).set_index('titleId')['region']
region.value_counts().head(10)

FR    3487957
JP    3482526
DE    3445185
ES    3412919
IN    3406529
IT    3395074
PT    3336232
\N    1939891
US    1303011
GB     405490
Name: region, dtype: int64

In [None]:
# Load the name data along with birth year, indexed by nconst.
# '\N' is IMDb's missing-value marker; birthYear is read as float so missing
# years become NaN. Only the columns that are kept get parsed (usecols).
name = pd.read_csv(
    'name.basics.tsv.gz',
    sep='\t',
    na_values='\\N',
    usecols=['nconst', 'primaryName', 'birthYear'],
    dtype={'birthYear': float},
).set_index('nconst')[['primaryName', 'birthYear']]
# Number of credits per person in `cast` as it stands when this cell runs
name_freq = cast['nconst'].value_counts()

In [None]:
def get_pairs(lang=None, min_acted=25, min_pairings=4):
    '''
    Returns an edge list of co-starring actors plus the actor lookup table.

    Reads the notebook-level `cast`, `region`, `name_freq` and `name` objects.

    Parameters:
    - lang: optional region code (IN, US, etc). When given, only films that
      have a row with that code in `region` are considered.
    - min_acted: keep only actors whose count in `name_freq` is at least this.
      `name_freq` is computed from the whole `cast` table, so the threshold is
      not restricted to the chosen region.
    - min_pairings: keep only pairs that share at least this many films.

    Returns (pairs, actors):
    - pairs: DataFrame with columns `row`, `col`, `n`. `row` and `col` are
      positions into `actors` and `n` is the number of shared films. Both
      (a, b) and (b, a) are present; self-pairs are never included.
    - actors: rows of `name` (primaryName, birthYear) for every selected
      actor, ordered so that position i corresponds to code i in `pairs`.
    '''
    graph = cast
    if lang is not None:
        graph = graph[graph['tconst'].isin(region[region == lang].index)]
    top_names = name_freq[name_freq >= min_acted]
    top_actors = graph[graph['nconst'].isin(top_names.index)]

    # One row per (film, actor): a repeated credit on the same film must not
    # count as several shared films.
    p = top_actors.drop_duplicates(subset=['tconst', 'nconst']).copy()
    p['title'] = p['tconst'].astype('category')
    p['name'] = p['nconst'].astype('category')
    actors = name.reindex(p['name'].cat.categories)

    # Nothing matched the filters: csr_matrix cannot infer a shape from empty
    # index arrays, so return an empty edge list instead of raising.
    if p.empty:
        no_pairs = pd.DataFrame({
            'row': np.array([], dtype='int32'),
            'col': np.array([], dtype='int32'),
            'n': np.array([], dtype='int'),
        })
        return no_pairs, actors

    row = p['title'].cat.codes.values
    col = p['name'].cat.codes.values
    data = np.ones(len(p), dtype='int')

    # Film x actor incidence matrix; its Gram matrix counts shared films for
    # every actor pair (the diagonal holds each actor's own film count).
    shape = (len(p['title'].cat.categories), len(p['name'].cat.categories))
    matrix = csr_matrix((data, (row, col)), shape=shape)
    square = (matrix.T @ matrix).tocoo()

    pairs = pd.DataFrame({
        'row': square.row,
        'col': square.col,
        'n': square.data
    })
    # Drop self-pairs explicitly rather than relying on a zeroed diagonal
    # being removed by the threshold (which fails for min_pairings <= 0).
    keep = (pairs['row'] != pairs['col']) & (pairs['n'] >= min_pairings)
    pairs = pairs[keep].reset_index(drop=True)
    return pairs, actors

def lookup(pairs, cat):
    '''
    Returns the pairs with actor names and birth years instead of positions.

    Parameters:
    - pairs: DataFrame with at least the columns `row`, `col` and `n`, where
      `row` and `col` are positions into `cat`. Any other columns are ignored.
    - cat: DataFrame whose first column is the actor name and whose second
      column is the birth year, in the order matching those positions.

    Returns a new DataFrame with columns count, name1, year1, name2, year2,
    carrying the index labels of `pairs` and sorted by count, largest first.
    `pairs` itself is not modified.
    '''
    first = cat.iloc[pairs['row'].to_numpy()]
    second = cat.iloc[pairs['col'].to_numpy()]
    # Build from plain arrays so nothing is aligned on index labels; this
    # keeps the result correct even when `pairs` has a non-default index.
    named = pd.DataFrame({
        'count': pairs['n'].to_numpy(),
        'name1': first.iloc[:, 0].to_numpy(),
        'year1': first.iloc[:, 1].to_numpy(),
        'name2': second.iloc[:, 0].to_numpy(),
        'year2': second.iloc[:, 1].to_numpy(),
    }, index=pairs.index)
    return named.sort_values('count', ascending=False)

In [None]:
# Co-star pairs among films that have an 'IN' row in the region table, keeping
# actors with a name_freq of 3 or more and every pairing (1+ shared film).
# NOTE(review): the result includes pairs such as Sylvester Stallone / Graham
# McTavish, so the 'IN' filter presumably matches any film with an Indian
# release entry rather than only Indian productions — confirm this is intended.
pairs_in, cat_in = get_pairs('IN', min_acted=3, min_pairings=1)
lookup(pairs=pairs_in, cat=cat_in)

Unnamed: 0,count,name1,year1,name2,year2
61465,197,Adoor Bhasi,1929.0,Bahadur,
41395,197,Bahadur,,Adoor Bhasi,1929.0
41375,171,Prem Nazir,1926.0,Adoor Bhasi,1929.0
204405,171,Adoor Bhasi,1929.0,Prem Nazir,1926.0
137831,153,Adoor Bhasi,1929.0,Jayabharati,
...,...,...,...,...,...
184880,1,Vic Trevino,1960.0,Jillian McWhirter,1962.0
184879,1,Sylvester Stallone,1946.0,Graham McTavish,1961.0
184878,1,Julie Benz,1972.0,Graham McTavish,1961.0
184877,1,Matthew Marsden,1973.0,Graham McTavish,1961.0


In [None]:
# Same extraction for films that have a 'US' row in the region table: actors
# with a name_freq of 3 or more, every pairing (1+ shared film).
pairs_us, cat_us = get_pairs('US', min_acted=3, min_pairings=1)
lookup(pairs=pairs_us, cat=cat_us)

Unnamed: 0,count,name1,year1,name2,year2
469858,59,Leo Gorcey,1917.0,Huntz Hall,1920.0
448659,59,Huntz Hall,1920.0,Leo Gorcey,1917.0
44653,51,Tom Byron,1961.0,Peter North,1957.0
68840,51,Peter North,1957.0,Tom Byron,1961.0
255316,50,Gene Autry,1907.0,Smiley Burnette,1911.0
...,...,...,...,...,...
455971,1,Laura Hope Crews,1879.0,Shirley Grey,1902.0
455970,1,Zasu Pitts,1894.0,Shirley Grey,1902.0
455969,1,Slim Summerville,1892.0,Shirley Grey,1902.0
455968,1,John Halliday,1884.0,Shirley Grey,1902.0


In [None]:
# Raw edge list for the 'IN' network: `row` and `col` are positions into
# cat_in, `n` is the number of shared films.
# NOTE(review): the saved output of this cell shows rowcluster/colcluster
# columns, which get_pairs() does not produce; they are only added further
# down by connectedness(). The cell was therefore last run out of order —
# re-run the notebook top to bottom before trusting the displayed output.
pairs_in

Unnamed: 0,row,col,n,rowcluster,colcluster
0,12156,0,1,0,0
1,10326,0,1,0,0
2,1447,0,1,0,0
3,3874,0,1,32,0
4,1518,0,1,32,0
...,...,...,...,...,...
497315,29544,29994,1,2,2
497316,27669,29994,1,2,2
497317,27312,29994,1,2,2
497318,18339,29994,1,2,2


In [None]:
# Detect communities in the 'IN' co-star network with Louvain clustering.
# NOTE(review): scikit-network is installed without a version pin; confirm
# that edgelist2adjacency and Louvain.fit_transform (returning one label per
# node) behave this way in the installed release.
algo = sknetwork.clustering.Louvain()
adjacency = sknetwork.utils.edgelist2adjacency(pairs_in)
labels = algo.fit_transform(adjacency)

# One row per actor: the id (moved into an 'index' column by reset_index),
# name and birth year, followed by the assigned cluster label and the
# actor's credit count from name_freq.
clusters_in = cat_in.reset_index()
clusters_in = pd.concat([
    clusters_in,
    pd.Series(labels, name='cluster'),
    clusters_in['index'].map(name_freq).rename('freq'),
], axis=1)
clusters_in


Unnamed: 0,index,primaryName,birthYear,cluster,freq
0,nm0000001,Fred Astaire,1899.0,0,35
1,nm0000002,Lauren Bacall,1924.0,0,37
2,nm0000003,Brigitte Bardot,1934.0,0,35
3,nm0000004,John Belushi,1949.0,0,7
4,nm0000006,Ingrid Bergman,1915.0,0,42
...,...,...,...,...,...
29990,nm9986430,Sebastian Cabanas,,157,3
29991,nm9987095,Santo Krishnan,,5,15
29992,nm9989234,Happy BanMajra,,10,4
29993,nm9990758,M.S. Namboothiri,,5,6


In [None]:
# Twenty actors with the highest `freq` among those assigned to cluster 0
clusters_in.loc[clusters_in['cluster'].eq(0)].sort_values('freq', ascending=False).head(20)

Unnamed: 0,index,primaryName,birthYear,cluster,freq
501,nm0000616,Eric Roberts,1956.0,0,291
7120,nm0369058,Raymond Hatton,1887.0,0,201
304,nm0000367,Gérard Depardieu,1948.0,0,159
422,nm0000514,Michael Madsen,1957.0,0,158
14830,nm0945189,Simon Yam,1955.0,0,153
1308,nm0001744,Tom Sizemore,1961.0,0,139
63,nm0000078,John Wayne,1907.0,0,139
5935,nm0273178,Fernando Fernán Gómez,1921.0,0,139
8928,nm0490489,Andy Lau,1961.0,0,135
1348,nm0001803,Danny Trejo,1944.0,0,131


In [None]:
# Twenty actors with the highest `freq` among those assigned to cluster 1
clusters_in.loc[clusters_in['cluster'].eq(1)].sort_values('freq', ascending=False).head(20)

Unnamed: 0,index,primaryName,birthYear,cluster,freq
4192,nm0149822,Mithun Chakraborty,1950.0,1,326
2052,nm0007106,Shakti Kapoor,1952.0,1,318
10515,nm0613417,Raza Murad,1950.0,1,241
8680,nm0474820,Kiran Kumar,1953.0,1,230
8355,nm0451600,Anupam Kher,1955.0,1,207
7945,nm0430803,Mohan Joshi,1945.0,1,206
20789,nm2147526,Asrani,1941.0,1,199
1612,nm0004109,Gulshan Grover,1955.0,1,198
633,nm0000821,Amitabh Bachchan,1942.0,1,192
2766,nm0044796,Raj Babbar,1952.0,1,177


In [None]:
# Twenty actors with the highest `freq` among those assigned to cluster 2
clusters_in.loc[clusters_in['cluster'].eq(2)].sort_values('freq', ascending=False).head(20)

Unnamed: 0,index,primaryName,birthYear,cluster,freq
3627,nm0103977,Brahmanandam,1956.0,2,797
11631,nm0695177,Prakash Raj,1965.0,2,246
2281,nm0019382,Ali,1968.0,2,243
10689,nm0621937,Nassar,1958.0,2,222
3285,nm0080238,Tanikella Bharani,1954.0,2,189
11841,nm0707399,Rajendra Prasad,1956.0,2,181
4395,nm0158112,Chiranjeevi,1955.0,2,178
16394,nm1129966,Goundamani,,2,177
12629,nm0766470,Sathyaraj,1954.0,2,173
8619,nm0471447,Ramya Krishnan,1970.0,2,163


In [None]:
# Twenty actors with the highest `freq` among those assigned to cluster 3
clusters_in.loc[clusters_in['cluster'].eq(3)].sort_values('freq', ascending=False).head(20)

Unnamed: 0,index,primaryName,birthYear,cluster,freq
1998,nm0006369,Ashok Kumar,1911.0,3,278
7192,nm0374974,Helen,1938.0,3,271
1634,nm0004429,Dharmendra,1935.0,3,270
11634,nm0695199,Pran Sikand,1920.0,3,258
2775,nm0045119,Aruna Irani,1946.0,3,248
7801,nm0420092,Jeevan,1915.0,3,227
3272,nm0080173,Master Bhagwan,1913.0,3,226
11335,nm0667985,Lalita Pawar,1916.0,3,214
12326,nm0747131,Nirupa Roy,1931.0,3,211
4416,nm0159159,Prem Chopra,1935.0,3,198


In [None]:
def connectedness(clusters, pairs, cat):
    '''
    Measures how strongly each actor's pairings stay inside their own cluster.

    Reads the notebook-level `name_freq` for the per-actor film counts.

    Parameters:
    - clusters: DataFrame with a `cluster` column, one row per actor, in the
      same positional order as `cat`.
    - pairs: edge list with columns `row`, `col` (positions into `cat`) and
      `n` (shared films). It is not modified; cluster labels are attached to
      an internal copy only.
    - cat: DataFrame indexed by actor id with a `primaryName` column.

    Returns (stats, coclusters):
    - stats: one row per actor that occurs in `pairs.row`, indexed by actor
      id and sorted by `incluster` ascending, with the columns primaryName,
      cluster, nclusters, titles, pairings and incluster.
    - coclusters: actors x clusters table holding the summed `n` of each
      actor's pairings with members of each cluster (NaN where there is none).
    '''
    columns = ['primaryName', 'cluster', 'nclusters', 'titles', 'pairings', 'incluster']

    # Attach both endpoints' cluster labels by position. assign() returns a
    # new frame, so the caller's `pairs` keeps its original columns.
    cluster_of = clusters['cluster'].to_numpy()
    pairs = pairs.assign(
        rowcluster=cluster_of[pairs['row'].to_numpy()],
        colcluster=cluster_of[pairs['col'].to_numpy()],
    )

    stats, coclusters = {}, {}
    for position, costars in pairs.groupby('row'):
        actor_id = cat.index[position]
        # Summed pairing counts with each cluster this actor has worked with
        clusterdist = costars.groupby('colcluster')['n'].sum()
        coclusters[actor_id] = clusterdist
        selfcluster = costars['rowcluster'].iloc[0]
        total = clusterdist.sum()
        stats[actor_id] = {
            'primaryName': cat['primaryName'].iloc[position],
            # Which cluster do they belong to
            'cluster': selfcluster,
            # No of clusters with which their pairing counts sum to 5 or more
            'nclusters': (clusterdist >= 5).sum(),
            # No of films they've acted in (from name_freq)
            'titles': name_freq[actor_id],
            # Total pairing count with all other stars
            'pairings': total,
            # Fraction (0-1) of the pairing count that is within own cluster
            'incluster': clusterdist.get(selfcluster, 0) / total,
        }
    coclusters = pd.DataFrame(coclusters).T
    # Building from records keeps numeric dtypes and still yields the
    # expected columns when there are no pairs at all.
    stats = pd.DataFrame(list(stats.values()), index=list(stats.keys()), columns=columns)
    return stats.sort_values('incluster'), coclusters

In [None]:
# Per-actor cluster statistics and the actor x cluster pairing table for the
# 'IN' network
connected_in, coclusters_in = connectedness(clusters=clusters_in, pairs=pairs_in, cat=cat_in)

In [None]:
# Who are the big crossover actors in IN?
# Actors with more than 50 titles, lowest `incluster` share first, i.e. those
# whose pairings fall mostly outside their own cluster.
connected_in.loc[connected_in['titles'] > 50].sort_values('incluster').head(20)

Unnamed: 0,primaryName,cluster,nclusters,titles,pairings,incluster
nm0534863,Madhavi,6,6,126,492,0.180894
nm0838517,Sumalatha,5,5,61,245,0.318367
nm0003110,Cesar Romero,32,0,75,3,0.333333
nm1383984,Rachana Banerjee,12,5,89,297,0.380471
nm0761100,Fernando Sancho,0,1,79,13,0.384615
nm0439784,Girish Karnad,1,7,76,259,0.389961
nm0707425,Rajinikanth,8,6,161,597,0.390285
nm0482285,Lakshmi,8,6,226,959,0.395203
nm0906226,Zarina Wahab,1,5,62,248,0.403226
nm0419666,Jayamalini,8,6,105,510,0.405882
