In [1]:
# These gzip files have trailing garbage.
# Python's gzip module does not read GZIP files with trailing garbage.
# Let's create an equivalent of pandas.read_csv() that works around it.
# See https://stackoverflow.com/a/54608126/100904
import zlib
import io
import pandas as pd

def read_csv(path, **kwargs):
    """Read a gzip-compressed delimited file into pandas.

    The whole file is read into memory, inflated in a single
    ``zlib.decompress`` call, and the resulting bytes are handed to
    ``pd.read_csv`` through an in-memory buffer. This is the workaround for
    archives that carry extra bytes after the gzip stream.

    Parameters
    ----------
    path : path of the ``.gz`` file to open.
    **kwargs : forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    Whatever ``pd.read_csv`` returns for the given keyword arguments.
    """
    # OR-ing 16 into the window bits makes zlib expect a gzip header/trailer.
    gzip_wbits = 16 | zlib.MAX_WBITS
    with open(path, mode='rb') as compressed_file:
        payload = zlib.decompress(compressed_file.read(), gzip_wbits)
    return pd.read_csv(io.BytesIO(payload), **kwargs)

In [3]:
# Load the title table, indexed by title id. '\N' marks a missing value in the
# file, and startYear uses the nullable Int64 dtype so missing years survive.
movies = read_csv('title.basics.tsv.gz', sep='\t', na_values='\\N', dtype={
    'tconst': 'str',
    'titleType': 'str',
    'primaryTitle': 'str',
    'startYear': 'Int64',
    'genres': 'str'
}).set_index('tconst')

# Keep titles of type 'movie' that started in 2005 or later and whose genre
# string mentions Drama.
# NOTE(review): nothing in this predicate restricts titles by country or
# language, so the "bollywood" in the variable name is not enforced here -
# confirm whether a region filter is missing.
bollywood_movies = movies[
    (movies['titleType'] == 'movie')
    & (movies['startYear'] >= 2005)
    # na=False: a title with no genres counts as "not Drama" explicitly,
    # instead of leaving a missing value in the boolean mask.
    & movies['genres'].str.contains('Drama', na=False)
]


  return pd.read_csv(stream, **kwargs)


In [None]:
from itertools import combinations

import networkx as nx

# Build an undirected co-star graph: two people are linked when they are both
# credited on the same title.
# NOTE(review): this cell reads `bollywood_cast`, which this file only assigns
# in cells further down - it must be run after one of them.
G = nx.Graph()
for _, credited in bollywood_cast.groupby('tconst')['nconst']:
    # nx.Graph is undirected, so each unordered pair needs adding only once.
    # unique() keeps first-appearance order and removes repeated credits, so no
    # self-loops are produced; dropna() keeps missing ids out of the graph.
    G.add_edges_from(combinations(credited.dropna().unique(), 2))


In [4]:

# Load the credits table. Only the three columns used by the cells in this
# notebook are parsed, which keeps the resulting frame much smaller.
cast = read_csv('title.principals.tsv.gz', sep='\t', na_values='\\N',
                usecols=['tconst', 'nconst', 'category'],
                dtype={
                    'tconst': 'str',
                    'nconst': 'str',
                    'category': 'str'
                })

# Keep acting credits (actor / actress) that belong to one of the selected
# titles; bollywood_movies is expected to be indexed by tconst.
bollywood_cast = cast[
    cast['category'].isin(['actor', 'actress'])
    & cast['tconst'].isin(bollywood_movies.index)
]


: 

In [2]:
# Load only the columns the later cells use, with explicit dtypes so nothing is
# left to type inference. '\N' marks a missing value in both files.
# movies is indexed by tconst because the cast filter matches credits against
# `bollywood_movies.index`; with a positional index nothing would match.
movies = read_csv(
    'title.basics.tsv.gz',
    sep='\t',
    na_values='\\N',
    usecols=['tconst', 'titleType', 'primaryTitle', 'startYear', 'genres'],
    dtype={
        'tconst': 'str',
        'titleType': 'str',
        'primaryTitle': 'str',
        'startYear': 'Int64',  # nullable integer: missing years stay missing
        'genres': 'str',
    },
).set_index('tconst')
cast = read_csv(
    'title.principals.tsv.gz',
    sep='\t',
    na_values='\\N',
    usecols=['tconst', 'nconst', 'category'],
    dtype={'tconst': 'str', 'nconst': 'str', 'category': 'str'},
)


In [7]:
# Parse the title table 10,000 rows at a time and keep only the matching rows
# of each chunk, so the full parsed table is never held as one DataFrame.
# dtypes are pinned because inference would otherwise run per chunk: a chunk
# whose genres are all missing would not be parsed as strings and `.str` would
# fail on it.
movie_chunks = read_csv(
    'title.basics.tsv.gz',
    sep='\t',
    na_values='\\N',
    usecols=['tconst', 'titleType', 'primaryTitle', 'startYear', 'genres'],
    dtype={
        'tconst': 'str',
        'titleType': 'str',
        'primaryTitle': 'str',
        'startYear': 'Int64',
        'genres': 'str',
    },
    iterator=True,
    chunksize=10000,
)
movies_new = pd.concat([
    chunk[
        (chunk['titleType'] == 'movie')
        & (chunk['startYear'] >= 2005)
        # na=False: titles with no genres are excluded explicitly.
        & chunk['genres'].str.contains('Drama', na=False)
    ]
    for chunk in movie_chunks
])


In [11]:
# Acting credits (actor / actress) whose title id appears in the index of the
# selected movies.
bollywood_cast = cast.loc[
    cast['tconst'].isin(bollywood_movies.index)
    & cast['category'].isin(['actor', 'actress'])
]


In [9]:
# Titles of type 'movie' that started in 2005 or later and whose genre string
# mentions Drama.
bollywood_movies = movies[
    (movies['titleType'] == 'movie')
    & (movies['startYear'] >= 2005)
    # na=False: a title with no genres counts as "not Drama" explicitly,
    # instead of leaving a missing value in the boolean mask.
    & movies['genres'].str.contains('Drama', na=False)
]


In [12]:
from itertools import combinations

import networkx as nx

# Build an undirected co-star graph: two people are linked when they are both
# credited on the same title.
G = nx.Graph()
for _, credited in bollywood_cast.groupby('tconst')['nconst']:
    # nx.Graph is undirected, so each unordered pair needs adding only once.
    # unique() keeps first-appearance order and removes repeated credits, so no
    # self-loops are produced; dropna() keeps missing ids out of the graph.
    G.add_edges_from(combinations(credited.dropna().unique(), 2))


In [13]:
# Export the graph as a two-column edge list: a header row followed by one
# "source,target" line per edge of G.
with open('bollywood_network.csv', 'w') as edge_file:
    edge_file.write('Source,Target\n')
    edge_file.writelines(
        f'{source},{target}\n' for source, target in G.edges()
    )