In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
from statistics import mean
import json
import pickle

In [2]:
metadata_df = pd.read_csv("data/network_metadata.tab", sep="\t")
metadata_df.loc[:, "IMDB_id"] = metadata_df["IMDB_id"].str[2:].astype('int')
# metadata_df.head()

In [3]:
imdb_data_df = pd.read_json("data/gexf_imdb_metadata.json", orient='index')
# imdb_data_df

In [4]:
df = pd.merge(metadata_df, imdb_data_df, left_on="IMDB_id", right_on="movie_id")
df.loc[:,'directors-writers'] = df.loc[:,'directors'] + df.loc[:,'writers']
for i in range(len(df)):
    df.loc[:,'directors-writers'][i] = list(set(df.loc[:,'directors-writers'][i]))
# df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [5]:
# A graph where nodes are movie titles and are conneted if they share a director or a writer.
Movies_Graph = nx.Graph()
titles = df.loc[:,'Title']
for title in titles:
    Movies_Graph.add_node(title)

    
# Add node if two movies share a director or a writer
# Edge weight = number of directors/writers in common 
for i in range(len(df)):
    for j in range(i+1, len(df)):
        title_i = titles[i]
        title_j = titles[j]
        dirwri_i = df.loc[i,'directors-writers']
        dirwri_j = df.loc[j,'directors-writers']
        intersection = set.intersection(set(dirwri_i), set(dirwri_j))
        if intersection:
            Movies_Graph.add_edge(title_i, title_j, weight=len(intersection))

In [52]:
'''
# Map node name to node_name, for saving
mapping = {}
for node in Movies_Graph:
    mapping[node] = str(node).replace(" ", "_")


H=nx.relabel_nodes(Movies_Graph, mapping)
nx.write_adjlist(H, "data/movies_graph.adjlist")
'''

In [53]:
'''
# make subset of G that is ~50 nodes
Movies_Graph_subset = copy.deepcopy(Movies_Graph)
i=0
for node in Movies_Graph:
    if Movies_Graph.degree(node) < 12:
        Movies_Graph_subset.remove_node(node)

# print(len(Movies_Graph))
# print(len(Movies_Graph_subset))
'''

In [147]:
## A graph where nodes are directors or writers and are connected if they have collaborated on a movie

# Get all unique directors and writers
writer_movies_map = {}
Writers_Graph = nx.Graph()
Directors_writers_unique = set()
for dw_list in df.loc[:,'directors-writers']:
    for dw in dw_list:
        Directors_writers_unique.add(dw)
        
# Add all writer nodes and make dictionary for every movie worked on by said writer
for dw in Directors_writers_unique:
    writer_movies_map[dw] = []
    Writers_Graph.add_node(dw)
    for i in range(len(df)):
        if dw in df.loc[i,'directors-writers']:
            writer_movies_map.get(dw).append(df.loc[i,'Title'])

In [163]:
director_movies_map = {}
directors_unique = set()
for d_list in df.loc[:,'directors']:
    for d in d_list:
        directors_unique.add(d)
        
for d in directors_unique:
    director_movies_map[d] = []
    for i in range(len(df)):
        if d in df.loc[i,'directors']:
            director_movies_map.get(d).append(df.loc[i,'Title'])

In [55]:
'''
writers = list(writer_movies_map.keys())

# Add edges between writers with weight equal to the number of movies they have collaborated on
for i in range(len(writers)):
    for j in range(i+1, len(writers)):
        writer_i = writers[i]
        writer_j = writers[j]
        movies_i = writer_movies_map[writer_i]
        movies_j = writer_movies_map[writer_j]
        intersection = set.intersection(set(movies_i), set(movies_j))
        if intersection:
            Writers_Graph.add_edge(writer_i, writer_j, weight=len(intersection))
'''

In [38]:
'''
# Map node name to node_name, for saving
mapping = {}
for node in Writers_Graph:
    mapping[node] = str(node).replace(" ", "_")


H=nx.relabel_nodes(Writers_Graph, mapping)
nx.write_adjlist(H, "data/writers_graph.adjlist")
'''

In [148]:
# For a given writer, compile some statistics about the movies he/she has made
def get_writer_data(writer="Steven Spielberg"):
    director = writer
    stats = {
        "director": director,
        "number_of_movies": 0,
        "number_of_characters": [],
        "number_of_communities": [],
        "clustering": [],
        "modularity": [],
        "diameter": [],
        "density": []
    }

    movies = director_movies_map[director]
    for movie in movies:
        for i in range(len(df)):
            if df.loc[i,"Title"] == movie:
                gexf_id = df.loc[i,"GexfID"]
                index = i
                break
        G = nx.read_gexf(f"data/gexf/{gexf_id}.gexf", relabel=True)
        # find out stuff about the graph here and save it
        stats["number_of_characters"].append(df.loc[index, "Characters"])
        stats["number_of_communities"].append(len(list(nx.algorithms.community.modularity_max.greedy_modularity_communities(G))))
        stats["clustering"].append(float(df.loc[index, "ClusteringCoefficient"].replace(',','.')))
        stats["modularity"].append(float(df.loc[index, "Modularity"].replace(',','.')))
        stats["diameter"].append(float(df.loc[index, "Diameter"].replace(',','.')))
        stats["density"].append(float(df.loc[index, "Density"].replace(',','.')))
        stats["number_of_movies"] =stats["number_of_movies"] + 1
    
    return stats


In [164]:
All_Directors = {}
for director in director_movies_map.keys():
    All_Directors[director]=get_writer_data(director)
with open('data/director_summary.p', 'wb') as fp:
    pickle.dump(All_Directors, fp, protocol=pickle.HIGHEST_PROTOCOL)