In [43]:
import pandas as pd
import networkx  as  nx
import matplotlib.pyplot as plt
import operator

data = pd.read_csv('casts.csv',delimiter=';', header=None, names =['ID','film_name','actor','type','role'])
print(f'Rows: {len(data)}')

Rows: 46233


In [44]:
data.describe()

Unnamed: 0,ID,film_name,actor,type,role
count,46233,46229,46205,46198,46167
unique,8631,8632,16614,277,21028
top,MiA8,Romeo and Juliet,s a,Und,RU:
freq,46,59,1661,42556,19558


In [0]:
film_actors = {}
graph = nx.Graph()

for index, row in data.iterrows():
    graph.add_node(row['actor'])
    
    if row['film_name'] in film_actors:
        for actor in film_actors[row['film_name']]:
            graph.add_edge(actor, row['actor'])
        
        film_actors[row['film_name']].append(row['actor'])
    else:
        film_actors[row['film_name']] = [row['actor']]

## General statistics 

In [46]:
nodes = len(graph.nodes)
edges = len(graph.edges)
density = (2*edges)/(nodes*(nodes-1))

print(f'Nodes: {nodes}')
print(f'Edges: {edges}')
print(f'Density: {density}')
print(f'Number of components: {nx.number_connected_components(graph)}')

Nodes: 16615
Edges: 156042
Density: 0.0011305686849167414
Number of components: 637


## Centralities 

### Degree centrality

In [47]:
degree_cen = nx.degree_centrality(graph)
sorted_x = sorted(degree_cen.items(), key=lambda kv: kv[1], reverse= True)
for i in sorted_x[:10]:
    print(f'{i[0]} - {i[1]}')

s a - 0.19934994582881907
Humphrey Bogart - 0.025941976646201997
James Stewart - 0.022511135187191524
Gary Cooper - 0.022270374383050438
John Gielgud - 0.022270374383050438
John Carradine - 0.022089803779944624
Peter Lorre - 0.021668472372697724
C.Aubrey Smith - 0.02028409774888648
Henry Fonda - 0.019622005537498495
Burt Lancaster - 0.018779342723004695


### Eigenvector centrality

In [48]:
eigenvector_centrality = nx.eigenvector_centrality(graph)
sorted_x = sorted(eigenvector_centrality.items(), key=lambda kv: kv[1], reverse= True)
for i in sorted_x[:10]:
    print(f'{i[0]} - {i[1]}')

s a - 0.32927013296535745
C.Aubrey Smith - 0.08662877894729215
John Carradine - 0.08522052158966704
James Stewart - 0.08332664155998606
John Gielgud - 0.08103511480945194
Peter Lorre - 0.07850365590146662
Gary Cooper - 0.0774399525342619
Basil Rathbone - 0.07489727554123304
Henry Fonda - 0.07489229044218115
Humphrey Bogart - 0.07423319561996228


### Communities

In [49]:
communities = {node:cid+1 for cid,community in enumerate(nx.algorithms.community.k_clique_communities(graph,3)) for node in community}
sorted_x = sorted(communities.items(), key=lambda kv: kv[1], reverse= True)
for i in sorted_x[:10]:
    print(f'{i[0]} - {i[1]}')

John Agar - 1156
Kiki - 1156
Rosemarie Bowe - 1156
Fernando Rey - 1155
Steve Reeves - 1155
Christine Kaufman - 1155
Jason Miller - 1154
Christine Lahti - 1154
David Spielberg - 1154
Anna Magnani - 1153


## Kevin Bacon

In [30]:
not_connected = 0
path_lens = {}
for actor in data.actor.unique():
  try:
    path_len = len(nx.shortest_path(graph, actor, 'Kevin Bacon'))
  except nx.NetworkXNoPath:
    not_connected += 1
  path_lens[actor] = path_len - 1

max_number = sorted(path_lens.items(), key=lambda kv: kv[1], reverse= True)
min_number = sorted(path_lens.items(), key=lambda kv: kv[1])
  
print(f'Min. number: {min(path_lens.values())}')
print(f'Max. number: {max(path_lens.values())}')
print(f'Avg. number: {sum(path_lens.values()) / len(path_lens.values())}')
print(f'Not connected: {not_connected}')

print('\nMin:')
for actor, number  in  min_number[:5]:
  print(f'{actor} - {number}')
  
print('\nMax:')
for actor, number  in  max_number[:5]:
  print(f'{actor} - {number}')

Min. number: 0
Max. number: 6
Avg. number: 2.820403250075233
Not connected: 1553

Min:
Kevin Bacon - 0
s a - 1
Joe Pesci - 1
Meryl Streep - 1
Gary Oldman - 1

Max:
Elisa Touati - 6
Marbel Verdu - 6
Maria deMederios - 6
Robert Castle - 6
Barbara Dennek - 6


### Output - only scientist

In [0]:
data = data[data.type == 'Sci']

film_actors = {}
graph = nx.Graph()

for index, row in data.iterrows():
    graph.add_node(row['actor'])
    
    if row['film_name'] in film_actors:
        for actor in film_actors[row['film_name']]:
            graph.add_edge(actor, row['actor'])
        
        film_actors[row['film_name']].append(row['actor'])
    else:
        film_actors[row['film_name']] = [row['actor']]

In [0]:
nx.write_gexf(graph, 'output_sci_graph.gexf')