### DATA 620
#### Project 2   
### [Video Presentation]()
##### Group Four
- Santosh Cheruku
- Vinicio Haro
- Javern Wilson
- Saayed Alam  

In [173]:
# load libraries
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import math
from networkx.algorithms import bipartite as bi
from collections import Counter
% matplotlib inline

In this assignment, we are asked to identify a large 2-node network dataset. The data meets the criteria that it consists of ties between and not within two (or more) distinct groups.

In [174]:
appear = pd.read_csv("https://raw.githubusercontent.com/saayedalam/Data/master/out.actor-movie",
                  header = None, skiprows = 1, sep = " ").rename(columns = {0: "movie", 1: "actor", 2: "appearance"})

Since the dataset has more than a million edges, we will first sample only 2% of the dataset. We also built a weight variable which is the number of appearances an actor has made in movies. The more appearances an actor has, the more experiences he is. 

In [162]:
appear['appearance'] = appear.groupby(['actor',]).actor.transform('count')
appear = appear.sample(frac = 0.2, random_state = 1)
appear.head()

Unnamed: 0,movie,actor,appearance
669792,58971,99903,5
408148,38329,55750,6
485488,44144,15249,71
697835,61268,86461,28
181724,20856,1940,28


We reduce the size of the network by selecting highest number of edges between nodes i.e. number of appearances by an actor. 

In [163]:
B = nx.Graph()

# Add nodes with the node attribute "bipartite"
B.add_nodes_from(list(appear.movie.values), bipartite=0)
B.add_nodes_from(list(appear.actor.values), bipartite=1)

# Add edges only between nodes of opposite node sets
subset = appear[['movie', 'actor', 'appearance']]
edges_tuples = [tuple(x) for x in subset.values]
B.add_weighted_edges_from(edges_tuples)

#nx.is_bipartite(B)

In [164]:
print(nx.info(B))

Name: 
Type: Graph
Number of nodes: 183099
Number of edges: 294076
Average degree:   3.2122


In [165]:
nx.is_connected(B)

False

In [166]:
nx.is_directed(B)

False

In [167]:
top_nodes = {n for n, d in B.nodes(data=True) if d['bipartite']==0}
bottom_nodes = set(B) - top_nodes

In [169]:
#####
## islands method
# taken from SNA chp 4 pg 64.  Modified appropriately.
########

def trim_edges(g, weight = 1):
    edges = []
    g2 = nx.Graph()
    for f, to, edata in g.edges(data=True):
        if edata['weight']> weight:
            x = float(edata['weight'])
            edges.append((f,to,x))
    g2.add_weighted_edges_from(edges)
    return g2


def island_method(g, iterations=5):
    weights = [edata['weight'] for f, to, edata in g.edges(data=True)]
    
    #size of step
    mn=int(min(weights))
    mx=int(max(weights))
    step = int((mx-mn)/iterations)
    
    return [[threshold, trim_edges(g, threshold)] for threshold in range(mn, mx, step)]

In [170]:
top_nodes_nx = nx.bipartite.weighted_projected_graph(B, top_nodes)
#cc = list(nx.connected_component_subgraphs(top_nodes_nx))[0]

In [171]:
island_method(B)

[[1, <networkx.classes.graph.Graph at 0x2090b9409e8>],
 [130, <networkx.classes.graph.Graph at 0x20906351d68>],
 [259, <networkx.classes.graph.Graph at 0x20906351320>],
 [388, <networkx.classes.graph.Graph at 0x2091389b860>],
 [517, <networkx.classes.graph.Graph at 0x2090b940630>]]

In [172]:
values = [['Threshold','Number of People']]
islands = island_method(B, iterations = 5)
for x in islands:
    threshold =x[0]
    num_people = len(x[1])
    #connections = len(list(nx.connected_component_subgraphs(x[1])))
    temp_list = [threshold, num_people]
    values.append(temp_list)
    
headers = values.pop(0)
islan_df = pd.DataFrame(values,columns = headers)
islan_df

Unnamed: 0,Threshold,Number of People
0,1,138870
1,130,10929
2,259,1424
3,388,607
4,517,128


In [82]:
[edata['weight'] for f, to, edata in B.edges(data=True)]

[1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0

In [52]:
a = [edata['weight'] for f, to, edata in B.edges(data=True)]

In [54]:
sum(a)

29408

In [818]:
len(list(nx.connected_component_subgraphs(B)))

3

What can you infer about each of the distinct groups?

In [None]:
W = bi.projected_graph(B, top_nodes)

nx.draw(W)