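We create a graph of co-op employers. It is stored as an undirected `networkx.Graph`, but each edge keeps one counter per endpoint, so it still records direction: the counter for an endpoint counts how many times that employer beat the other one, i.e. was named the favourite by someone who also worked at the other.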

In [2]:
from IPython.display import display

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import json
from collections import defaultdict
from copy import deepcopy

import networkx as nx

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Use matplotlib's bundled 'ggplot' style sheet for every figure.
plt.style.use('ggplot')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size and legend font size, applied through rcParams.
from pylab import rcParams
params = {
    'figure.figsize': (8, 8),
    'legend.fontsize': 15
}
rcParams.update(params)

In [3]:
# Location of the jobs CSV loaded into `df` below.
JOBS_FILENAME = '../private/jobs-post-mapping.csv'

# Suffixes of the `coop_name_<n>` columns read from each row ('1' .. '6').
COOP = list(map(str, range(1, 7)))

df = pd.read_csv(JOBS_FILENAME)

# Styling keyword arguments intended for nx.draw_networkx.
DRAW_PROPERTIES = dict(
    node_color='#333333',
    font_size=8,
    node_size=200,
    edge_color='#666666',
    font_color='#ffffff',
)


In [4]:
def get_score(g, name):
    """Return the head-to-head win share of `name` against each neighbour.

    Every edge ``g[name][other]`` is expected to hold up to two counters,
    keyed by the endpoint names: ``edge[name]`` is how often `name` beat
    `other`, and ``edge[other]`` is the reverse. A missing counter counts
    as 0.

    Parameters
    ----------
    g : graph exposing ``neighbors(name)`` and ``g[name][other]`` (a mapping)
    name : node identifier; may be NaN for a missing employer

    Returns
    -------
    numpy.ndarray
        One value per neighbour, ``wins / (wins + losses)``. The array is
        empty when `name` is NaN or has no neighbours. An edge with no
        recorded outcome contributes NaN, so ``np.nanmean`` skips it.
    """
    if isnan(name):
        # Always hand back an array: callers inspect `.size`, which a bare
        # scalar 0 does not provide.
        return np.array([])

    scores = []
    for other in g.neighbors(name):
        edge = g[name][other]
        wins = edge.get(name, 0)
        total = wins + edge.get(other, 0)
        # Guard against an edge that carries no counters at all instead of
        # raising ZeroDivisionError.
        scores.append(wins * 1.0 / total if total else float('nan'))

    return np.array(scores)

In [5]:
def isnan(a):
    """Report whether `a` is NaN by testing that it is unequal to itself.

    Unlike ``math.isnan`` this accepts any object: strings, None and
    ordinary numbers compare equal to themselves and so yield False.
    """
    differs_from_itself = a != a
    return differs_from_itself

In [9]:
# Graph of co-op employers.
#   node attrs: 'count'     - rows (people) that list the employer in any term
#               'fav_count' - rows that name the employer as fav_coop
#   edge attrs: one counter per endpoint, keyed by the endpoint's own name,
#               counting how often that endpoint was the favourite over the
#               other endpoint.
g = nx.Graph()

# Pass 1: register every employer that appears in a term column or as a
# favourite, so the node attribute dicts exist before counting starts.
for index, row in df.iterrows():
    for c in COOP:
        v = row['coop_name_' + c]
        if isnan(v):
            continue
        g.add_node(v)
    if not isnan(row['fav_coop']):
        g.add_node(row['fav_coop'])

# Pass 2: accumulate node counters and head-to-head edge counters.
for index, row in df.iterrows():
    fav_coop = row['fav_coop']
    has_fav = not isnan(fav_coop)
    if has_fav:
        fav_attrs = g.nodes[fav_coop]
        fav_attrs['fav_count'] = fav_attrs.get('fav_count', 0) + 1

    worked_for = set()
    for c in COOP:
        coop = row['coop_name_' + c]
        if isnan(coop):
            # Ignore any coop terms missed
            continue
        if coop not in worked_for:
            # 'count' is bumped once per row, however many terms were spent
            # at the same employer.
            worked_for.add(coop)
            coop_attrs = g.nodes[coop]
            coop_attrs['count'] = coop_attrs.get('count', 0) + 1

        if not has_fav or coop == fav_coop:
            continue
        # The favourite "beats" this employer. Note that this is recorded
        # once per term column, so repeated terms at `coop` add repeated wins.
        if not g.has_edge(coop, fav_coop):
            g.add_edge(coop, fav_coop)
        wins = g[coop][fav_coop]
        wins[fav_coop] = wins.get(fav_coop, 0) + 1

# One row per employer:
#   [mean head-to-head score, fav_count / count, name, fav_count, count]
arr = []
for c1 in g.nodes:
    # Skip a NaN node if one ever slips in. The previous positional skip of
    # the first node dropped a real employer, because NaN values are
    # filtered out before any node is added.
    if isnan(c1):
        continue

    scores = get_score(g, c1)
    fav_count = g.nodes[c1].get('fav_count', 0)
    # An employer that only ever appears as a favourite has no 'count';
    # default to 1 to avoid dividing by zero.
    count = g.nodes[c1].get('count', 1)
    mean = np.nanmean(scores) if scores.size > 0 else 0
    arr.append([mean, fav_count * 1.0 / count, c1, fav_count, count])


Finding the most popular companies, ranked by the ratio of the number of people who named the company as their favourite to the number of people who worked there.

In [10]:
# Employers whose favourite-to-worked ratio (index 1 of each `arr` row) is at
# least 0.5, ordered from the highest ratio down.
# sorted() over a generator is used instead of filter(...).sort(...): on
# Python 3 filter() returns a lazy iterator that has no .sort() method.
# No copy of `arr` is needed because nothing here mutates its rows.
popular = sorted(
    (entry for entry in arr if entry[1] >= 0.5),
    key=lambda x: x[1],
    reverse=True,
)

In [11]:
# Employers that at least 4 people worked for (index 4 of each `arr` row is
# the worked-at count), ordered from the highest favourite-to-worked ratio
# (index 1) down. The employer name is at index 2 of each row.
# sorted() over a generator is used instead of filter(...).sort(...): on
# Python 3 filter() returns a lazy iterator that has no .sort() method.
# No copy of `arr` is needed because nothing here mutates its rows.
big = sorted(
    (entry for entry in arr if entry[4] >= 4),
    key=lambda x: x[1],
    reverse=True,
)

In [12]:
# Graph rendering is currently disabled. Uncommenting the three lines below
# would draw `g` using the DRAW_PROPERTIES styling and hide the axes.
# nx.draw_networkx(g, **DRAW_PROPERTIES)
# plt.axis('off')
# plt.show()
# No-op so the cell still contains a statement.
pass