We create a directed graph. Each edge counts how many times the source beat the target.

In [36]:
from IPython.display import display

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import json
from collections import defaultdict
from copy import deepcopy

import networkx as nx

# Do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# Apply matplotlib's 'ggplot' style sheet to every figure
plt.style.use('ggplot')
# Show matplotlib plots in this notebook
%matplotlib inline
# Setting plot parameters
from pylab import rcParams
# Overrides merged into matplotlib's rcParams below
params = {
    'figure.figsize': (8, 8),
    'legend.fontsize': 15
}
rcParams.update(params)

In [37]:
# Input CSV and the directory prefix that write_dict prepends to output names.
JOBS_FILENAME = '../private/jobs-post-mapping.csv'
WRITE_PATH = '../private/web/'

# Suffixes of the coop_name_<n> columns read from the CSV.
COOP = ['1', '2', '3', '4', '5', '6']

# Keyword arguments for the (currently disabled) nx.draw_networkx call.
DRAW_PROPERTIES = dict(
    node_color='#333333',
    font_size=8,
    node_size=200,
    edge_color='#666666',
    font_color='#ffffff',
)

df = pd.read_csv(JOBS_FILENAME)

In [45]:
def write_dict(filename, d):
    """Serialise ``d`` as indented JSON into ``WRITE_PATH + filename``.

    The target file is opened for writing (and therefore truncated) before
    the payload is encoded, and it is overwritten if it already exists.
    """
    target = WRITE_PATH + filename
    with open(target, 'w') as handle:
        # One space of indentation per nesting level.
        payload = json.dumps(d, indent=1)
        handle.write(payload)
In [46]:
def get_score(g, name):
    """Return the win ratio of ``name`` against each of its neighbours.

    The edge data ``g[name][i]`` is read as a mapping from a node name to the
    number of times that node was preferred over the other endpoint.  For a
    neighbour ``i`` the ratio is ``wins(name) / (wins(name) + wins(i))``.

    Parameters
    ----------
    g : object providing ``neighbors(name)`` and ``g[name][i]`` edge data
        (the notebook passes a networkx graph).
    name : node to score; may be NaN.

    Returns
    -------
    numpy.ndarray
        One ratio per neighbour that has at least one recorded comparison.
        The array is empty when ``name`` is NaN or has no such neighbour, so
        callers can always inspect ``.size``.
    """
    if isnan(name):
        # Same type as the regular path; the old ``return 0`` broke ``.size``.
        return np.array([])

    ratios = []
    for i in g.neighbors(name):
        edge = g[name][i]
        wins = edge.get(name, 0)
        total = wins + edge.get(i, 0)
        if total == 0:
            # No comparison recorded on this edge: nothing to divide by.
            continue
        ratios.append(wins * 1.0 / total)

    return np.array(ratios)

In [47]:
def isnan(a):
    """Return the result of ``a != a``, which is True for a float NaN.

    The self-inequality test is used so that non-numeric values such as
    strings can be passed in as well; for those it evaluates to False.
    """
    return a != a

In [48]:
# First pass over the rows: register every non-missing name found in the
# coop_name_<n> columns or in fav_coop as a node of an undirected graph.
g = nx.Graph()

name_columns = ['coop_name_' + term for term in COOP] + ['fav_coop']
for _, record in df.iterrows():
    for column in name_columns:
        employer = record[column]
        if not isnan(employer):
            g.add_node(employer)
    
# Second pass over the rows: fill in the node and edge tallies.
#   node 'fav_count' - rows naming the node in fav_coop
#   node 'count'     - rows listing the node in at least one coop_name_<n>
#   edge[fav]        - times `fav` was the favourite of a row that also lists
#                      the other endpoint (once per listed term)
for _, record in df.iterrows():
    favourite = record['fav_coop']
    has_favourite = not isnan(favourite)
    if has_favourite:
        fav_attrs = g.nodes[favourite]
        fav_attrs['fav_count'] = fav_attrs.get('fav_count', 0) + 1

    seen = []
    for term in COOP:
        employer = record['coop_name_' + term]
        if isnan(employer):
            # Ignore any coop terms missed
            continue
        if employer not in seen:
            # Count each employer once per row, however many terms it spans.
            seen.append(employer)
            attrs = g.nodes[employer]
            attrs['count'] = attrs.get('count', 0) + 1

        if not has_favourite or employer == favourite:
            continue
        if not g.has_edge(employer, favourite):
            g.add_edge(employer, favourite)
        tally = g[employer][favourite]
        tally[favourite] = tally.get(favourite, 0) + 1

# One entry per company:
#   [mean win ratio, fav_count / count, name, fav_count, count]
arr = []
for c1 in g.nodes:
    if isnan(c1):
        # Skip a NaN node by value. The previous code dropped whatever node
        # happened to be first, which discards a real company whenever the
        # first node is not NaN.
        continue

    scores = get_score(g, c1)
    fav_count = g.nodes[c1].get('fav_count', 0)
    # Default of 1 avoids dividing by zero for nodes that only appear as a
    # favourite and never in a coop_name_<n> column.
    count = g.nodes[c1].get('count', 1)
    mean = np.nanmean(scores) if scores.size > 0 else 0
    arr.append([mean, fav_count * 1.0 / count, c1, fav_count, count])


Finding the most popular companies, ranked by the ratio of how often a company was picked as the favourite to how many people worked there.

In [54]:
# Rank every company by its favourite ratio (column 1 of arr), highest first.
# sorted() builds a new list, so arr itself keeps its original order.
ranked = sorted(arr, key=lambda entry: entry[1], reverse=True)
# Keep only [name, favourite ratio, worked-at count] for the export.
popular = [[entry[2], entry[1], entry[-1]] for entry in ranked]
for i in popular:
    # Parenthesised single argument: same output on Python 2, valid on Python 3.
    print(i)
write_dict('normalized_fav_companies.json', {
    "data": popular,
    "metadata": {
    }
})

['Grand Rounds', 1.0, 1]
['Embark', 1.0, 2]
['Redfin', 1.0, 2]
['Snowflake Computing', 1.0, 1]
['Veyo', 1.0, 1]
['Groupon', 1.0, 1]
['Big Viking Games', 1.0, 1]
['Maisha Meds', 1.0, 1]
['Zenreach', 1.0, 1]
['Figma', 1.0, 1]
['Universe', 1.0, 1]
['Flexport', 1.0, 1]
['Drive.ai', 1.0, 2]
['Pebble', 1.0, 1]
['Wechat', 1.0, 1]
['Public safety', 1.0, 1]
['startup', 1.0, 1]
['Tableau', 1.0, 1]
['none', 1.0, 1]
['EA Games', 1.0, 1]
['Inflight Corp', 1.0, 1]
['Coherent Path', 1.0, 1]
['League Inc.', 1.0, 1]
['A Thinking Ape', 1.0, 1]
['Riot Games', 1.0, 1]
['Wealthsimple', 0.75, 4]
['Gametime', 0.75, 4]
['Square', 0.7, 10]
['Facebook', 0.5161290322580645, 31]
['Symphony Commerce', 0.5, 2]
['Eventbrite', 0.5, 2]
['Slack', 0.5, 2]
['Sumo Logic', 0.5, 2]
['Jane Street', 0.5, 2]
['Meraki', 0.5, 2]
['TextNow', 0.5, 2]
['NVIDIA', 0.5, 2]
['Lumotune', 0.5, 2]
['Voicebox', 0.5, 2]
['Cisco Meraki', 0.5, 2]
['Shopify', 0.5, 2]
['Zynga', 0.5, 2]
['PlaceIQ', 0.5, 2]
['Toyota', 0.5, 2]
['Uber', 0.5, 12]
['

In [50]:
# Companies whose worked-at count (column 4 of arr) is at least 4, ordered by
# favourite ratio (column 1), highest first. Built with sorted() so the result
# is a real list on both Python 2 and Python 3 (filter() returns an iterator
# without .sort on Python 3); arr is not modified.
big = sorted(
    (entry for entry in arr if entry[4] >= 4),
    key=lambda entry: entry[1],
    reverse=True
)

In [51]:
# Disabled: drawing of the whole graph g using the DRAW_PROPERTIES styling.
# Only the `pass` below executes, so this cell currently does nothing.
# nx.draw_networkx(g, **DRAW_PROPERTIES)
# plt.axis('off')
# plt.show()
pass

Trying out a new method of ranking the companies

In [52]:
# Per-term rule: (points added to the running score, divisor applied
# afterwards or None). Terms 4-6 only add points; terms 1-3 subtract points
# and then shrink whatever has been accumulated so far.
TERM_RULES = {
    '6': (30, None),
    '5': (25, None),
    '4': (20, None),
    '3': (-4, 1.5),
    '2': (-8, 1.5),
    '1': (-12, 1.5),
}

scores = defaultdict(int)
count = defaultdict(int)
# Terms are processed from last to first. The order matters because the
# divisions for terms 3-1 act on the total accumulated up to that point.
for c in reversed(COOP):
    points, shrink = TERM_RULES.get(c, (0, None))
    for index, row in df.iterrows():
        name = row['coop_name_' + c]
        if isnan(name):
            # Missing term: skip it, as the graph-building cell does, so NaN
            # is not counted and scored as if it were a company.
            continue
        count[name] += 1
        scores[name] += points
        if shrink is not None:
            scores[name] /= shrink

# Normalise by the number of terms worked at the company, rounded down to an
# even number, plus one. `//` keeps the integer division that `/` performed on
# Python 2 ints.
for i in count:
    divider = count[i] // 2 * 2
    scores[i] /= (divider + 1) * 1.0

# Highest score first; ties are ordered by name, descending.
for key, value in reversed(sorted(scores.items(), key=lambda item: (item[1], item[0]))):
    print('%s %s' % (key, value))

a startup 30.0
Universe 30.0
Stripe 30.0
OMF International 30.0
League Inc. 30.0
Inflight Corp 30.0
Home 30.0
Forensiq 30.0
EA Games 30.0
DeepSubconscious.ai 30.0
Citadel 30.0
Chan Zuckerberg Initiative 30.0
30 30.0
Embark 28.3333333333
Amazon 27.3333333333
Cisco Meraki 26.6666666667
A9 26.6666666667
Wealthsimple 26.0
Snap 26.0
Yext 25.0
XO Group Inc. 25.0
Wexin 25.0
Webotics 25.0
The Meet Group 25.0
Tableau 25.0
Solink Corp. 25.0
Snowflake Computing 25.0
Salesforce 25.0
Reflektion Inc 25.0
Quora 25.0
Nerdwallet 25.0
MosaixSoft Inc 25.0
Microsoft Canada 25.0
Memebox 25.0
Maisha Meds 25.0
Leanplum Inc. 25.0
Helpful 25.0
FutureAdvisor 25.0
Flybits 25.0
Evernote 25.0
Coherent Path 25.0
Broadway Technology 25.0
BitTitan 25.0
AlertDriving 25.0
AdRoll 25.0
A Thinking Ape 25.0
Square 23.4615384615
Noom 23.3333333333
NVIDIA 23.3333333333
Jane Street 23.3333333333
Nvidia 22.7272727273
Yahoo! 20.5555555556
yuja 20.0
theScore Inc 20.0
Zenreach 20.0
Zazzle 20.0
Veyo 20.0
Sumo Logic 20.0
Springboar

In [53]:
# Export how many terms were worked at each company. NaN keys (missing terms)
# are dropped first: json would write each of them under the same "NaN"
# member name, producing duplicate keys in the output object.
write_dict('company_work_count.json', {
    "data": {name: total for name, total in count.items() if not isnan(name)},
    "metadata": {}
})