# Calculating graph data samples to be merged with baseline features

### 1. Load in Crunchbase dataframes.
### 2. Select date and create network dataframes.
### 3. Save network dataframes as CSVs, and load them back in as `turicreate` SFrames.
    Crunchbase network: files/output/network_sframes/cb/{}_df.csv
    Pledge 1% network: files/output/network_sframes/p1/{}_df.csv
    Model network: files/output/network_sframes/model/{}_df.csv
    Not Pledge 1% network: files/output/network_sframes/np1/{}_df.csv
### 4. Load SFrames into model graph and remove duplicate edges. Produce eight graphs $(2^3)$ of Crunchbase that include edges with/without weights$_1$, multiple edges$_2$, and/or two directions$_3$.
### 5. Reduce size of Crunchbase vertices by limiting degrees of freedom from Pledge 1% companies, and save the vertices list for a few different network sizes. 
    Saved to: files/output/sample_vertices/
### 6. Produce 100 samples of the Crunchbase graph vertices,and save to CSV. These will be merged with the baseline model features.
    5 degrees away from Pledge 1% companies
        Baseline: files/output/model_csvs/Model_DF_D5/B/{}.csv
        Baseline Reduced: files/output/model_csvs/Model_DF_D5/BR/{}.csv
        Graph & Baseline: files/output/model_csvs/Model_DF_D5/GB/{}.csv
        Graph & Baseline Reduced: files/output/model_csvs/Model_DF_D5/GBR/{}.csv
        Graph: files/output/model_csvs/Model_DF_D5/G/{}.csv
    4 degrees away from Pledge 1% companies
        Baseline: files/output/model_csvs/Model_DF_D4/B/{}.csv
        Baseline Reduced: files/output/model_csvs/Model_DF_D4/BR/{}.csv
        Graph & Baseline: files/output/model_csvs/Model_DF_D4/GB/{}.csv
        Graph & Baseline Reduced: files/output/model_csvs/Model_DF_D4/GBR/{}.csv
        Graph: files/output/model_csvs/Model_DF_D4/G/{}.csv


## **Model**
`p1_tag` ~ `rank` + `total_funding_usd` + `age` + `employee_count` (ordinal) + `continent` (nominal, 8 indicator columns) + `industry` (nominal, 46 indicator columns) + **ADDITIONAL GRAPH FEATURES**

In [2]:
# Add 'graph' environment to PATH
import sys
sys.path.append('/home/ski/anaconda3/envs/graph/lib/python3.8/site-packages')

# User defined functions
import base_methods
from base_methods import load_the_csvs
import graph_methods
from graph_methods import network_by_date, load_vertices, find_p1_affiliations, load_edges, make_graph
import feature_methods
from feature_methods import feature_creation, add_pagerank, add_weighted_pagerank, add_shortest_path
from feature_methods import add_weighted_shortest_path, add_kcore, add_degree, add_triangle
from feature_methods import update_pagerank_weight, update_pagerank_reset_prob, update_pagerank_prev_to_current
from feature_methods import sum_weight, make_pagerank_zero, update_l1_delta, normalize_weight, pagerank_weighted

# Import data analysis packages
import numpy as np
import pandas as pd
import csv
import warnings
import os
import time
import math
from importlib import reload
from functools import reduce
from datetime import datetime
warnings.filterwarnings('ignore')

# Graph
import networkx as nx
from pyvis.network import Network
import turicreate
from turicreate import pagerank, kcore, degree_counting, shortest_path, connected_components, triangle_counting
from turicreate import SFrame, SGraph, SArray, load_sgraph, aggregate 

def update_cb_weights(src, edge, dst):
    if src['__id'] != dst['__id']: # ignore self-links
        edge['weight'] = 0
        edge['weight_status'] = 0
        edge['weight_type'] = 0
        if edge['status'] == 'primary':
            edge['weight_status'] = 3
        if edge['status'] == 'secondary':
            edge['weight_status'] = 2
        if edge['status'] == 'tertiary':
            edge['weight_status'] = 1
        if edge['__edge_type'] == 'job':
            edge['weight_type'] = 1
        if edge['__edge_type'] == 'investment':
            edge['weight_type'] = 2
        edge['weight'] = edge['weight_status'] * edge['weight_type']
    return (src, edge, dst)
#cb = cb.triple_apply(update_cb_weights, ['weight'])

# 1. Load in Crunchbase dataframes.

In [9]:
# Store path to notebook
PWD = !pwd
PWD = PWD[0]

# Set paths to data folders
INPUT = PWD + '/files/csv/'
OUTPUT = PWD + '/files/output/'
NETWORK_SFRAMES = OUTPUT + 'network_sframes/'
CRUNCHBASE_GRAPHS = OUTPUT + 'CrunchbaseGraphs/'
SAMPLE_VERTICES = OUTPUT + 'sample_vertices/'
MODEL_CSVS = OUTPUT + 'model_csvs/'

# Make sure those folders exist already
!mkdir {INPUT}
!mkdir {OUTPUT}
!mkdir {NETWORK_SFRAMES}
!mkdir {NETWORK_SFRAMES}p1
!mkdir {NETWORK_SFRAMES}cb
!mkdir {NETWORK_SFRAMES}model
!mkdir {NETWORK_SFRAMES}np1
!mkdir {CRUNCHBASE_GRAPHS}
!mkdir {SAMPLE_VERTICES}
!mkdir {MODEL_CSVS}

# Create subfolders for final model_csvs -- COMMENT OUT WHEN COMPLETED
# neighborhoods_name = ['Model_DF_D2', 'Model_DF_D3', 'Model_DF_D4', 'Model_DF_D5', 'Model_DF_ALL']
# types = ['B', 'BR', 'G', 'GB','GBR']
# for folder in neighborhoods_name:
#     for subfolder in types:
#         os.makedirs(os.path.join(MODEL_CSVS, folder, subfolder))

# Load
df,jobs,invest,invest_prtnr = load_the_csvs(loc=OUTPUT, 
                                            data=['organizations_merged','p1_jobs',
                                                  'p1_investments','p1_investments_partner'], 
                                            verbose=True)

print('Pledge 1% UUID: {}'.format(df[df['name']=='Pledge 1%'].uuid.values[0]))

mkdir: cannot create directory ‘/home/ski/Desktop/crunchbase-p1-machine-learning/files/csv/’: File exists
mkdir: cannot create directory ‘/home/ski/Desktop/crunchbase-p1-machine-learning/files/output/’: File exists
mkdir: cannot create directory ‘/home/ski/Desktop/crunchbase-p1-machine-learning/files/output/network_sframes/’: File exists
mkdir: cannot create directory ‘/home/ski/Desktop/crunchbase-p1-machine-learning/files/output/network_sframes/p1’: File exists
mkdir: cannot create directory ‘/home/ski/Desktop/crunchbase-p1-machine-learning/files/output/network_sframes/cb’: File exists
mkdir: cannot create directory ‘/home/ski/Desktop/crunchbase-p1-machine-learning/files/output/network_sframes/model’: File exists
mkdir: cannot create directory ‘/home/ski/Desktop/crunchbase-p1-machine-learning/files/output/network_sframes/np1’: File exists
mkdir: cannot create directory ‘/home/ski/Desktop/crunchbase-p1-machine-learning/files/output/CrunchbaseGraphs/’: File exists
mkdir: cannot create d

# 2. Create mutliple merged pandaframes based on relationships using `network_by_date` function, which filters the dataframes by date to ensure the job/investment/company existed at that time.

In [4]:
date = '2020-09-08'
cb_frames,p1_frames = network_by_date(date, df, jobs, invest, invest_prtnr, model_uuids=[], skip_not_p1=True)


AS OF SEPTEMBER 08, 2020:

CaLcUlAtInG... FORMER NEW JOB
CaLcUlAtInG... PARTNER INVESTMENT JOB
CaLcUlAtInG... OTHER FIRM PARTNER JOBS & INVESTMENTS FILTER
CaLcUlAtInG... CURRENT OLD JOB FILTER
CaLcUlAtInG... EXTRA ORGANIZATION NODES

Crunchbase Neighborhood
NODES | OUTPUT FRAME 0/CB_companies (825393, 18)
NODES | OUTPUT FRAME 1/CB_investors (31499, 18)
NODES&EDGES | OUTPUT FRAME 2/CB_investments (453058, 17)
NODES&EDGES | OUTPUT FRAME 3/CB_investment_partners (89926, 18)
NODES&EDGES | OUTPUT FRAME 4/CB_jobs (395270, 12)
NODES&EDGES | OUTPUT FRAME 5/CB_jobs_former (182483, 12)
NODES&EDGES | OUTPUT FRAME 6/CB_jobs_former_new (299193, 12)
NODES&EDGES | OUTPUT FRAME 7/CB_jobs_partner (11771, 5)
NODES&EDGES | OUTPUT FRAME 8/CB_jobs_other_partners (351530, 12)
NODES&EDGES | OUTPUT FRAME 9/CB_invest_other_partners (155070, 18)
NODES&EDGES | OUTPUT FRAME 10/CB_jobs_current_old (66481, 12)
NODES | OUTPUT FRAME 11/CB_extra_org_nodes (191589, 19)

Pledge 1% Neighborhood
NODES | OUTPUT FRAME 0/P1

# 3. Write network dataframes to CSVs and load in as SFrames.

#### Save filtered dataframes as separate CSVs.

In [None]:
# Write to CSVs
for idx, frame in enumerate(cb_frames):
    print(f'CB: {idx+1} OUT OF {len(cb_frames)}')
    frame.to_csv(NETWORK_SFRAMES+'cb/{}_df.csv'.format(idx), index=False)
for idx, frame in enumerate(p1_frames):
    print(f'P1: {idx+1} OUT OF {len(cb_frames)}')
    frame.to_csv(NETWORK_SFRAMES+'p1/{}_df.csv'.format(idx), index=False)

#### FYI: START FROM HERE IF USING THE SAME DATE AS PREVIOUS RUNS.

In [5]:
# Loading SFrames
lst_of_frames = []
for val in ['cb','p1']:
    lst = []
    for idx in range(12):
        lst.append(SFrame(data=NETWORK_SFRAMES+'{}/{}_df.csv'.format(val, idx)))
    lst_of_frames.append(lst)
cb_sframes,p1_sframes = lst_of_frames

# List of Pledge 1% uuids for sampling
p1_companies_uuid = []
p1_companies_uuid.extend(list(p1_sframes[0]['uuid'].unique()))
p1_companies_uuid.extend(list(p1_sframes[1]['uuid'].unique()))
p1_companies_uuid = list(set(p1_companies_uuid))

CB: 1 OUT OF 12
CB: 2 OUT OF 12
CB: 3 OUT OF 12
CB: 4 OUT OF 12
CB: 5 OUT OF 12
CB: 6 OUT OF 12
CB: 7 OUT OF 12
CB: 8 OUT OF 12
CB: 9 OUT OF 12
CB: 10 OUT OF 12
CB: 11 OUT OF 12
CB: 12 OUT OF 12
P1: 1 OUT OF 12
P1: 2 OUT OF 12
P1: 3 OUT OF 12
P1: 4 OUT OF 12
P1: 5 OUT OF 12
P1: 6 OUT OF 12
P1: 7 OUT OF 12
P1: 8 OUT OF 12
P1: 9 OUT OF 12
P1: 10 OUT OF 12
P1: 11 OUT OF 12
P1: 12 OUT OF 12


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,float,str,str,str,str,str,float,str,str,str,str,int,str,float,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,float,str,str,str,str,str,float,str,str,str,str,int,str,float,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,float,float,float,float,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,float,float,float,float,str,int,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,float,str,float,float,str,int,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,float,str,str,str,str,str,float,str,str,str,str,int,str,float,str,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,float,str,str,str,str,str,float,str,str,str,str,int,str,float,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,float,str,str,str,str,str,float,str,str,str,str,int,str,float,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,float,float,float,float,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,float,float,float,float,str,int,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,float,float,float,float,str,int,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,float,str,str,str,str,str,float,str,str,str,str,int,str,float,str,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


# 4. Load SFrames into graph and remove duplicate edges.

#### Use functions to load in formatted SFrames into SGraph, `load_vertices`, `p1_affiliations`, and `load_edges` which feed into `make_graphs` user-defined methods. Then, remove duplicate edges.

#### Nodes: Person, Company, or Investor

    Node attributes: `__id`, `__node_type`, `name`, `p1_tag`

#### Edges: Investment, Job

    Edge attributes: `__src_id`, `__dst_id`, `__edge_type`, `status`, {`__id`}, {`investment_type`, `raised_amount_usd`, `investor_count`, `is_lead_investor`, `lead_investor_count`}, {`job_type`, `title`}

Reference: <a href='https://github.com/turi-code/how-to/blob/master/remove_duplicate_edges.py'>Remove duplicate edges from SGraph</a>

In [4]:
# Construct all 8 graph types
for weights_bool in [False, True]:
    for reverse_bool in [False, True]:
        for parallel_bool in [False, True]:
            cb = make_graph(cb_sframes, weights=weights_bool, reverse_edges=reverse_bool, remove_parallel_edges=parallel_bool)


BuIlDiNg GrApH...

Remove duplicates from Crunchbase graph

Node change: 1,290,346 --> 1,290,346
Edge change: 2,094,708 --> 1,948,405

PRIMARY Edge change: 938,254 --> 938,244
SECONDARY Edge change: 649,854 --> 572,265
TERTIARY Edge change: 506,600 --> 437,896

SAVING Cruncbase_1Way_MultiEdge: (1290346,1948405)
**************************************************

BuIlDiNg GrApH...
- REMOVING PARALLEL EDGES

Remove duplicates from Crunchbase graph

Node change: 1,290,346 --> 1,290,346
Edge change: 2,094,708 --> 981,877

PRIMARY Edge change: 938,254 --> 526,953
SECONDARY Edge change: 649,854 --> 306,231
TERTIARY Edge change: 506,600 --> 148,693

SAVING Cruncbase_1Way_SingleEdge: (1290346,981877)
**************************************************

BuIlDiNg GrApH...
- ADDING EDGES IN THE REVERSE DIRECTION

Remove duplicates from Crunchbase graph

Node change: 1,290,346 --> 1,290,346
Edge change: 4,189,416 --> 3,896,556

PRIMARY Edge change: 1,876,508 --> 1,876,400
SECONDARY Edge change: 1,

### Loading SGraphs
- `Cruncbase_1Way_MultiEdge`: Directed SGraph, one way, parallel edges (**MAIN GRAPH**)

- `Cruncbase_1Way_SingleEdge`: Directed SGraph, one way, **no parallel edges**

- `Crunchbase_2Ways_MultiEdge`: Directed SGraph, **two ways**, parallel edges (**WHEN NEEDED FOR FEATURE CALCULATIONS**)
    
- `Crunchbase_2Ways_SingleEdge`: Directed SGraph, two ways, **no parallel edges**
    
There here are 4 additional graphs with weights added! See complete list in code cell.

In [8]:
# Load
cb = load_sgraph(CRUNCHBASE_GRAPHS + 'Cruncbase_1Way_MultiEdge') # SELECTED
# cb = load_sgraph(CRUNCHBASE_GRAPHS + 'Crunchbase_2Ways_MultiEdge')
# cb = load_sgraph(CRUNCHBASE_GRAPHS + 'Cruncbase_1Way_SingleEdge')
# cb = load_sgraph(CRUNCHBASE_GRAPHS + 'Crunchbase_2Ways_SingleEdge')

# With Weights
# cb = load_sgraph(CRUNCHBASE_GRAPHS + 'Cruncbase_1Way_MultiEdge_Weighted')
# cb = load_sgraph(CRUNCHBASE_GRAPHS + 'Crunchbase_2Ways_MultiEdge_Weighted')
# cb = load_sgraph(CRUNCHBASE_GRAPHS + 'Cruncbase_1Way_SingleEdge_Weighted')
# cb = load_sgraph(CRUNCHBASE_GRAPHS + 'Crunchbase_2Ways_SingleEdge_Weighted')

# 5. Reduce size of dataset by limiting degrees of freedom from Pledge 1% companies.

Start by getting the vertex list from the entire Crunchbase network.

In [7]:
# Get subgraph vertices to sample from
cb_vertices = cb.get_vertices()

# Append investors + companies together into new SFrame
sample_vertices = cb_vertices[cb_vertices['__node_type']=='investor']
sample_vertices = sample_vertices.append(cb_vertices[cb_vertices['__node_type']=='company'])

# Save to CSV so you don't have to re-do this !
pd.DataFrame(sample_vertices).to_csv(SAMPLE_VERTICES+'ALL_CB_Pick_Sample_Companies_From_Here.csv', index=False)

#### Reduce the CB dataset down to 5,4,3,2 degrees away from Pledge 1%, and save all as CSV.

- Retrieve the graph neighborhood around a set of vertices, ignoring edge directions.
- Reference: <a href='https://apple.github.io/turicreate/docs/api/generated/turicreate.SGraph.get_neighborhood.html'>turicreate.SGraph.get_neighborhood</a>

In [13]:
# Define radii for calculating degrees of separation away from Pledge 1% companies
for rad in [2,3,4,5]:
    print(f'FOR RADIUS {rad}')
    
    # Create subgraph
    cb_smol = cb.get_neighborhood(ids=p1_companies_uuid, radius=rad, full_subgraph=True)

    # Save dictionaries which store info about graph
    before = cb.summary() # Full graph
    after = cb_smol.summary() # Subgraph

    # Output
    print('Radius of the neighborhood: {} degrees of separation from Pledge 1% companies uuids'.format(rad))
    print('Reduction in nodes: {:.2f}%'.format((1-(after['num_vertices']/before['num_vertices']))*100))
    print('Reduction in edges: {:.2f}%'.format((1-(after['num_edges']/before['num_edges']))*100))
    print('\nNode change: {:,} --> {:,}'.format(before['num_vertices'], after['num_vertices']))
    print('Edge change: {:,} --> {:,}'.format(before['num_edges'], after['num_edges']))
    print()
    
    # Get subgraph vertices to sample from
    cb_smol_vertices = cb_smol.get_vertices()

    # Append investors + companies together into new SFrame
    sample_vertices = cb_smol_vertices[cb_smol_vertices['__node_type']=='investor']
    sample_vertices = sample_vertices.append(cb_smol_vertices[cb_smol_vertices['__node_type']=='company'])

    # Save to CSV so you don't have to re-do this !
    pd.DataFrame(sample_vertices).to_csv(SAMPLE_VERTICES+'DEGREE_{}_Pick_Sample_Companies_From_Here.csv'.format(rad), index=False)

Radius of the neighborhood: 2 degrees of separation from Pledge 1% companies uuids
Reduction in nodes: 87.40%
Reduction in edges: 61.60%

Node change: 1,290,346 --> 162,579
Edge change: 3,896,556 --> 1,496,345


# 6. Produce all samples of the Crunchbase graphs. 10 for each scenario below for each neighborhood. Save to CSV.

#### Neighborhoods
- All of Crunchbase
- 5 degrees away from Pledge 1%
- 4 degrees away from Pledge 1%
- 3 degrees away from Pledge 1%
- 2 degrees away from Pledge 1%

#### Scenarios $\rightarrow$ 10 for each
1. Baseline reduced only
2. Baseline only
3. Graph only
4. Graph + Baseline reduced
5. Graph + Baseline

In [14]:
# Loading SFrames
lst_of_frames = []
for val in ['cb','p1']:
    lst = []
    for idx in range(12):
        lst.append(SFrame(data=NETWORK_SFRAMES+'{}/{}_df.csv'.format(val, idx)))
    lst_of_frames.append(lst)
cb_sframes,p1_sframes = lst_of_frames

# List of Pledge 1% uuids
p1_companies_uuid = list(p1_sframes[0]['uuid'].unique())
p1_companies_uuid.extend(list(p1_sframes[1]['uuid'].unique()))
p1_companies_uuid = list(set(p1_companies_uuid))
positive_labels = p1_companies_uuid

# Load CB Graphs
cb0 = load_sgraph(CRUNCHBASE_GRAPHS+'Cruncbase_1Way_MultiEdge')
cb1 = load_sgraph(CRUNCHBASE_GRAPHS+'Crunchbase_2Ways_MultiEdge')
cb2 = load_sgraph(CRUNCHBASE_GRAPHS+'Cruncbase_1Way_SingleEdge')
cb3 = load_sgraph(CRUNCHBASE_GRAPHS+'Crunchbase_2Ways_SingleEdge')

# Load CB Graphs With Weights
cb0w = load_sgraph(CRUNCHBASE_GRAPHS+'Cruncbase_1Way_MultiEdge_Weighted')
cb1w = load_sgraph(CRUNCHBASE_GRAPHS+'Crunchbase_2Ways_MultiEdge_Weighted')
cb2w = load_sgraph(CRUNCHBASE_GRAPHS+'Cruncbase_1Way_SingleEdge_Weighted')
cb3w = load_sgraph(CRUNCHBASE_GRAPHS+'Crunchbase_2Ways_SingleEdge_Weighted')

# Dataframe vertices from different Crunchbase graphs
ALL_vertices = pd.read_csv(SAMPLE_VERTICES+'ALL_CB_Pick_Sample_Companies_From_Here.csv')
DEGREE_5_vertices = pd.read_csv(SAMPLE_VERTICES+'/DEGREE_5_Pick_Sample_Companies_From_Here.csv')
DEGREE_4_vertices = pd.read_csv(SAMPLE_VERTICES+'DEGREE_4_Pick_Sample_Companies_From_Here.csv')
DEGREE_4_vertices = pd.read_csv(SAMPLE_VERTICES+'DEGREE_4_Pick_Sample_Companies_From_Here.csv')
DEGREE_3_vertices = pd.read_csv(SAMPLE_VERTICES+'DEGREE_3_Pick_Sample_Companies_From_Here.csv')
DEGREE_2_vertices = pd.read_csv(SAMPLE_VERTICES+'DEGREE_2_Pick_Sample_Companies_From_Here.csv')

# Setting up loops
neighborhoods_name = ['Model_DF_D2', 'Model_DF_D3', 'Model_DF_D4', 'Model_DF_D5', 'Model_DF_ALL']
neighborhoods = [DEGREE_2_vertices, DEGREE_3_vertices, DEGREE_4_vertices, DEGREE_5_vertices, ALL_vertices]
neighborhoods_dict = dict(zip(neighborhoods_name,neighborhoods))

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,float,str,str,str,str,str,float,str,str,str,str,int,str,float,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,float,str,str,str,str,str,float,str,str,str,str,int,str,float,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,float,float,float,float,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,float,float,float,float,str,int,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,float,str,float,float,str,int,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,float,str,str,str,str,str,float,str,str,str,str,int,str,float,str,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,float,str,str,str,str,str,float,str,str,str,str,int,str,float,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,float,str,str,str,str,str,float,str,str,str,str,int,str,float,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,float,float,float,float,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,float,float,float,float,str,int,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,float,float,float,float,str,int,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str,str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,float,str,str,str,str,str,float,str,str,str,str,int,str,float,str,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


## Create baseline scenario CSVs for each neighborhood

####  The code below is for Baseline Reduced (`BR`) & Baseline (`B`) scenarios, which require no graph feature calculations.

In [37]:
for neighborhood in neighborhoods_name:
    for scenario in ['B', 'BR']:
        for idx in range(10):
            
            # Retrieve vertex dataframe
            DF = neighborhoods_dict[neighborhood]
            
            # Sample equal size of non-P1 companies from vertices dataframe
            negatives_labels = DF.sample(int(len(positive_labels)), replace=False)['__id'].to_list()
        
            # Combine, avoid duplicates
            model_labels = list(np.unique(positive_labels + negatives_labels))
            
            # Reduce to sample CSV
            smol_DF = DF[['__id']][DF['__id'].isin(model_labels)].reset_index(drop=True).rename({'__id':'uuid'}, axis=1)
            
            # Output to CSV
            path = MODEL_CSVS+'{}/{}/{}.csv'.format(neighborhood,scenario,idx)
            smol_DF.to_csv(path, index=False)
            print(f'SAVING to {path}')

SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_D2/B/0.csv
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_D2/B/1.csv
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_D2/B/2.csv
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_D2/B/3.csv
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_D2/B/4.csv
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_D2/B/5.csv
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_D2/B/6.csv
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_D2/B/7.csv
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_D2/B/8.csv
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_D2/B/9.csv
SAVING to 

SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_ALL/B/3.csv
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_ALL/B/4.csv
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_ALL/B/5.csv
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_ALL/B/6.csv
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_ALL/B/7.csv
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_ALL/B/8.csv
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_ALL/B/9.csv
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_ALL/BR/0.csv
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_ALL/BR/1.csv
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/models/Model_DF_ALL/BR/2.c

### Methods for computing Graph features

#### Pagerank
- The pagerank.create() method computes the pagerank for each vertex and returns a PagerankModel. The pagerank value indicates the centrality of each node in the graph.
- Compute the PageRank for each vertex in the graph. Return a model object with total PageRank as well as the PageRank value for each vertex in the graph.
- Reference: <a href='https://apple.github.io/turicreate/docs/api/generated/turicreate.pagerank.create.html#turicreate.pagerank.create'>turicreate.pagerank.create</a>

#### Shortest path
- Compute the single source shortest path distance from the source vertex to all vertices in the graph. Note that because SGraph is directed, shortest paths are also directed. To find undirected shortest paths add edges to the SGraph in both directions. Return a model object with distance each of vertex in the graph.
- Reference: <a href='https://apple.github.io/turicreate/docs/api/generated/turicreate.shortest_path.create.html#turicreate.shortest_path.create'>turicreate.shortest_path.create</a>

#### K-core decomposition
- Compute the K-core decomposition of the graph. Return a model object with total number of cores as well as the core id for each vertex in the graph.
- Reference: <a href='https://apple.github.io/turicreate/docs/api/generated/turicreate.kcore.create.html'>turicreate.kcore.create</a>

#### Degree counting
- Compute the in degree, out degree and total degree of each vertex.
- Reference: <a href='https://apple.github.io/turicreate/docs/api/generated/turicreate.degree_counting.create.html#turicreate.degree_counting.create'>turicreate.degree_counting.create</a>

#### Triangle Counting
- Compute the number of triangles each vertex belongs to, ignoring edge directions. A triangle is a complete subgraph with only three vertices. Return a model object with total number of triangles as well as the triangle counts for each vertex in the graph.
- Reference: <a href='https://apple.github.io/turicreate/docs/api/generated/turicreate.triangle_counting.create.html#turicreate.triangle_counting.create'>turicreate.triangle_counting.create</a>



## Create graph scenario CSVs for each neighborhood

####  The code below is for Graph only (`G`), Graph & Baseline (`GB`), Graph & Baseline Reduced (`GBR`) scenarios.

In [None]:
# Turicreate
turicreate.config.set_runtime_config('TURI_DEFAULT_NUM_GRAPH_LAMBDA_WORKERS', 96)

# Fields needed
sgraph_idx = {0:'cb0w',1:'cb1w',2:'cb2w', 3:'cb3w'} # Only needed weighted versions
sgraph_idx_inv = {v:k for (k,v) in sgraph_idx.items()} # For saving the right column name

# List of graphs used in loop
list_of_graphs = [cb0w,cb1w,cb2w,cb3w]

# Coordinating -- for loading in graphs
feat_graph_map = {'pagerank':['cb0w', 'cb1w', 'cb2w', 'cb3w'],
                  'pagerank_weight':['cb0w', 'cb1w', 'cb2w', 'cb3w'],
                  'kcore':['cb2w', 'cb3w'], # Number of edges does not matter, single edge
                  'degree':['cb0w', 'cb1w', 'cb2w', 'cb3w'], # Doesn't require a lot of computational power
                  'triangle':['cb0w', 'cb2w'], # Ignores edge directions, 1-way
                  'shortest':['cb1w', 'cb3w'],  # Requires bi-directional edges
                  'shortest_weight':['cb1w', 'cb3w']} # Requires bi-directional edges

for neighborhood in neighborhoods_name: # 2 times
    for scenario in ['G','GB','GBR']: # 3 times
        for idx in range(1,10): # 10 times
            print('*'*50)
            print('{} | {} | {}'.format(neighborhood,scenario,idx))
            print('*'*50)
            # Grab neighborhood DF to start with
            DF = neighborhoods_dict[neighborhood]
            # Sample equal size of non-P1 companies from vertices dataframe
            negatives_labels = DF.sample(int(len(positive_labels)), replace=False)['__id'].to_list()
            # Combine, avoid duplicates
            model_labels = list(np.unique(positive_labels + negatives_labels))
            # SEND TO GRAPH FEATURE METHOD WHICH: CREATES GRAPH FOR FEATURE & APPENDS FEATURE TO MODEL DATAFRAME
            smol_DF = feature_creation(model_labels, list_of_graphs, p1_companies_uuid)
            # Output to CSV
            path = MODEL_CSVS+'{}/{}/{}.csv'.format(neighborhood,scenario,idx)
            smol_DF.to_csv(path, index=False)
            print('SAVING to {}\n'.format(path))

**************************************************
Model_DF_D2 | G | 1
**************************************************
Creating graph CB0W
HERE_PR
['__id', 'pr_0']
HERE_PR_W
Iteration 0: total pagerank changed in L1 = 307826.082546
Iteration 1: total pagerank changed in L1 = 75742.167535
Iteration 2: total pagerank changed in L1 = 74902.446321
Weighted pagerank finished in: 49.795091 secs
['__id', 'w_pr_0']
HERE_SP
['__id']
HERE_SP_W
['__id']
HERE_KC
['__id']
HERE_D
['__id', 'in_deg_0', 'out_deg_0']
HERE_T
['__id', 'tri_0']
Creating graph CB1W
HERE_PR
['__id', 'pr_1']
HERE_PR_W
Iteration 0: total pagerank changed in L1 = 212271.521795
Iteration 1: total pagerank changed in L1 = 9434.232451
Iteration 2: total pagerank changed in L1 = 0.000000
Weighted pagerank finished in: 80.512814 secs
['__id', 'w_pr_1']
HERE_SP
1
2
3
4
5
['__id', 'spath_top_1_0', 'spath_top_1_1', 'spath_top_1_2', 'spath_top_1_3', 'spath_top_1_4', 'spath_top_min_1']
HERE_SP_W
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16


['__id', 'w_spath_top_1_0', 'w_spath_top_1_1', 'w_spath_top_1_2', 'w_spath_top_1_3', 'w_spath_top_1_4', 'w_spath_top_min_1']
HERE_KC
['__id']
HERE_D
['__id', 'in_deg_1', 'out_deg_1']
HERE_T
['__id']
Creating graph CB2W
HERE_PR
['__id', 'pr_2']
HERE_PR_W
Iteration 0: total pagerank changed in L1 = 310585.479842
Iteration 1: total pagerank changed in L1 = 78469.831748
Iteration 2: total pagerank changed in L1 = 77540.547805
Weighted pagerank finished in: 45.213636 secs
['__id', 'w_pr_2']
HERE_SP
['__id']
HERE_SP_W
['__id']
HERE_KC
['__id', 'kc_2']
HERE_D
['__id', 'in_deg_2', 'out_deg_2']
HERE_T
['__id', 'tri_2']
Creating graph CB3W
HERE_PR
['__id', 'pr_3']
HERE_PR_W
Iteration 0: total pagerank changed in L1 = 216775.455550
Iteration 1: total pagerank changed in L1 = 10431.754011
Iteration 2: total pagerank changed in L1 = 0.000000
Weighted pagerank finished in: 67.786341 secs
['__id', 'w_pr_3']
HERE_SP
1
2
3
4
5
6
7
8
9
['__id', 'spath_top_3_0', 'spath_top_3_1', 'spath_top_3_2', 'spath_t

Iteration 2: total pagerank changed in L1 = 0.000000
Weighted pagerank finished in: 69.628080 secs
['__id', 'w_pr_3']
HERE_SP
1
2
3
4
5
6
7
['__id', 'spath_top_3_0', 'spath_top_3_1', 'spath_top_3_2', 'spath_top_3_3', 'spath_top_3_4', 'spath_top_min_3']
HERE_SP_W
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
['__id', 'w_spath_top_3_0', 'w_spath_top_3_1', 'w_spath_top_3_2', 'w_spath_top_3_3', 'w_spath_top_3_4', 'w_spath_top_min_3']
HERE_KC
['__id', 'kc_3']
HERE_D
['__id', 'in_deg_3', 'out_deg_3']
HERE_T
['__id']
DATAFRAME SHAPE: (10340, 45)
SAVING to /home/ski/Desktop/crunchbase-p1-machine-learning/files/output/model_csvs/Model_DF_D2/G/7.csv

**************************************************
Model_DF_D2 | G | 8
**************************************************
Creating graph CB0W
HERE_PR
['__id', 'pr_0']
HERE_PR_W
Iteration 0: total pagerank changed in L1 = 308708.591566
Iteration 1: total pagerank changed in L1 = 76323.052097
Iteration 2: total pagerank changed in L1 = 75485.7320

Iteration 0: total pagerank changed in L1 = 308761.759348
Iteration 1: total pagerank changed in L1 = 76692.924504
Iteration 2: total pagerank changed in L1 = 75847.950882
Weighted pagerank finished in: 57.735590 secs
['__id', 'w_pr_0']
HERE_SP
['__id']
HERE_SP_W
['__id']
HERE_KC
['__id']
HERE_D
['__id', 'in_deg_0', 'out_deg_0']
HERE_T
['__id', 'tri_0']
Creating graph CB1W
HERE_PR
['__id', 'pr_1']
HERE_PR_W


### FYI: Old code from previous graph feature testing...

In [32]:
# # Fields needed for this function
# lst_of_graphs = [cb0,cb1,cb2,cb3,cb0w,cb1w,cb2w,cb3w]
# sgraph_idx_assign = {0:'cb0',1:'cb1',2:'cb2',3:'cb3',0:'cb0',1:'cb1',2:'cb2',3:'cb3'}
# vertex_type_list = ['cb_smol_ALL', 'cb_smol_D5', 'cb_smol_D4', 'cb_smol_D3', 'cb_smol_D2']
# model_uuids_dict = {v:[] for v in vertex_type_list}

# def make_smol_sgraphs(positive_labels, vertex_df, string, SGraph_list, radius=3):
    
#     # Sample equal size of non-P1 companies from vertices dataframe
#     negatives_labels = vertex_df.sample(int(len(positive_labels)), replace=False)['__id'].to_list()
        
#     # Combine, avoid duplicates
#     model_labels = list(np.unique(positive_labels + negatives_labels))

#     for idx,graph in enumerate(SGraph_list):
            
#         # Create subgraph
#         print('Creating graph {}'.format(sgraph_idx_assign[idx].upper()))
#         smol = graph.get_neighborhood(ids=model_labels, radius=radius, full_subgraph=True)   
            
#         # Save subgraph
#         path = 'ModelGraphs/test/{}_{}'.format(string,sgraph_idx_assign[idx])
#         smol.save(path)
#         print('SAVING to {}\n'.format(path))
        
#     # Output model labels for this set of graphs
#     return model_labels

# model_labels = make_smol_sgraphs(positive_labels, ALL_vertices, 'cb_smol_ALL', lst_of_graphs, radius=3)
# model_uuids_dict['cb_smol_ALL'] = model_labels

# model_labels = make_smol_sgraphs(positive_labels, DEGREE_5_vertices, 'cb_smol_D5',lst_of_graphs, radius=3)
# model_uuids_dict['cb_smol_D5'] = model_labels

# model_labels = make_smol_sgraphs(positive_labels, DEGREE_4_vertices, 'cb_smol_D4',lst_of_graphs, radius=3)
# model_uuids_dict['cb_smol_D4'] = model_labels

# model_labels = make_smol_sgraphs(positive_labels, DEGREE_3_vertices, 'cb_smol_D3',lst_of_graphs, radius=3)
# model_uuids_dict['cb_smol_D3'] = model_labels

# model_labels = make_smol_sgraphs(positive_labels, DEGREE_2_vertices, 'cb_smol_D2', lst_of_graphs, radius=3)
# model_uuids_dict['cb_smol_D2'] = model_labels

In [39]:
# # Coordinating -- for loading in graphs
# vertex_type_list = ['cb_smol_ALL', 'cb_smol_D4','cb_smol_D2']
# feat_graph_map = {'pagerank':['cb0','cb1','cb2','cb3'], 
#                   'kcore':['cb0','cb1','cb2','cb3'],
#                   'degree':['cb0','cb1'], 
#                   'triangle':['cb0','cb1'],
#                   'shortest':['cb1', 'cb3'], 
#                   'shortest_weight':['cb1w', 'cb3w']}
# vertex_df_map = {v:pd.DataFrame(columns=['__id']) for v in vertex_type_list}

# from turicreate import pagerank
# from functools import reduce

# # Mapping for this function
# sgraph_idx_assign = {0:'cb0',1:'cb1',2:'cb2',3:'cb3'}

# if not len(sgraph_idx_assign.items())==len(feat_graph_map['pagerank']):
#     print('THE ASSIGNMENT DOES NOT MATCH NUMBER OF GRAPHS')

# for vertex_type in vertex_type_list:
#     lst_of_frames = []
#     for idx,smol in enumerate(feat_graph_map['pagerank']):
#         print('CaLcUlAtInG pAgeRaNk for graph {}, in graph neighborhood {}'.format(sgraph_idx_assign[idx].upper(),vertex_type.upper()))
#         path = 'ModelGraphs/test/{}_{}'.format(vertex_type,smol)
#         graph = load_sgraph(path)
#         pr = pagerank.create(graph, verbose=False)
#         pr_sframe = pr['pagerank']

#         # Modifying output SFrame
#         pr_df = pd.DataFrame(pr_sframe)
#         pr_df = pr_df.drop('delta', axis=1)
#         pr_df = pr_df[pr_df['__id'].isin(model_uuids_dict[vertex_type])].reset_index(drop=True)
#         pr_df = pr_df.rename({'pagerank':'pr_{}'.format(idx)}, axis=1)
        
#         # Save to temp lst_of_frames
#         lst_of_frames.append(pr_df)
    
#     PR_DF = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), lst_of_frames)
#     vertex_df_map[vertex_type] = pd.merge(vertex_df_map[vertex_type], PR_DF, on='__id', how='outer')
    
#################################################################################
# from turicreate import kcore
# # Mapping for this function
# sgraph_idx_assign = {0:'cb0',1:'cb1',2:'cb2',3:'cb3'}

# if not len(sgraph_idx_assign.items())==len(feat_graph_map['kcore']):
#     print('THE ASSIGNMENT DOES NOT MATCH NUMBER OF GRAPHS')

# for vertex_type in vertex_type_list:
#     lst_of_frames = []
#     for idx,smol in enumerate(feat_graph_map['kcore']):
#         print('CaLcUlAtInG kCoRe for graph {}, in graph neighborhood {}'.format(sgraph_idx_assign[idx].upper(),vertex_type.upper()))
#         path = 'ModelGraphs/test/{}_{}'.format(vertex_type, smol)
#         graph = load_sgraph(path)
#         kc = kcore.create(graph, kmin=0, kmax=10, verbose=False)
#         kc_sframe = kc['core_id'] 
        
#         # Modifying output SFrame
#         kc_df = pd.DataFrame(kc_sframe)
#         kc_df = kc_df[kc_df['__id'].isin(model_uuids_dict[vertex_type])].reset_index(drop=True)
#         kc_df = kc_df.rename({'core_id':'kc_{}'.format(idx)}, axis=1)
        
#         # Save to temp lst_of_frames
#         lst_of_frames.append(kc_df)
    
#     KC_DF = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), lst_of_frames)
#     vertex_df_map[vertex_type] = pd.merge(vertex_df_map[vertex_type], KC_DF, on='__id', how='outer')

#################################################################################
# from turicreate import degree_counting
# # Mapping for this function
# sgraph_idx_assign = {0:'cb0',1:'cb1'}

# if not len(sgraph_idx_assign.items())==len(feat_graph_map['degree']):
#     print('THE ASSIGNMENT DOES NOT MATCH NUMBER OF GRAPHS')

# for vertex_type in vertex_type_list:
#     lst_of_frames = []
#     for idx,smol in enumerate(feat_graph_map['degree']):
#         print('CaLcUlAtInG dEgReEs for graph {}, in graph neighborhood {}'.format(sgraph_idx_assign[idx].upper(),vertex_type.upper()))
#         path = 'ModelGraphs/test/{}_{}'.format(vertex_type, smol)
#         graph = load_sgraph(path)
#         deg = degree_counting.create(graph)
#         deg_sgraph = deg['graph'] 
#         deg_df = pd.DataFrame(deg_sgraph.vertices[['__id', 'in_degree', 'out_degree']])
#         deg_df = deg_df[deg_df['__id'].isin(model_uuids_dict[vertex_type])].reset_index(drop=True)
#         deg_df = deg_df.rename({'in_degree':'in_deg_{}'.format(idx),
#                              'out_degree':'out_deg_{}'.format(idx)}, axis=1)
#         # Save to temp lst_of_frames
#         lst_of_frames.append(deg_df)
#     DEG_DF = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), lst_of_frames)
#     vertex_df_map[vertex_type] = pd.merge(vertex_df_map[vertex_type], DEG_DF, on='__id', how='outer')
    
#################################################################################
# from turicreate import triangle_counting
# # Mapping for this function
# sgraph_idx_assign = {0:'cb0', 1:'cb1'}

# if not len(sgraph_idx_assign.items())==len(feat_graph_map['triangle']):
#     print('THE ASSIGNMENT DOES NOT MATCH NUMBER OF GRAPHS')
    
# for vertex_type in vertex_type_list:
#     lst_of_frames = []
#     for idx,smol in enumerate(feat_graph_map['triangle']):
#         print('CaLcUlAtInG TrIaNgLeS for graph {}, in graph neighborhood {}'.format(sgraph_idx_assign[idx].upper(),vertex_type.upper()))
#         path = 'ModelGraphs/test/{}_{}'.format(vertex_type, smol)
#         graph = load_sgraph(path)
#         tc = triangle_counting.create(graph, verbose=False)
#         tri_df = pd.DataFrame(tc['triangle_count'])
#         tri_df = tri_df[tri_df['__id'].isin(model_uuids_dict[vertex_type])].reset_index(drop=True)
#         tri_df = tri_df.rename({'triangle_count':'tri_{}'.format(idx)},axis=1)
#         # Save to temp lst_of_frames
#         lst_of_frames.append(tri_df)
#     TRI_DF = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), lst_of_frames)
#     vertex_df_map[vertex_type] = pd.merge(vertex_df_map[vertex_type], TRI_DF, on='__id', how='outer')
    
#################################################################################
# # Mapping for this function
# sgraph_idx_assign = {0:'cb1',1:'cb3'}
# sgraph_idx_jdx_assign = {0:1, 1:3}

# if not len(sgraph_idx_assign.items())==len(feat_graph_map['shortest']):
#     print('THE ASSIGNMENT DOES NOT MATCH NUMBER OF GRAPHS')
    
# for vertex_type in vertex_type_list:
#     lst_of_frames = []

#     for idx,smol in enumerate(feat_graph_map['shortest']):
#         print('CaLcUlAtInG sHoRtEsT PaTh tOP P1 for graph {}, in graph neighborhood {}'.format(sgraph_idx_assign[idx].upper(),vertex_type.upper()))
#         path = 'ModelGraphs/test/{}_{}'.format(vertex_type, smol)
#         graph = load_sgraph(path)
#         pr = vertex_df_map[vertex_type][['__id', 'pr_{}'.format(sgraph_idx_jdx_assign[idx])]].sort_values(by='pr_{}'.format(sgraph_idx_jdx_assign[idx]),ascending=False)
#         pr = pr['__id'].to_list()[:200]
#         count = 0
#         top_p1 = []
#         while len(top_p1) < 5:
#             if pr[count] in p1_companies_uuid:
#                 top_p1.append(pr[count])
#             count += 1
#         lst_of_lst_of_frames = []
#         for jdx,uuid in enumerate(top_p1):
#             sp = shortest_path.create(graph, source_vid=uuid, verbose=False)
#             sp_df = pd.DataFrame(sp['distance'])
#             sp_df = sp_df[sp_df['__id'].isin(model_uuids_dict[vertex_type])].reset_index(drop=True)
#             sp_df = sp_df.rename({'distance': 'spath_top_{}_{}'.format(sgraph_idx_jdx_assign[idx],jdx)}, axis=1)
#             lst_of_lst_of_frames.append(sp_df)
#         sp_df = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), lst_of_lst_of_frames)
#         sp_df['spath_top_min_{}'.format(sgraph_idx_jdx_assign[idx])] = sp_df.min(axis=1) 
#         lst_of_frames.append(sp_df)

#     DIST_DF = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), lst_of_frames)
#     vertex_df_map[vertex_type] = pd.merge(vertex_df_map[vertex_type], DIST_DF, on='__id', how='outer')
    
#################################################################################
# from turicreate import shortest_path

# # Mapping for this function
# sgraph_idx_assign = {0:'cb1w',1:'cb3w'}
# sgraph_idx_jdx_assign = {0:1, 1:3}

# if not len(sgraph_idx_assign.items())==len(feat_graph_map['shortest']):
#     print('THE ASSIGNMENT DOES NOT MATCH NUMBER OF GRAPHS')
    
# for vertex_type in vertex_type_list:
#     lst_of_frames = []

#     for idx,smol in enumerate(feat_graph_map['shortest_weight']):
#         print('CaLcUlAtInG sHoRtEsT PaTh tOP P1 for graph {}, in graph neighborhood {}'.format(sgraph_idx_assign[idx].upper(),vertex_type.upper()))
#         path = 'ModelGraphs/test/{}_{}'.format(vertex_type, smol)
#         graph = load_sgraph(path)
#         pr = vertex_df_map[vertex_type][['__id', 'pr_{}'.format(sgraph_idx_jdx_assign[idx])]].sort_values(by='pr_{}'.format(sgraph_idx_jdx_assign[idx]),ascending=False)
#         pr = pr['__id'].to_list()[:200]
#         count = 0
#         top_p1 = []
#         while len(top_p1) < 5:
#             if pr[count] in p1_companies_uuid:
#                 top_p1.append(pr[count])
#             count += 1
#         lst_of_lst_of_frames = []
#         for jdx,uuid in enumerate(top_p1):
#             sp = shortest_path.create(graph, source_vid=uuid, weight_field='weight', verbose=False)
#             sp_df = pd.DataFrame(sp['distance'])
#             sp_df = sp_df[sp_df['__id'].isin(model_uuids_dict[vertex_type])].reset_index(drop=True)
#             sp_df = sp_df.rename({'distance': 'w_spath_top_{}_{}'.format(sgraph_idx_jdx_assign[idx],jdx)}, axis=1)
#             lst_of_lst_of_frames.append(sp_df)
#         sp_df = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), lst_of_lst_of_frames)
#         sp_df['w_spath_top_min_{}'.format(sgraph_idx_jdx_assign[idx])] = sp_df.min(axis=1) 
#         lst_of_frames.append(sp_df)

#     DIST_DF = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), lst_of_frames)
#     vertex_df_map[vertex_type] = pd.merge(vertex_df_map[vertex_type], DIST_DF, on='__id', how='outer')

#################################################################################
# # Weighted pagerank
# # Mapping for this function
# sgraph_idx_assign = {0:'cb1w',1:'cb2w', 2:'cb3w', 3:'cb4w'}
# if not len(sgraph_idx_assign.items())==len(feat_graph_map['pagerank_weight']):
#     print('THE ASSIGNMENT DOES NOT MATCH NUMBER OF GRAPHS')
# for vertex_type in vertex_type_list:
#     lst_of_frames = []
#     for idx,smol in enumerate(feat_graph_map['pagerank_weight']):
#         print('CaLcUlAtInG wEiGhTeD pAgeRaNk for graph {}, in graph neighborhood {}'.format(sgraph_idx_assign[idx].upper(),vertex_type.upper()))
#         path = 'ModelGraphs/test/{}_{}'.format(vertex_type,smol)
#         graph = load_sgraph(path)
#         pr_w = pagerank_weighted(graph)
#         pr_w_sframe = pr_w['__id', 'pagerank']
#         # Modifying output SFrame
#         pr_w_df = pd.DataFrame(pr_w_sframe)
#         pr_w_df = pr_w_df[pr_w_df['__id'].isin(model_uuids_dict[vertex_type])].reset_index(drop=True)
#         pr_w_df = pr_w_df.rename({'pagerank_weight':'w_pr_{}'.format(idx)}, axis=1)
#         # Save to temp lst_of_frames
#         lst_of_frames.append(pr_w_df)
#     PR_W_DF = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), lst_of_frames)
#     vertex_df_map[vertex_type] = pd.merge(vertex_df_map[vertex_type], PR_W_DF, on='__id', how='outer')

CaLcUlAtInG pAgeRaNk for graph CB0, in graph neighborhood CB_SMOL_ALL
CaLcUlAtInG pAgeRaNk for graph CB1, in graph neighborhood CB_SMOL_ALL
CaLcUlAtInG pAgeRaNk for graph CB2, in graph neighborhood CB_SMOL_ALL
CaLcUlAtInG pAgeRaNk for graph CB3, in graph neighborhood CB_SMOL_ALL
CaLcUlAtInG pAgeRaNk for graph CB0, in graph neighborhood CB_SMOL_D4
CaLcUlAtInG pAgeRaNk for graph CB1, in graph neighborhood CB_SMOL_D4
CaLcUlAtInG pAgeRaNk for graph CB2, in graph neighborhood CB_SMOL_D4
CaLcUlAtInG pAgeRaNk for graph CB3, in graph neighborhood CB_SMOL_D4
CaLcUlAtInG pAgeRaNk for graph CB0, in graph neighborhood CB_SMOL_D2
CaLcUlAtInG pAgeRaNk for graph CB1, in graph neighborhood CB_SMOL_D2
CaLcUlAtInG pAgeRaNk for graph CB2, in graph neighborhood CB_SMOL_D2
CaLcUlAtInG pAgeRaNk for graph CB3, in graph neighborhood CB_SMOL_D2
CaLcUlAtInG kCoRe for graph CB0, in graph neighborhood CB_SMOL_ALL
CaLcUlAtInG kCoRe for graph CB1, in graph neighborhood CB_SMOL_ALL
CaLcUlAtInG kCoRe for graph CB2, i