In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Values of the `hs6` column that this notebook treats as battery parts.
# They are kept as strings because they are matched against `index['hs6']`
# with string comparisons. Every code is listed exactly once.
BATTERY_PARTS = [
    '281700', '282200', '392020', '392051', '392119', '392310',
    '420212', '721240', '722230', '722699', '730120', '730690',
    '732619', '732620', '740822', '740919', '740921', '741011',
    '741220', '741533', '750522', '750610', '750620', '760612',
    '760719', '790700', '810590', '831110', '831120', '831190',
    '850110', '850120', '850590', '850640', '850660', '850730',
    '850780', '853223', '853340', '854190', '854370', '854519',
    '854720', '860900', '391910', '391990', '392069', '731822',
    '854449', '391732', '280430', '290123', '292390', '281122',
    '390230', '760410', '740990', '282300', '284210', '740811',
    '853630', '382499', '761699', '830249', '854411', '848049',
    '850790', '732690', '280440', '854290', '731816', '830230',
    '392099', '401699', '730711', '741999', '850680', '853222',
    '390210', '382490', '854442', '282110', '853890', '853610',
    '850450', '732599', '392690', '730890', '731100', '853641',
]

# `hs6` value that identifies the battery itself.
BATTERY = '850760'

# Get the Top Battery Suppliers

In [91]:
# Load the hs6-level index table. `hs6` is read as text because the rest of
# the notebook compares it with string codes (BATTERY, BATTERY_PARTS); with
# an inferred numeric dtype those comparisons would silently match nothing.
index = pd.read_csv(
    "s3://supply-web-data-storage/CSV/index_hs6.csv",
    dtype={"hs6": str},
)

In [92]:
# Per-column count of missing values, restricted to rows whose hs6 is BATTERY
index.loc[index['hs6'].eq(BATTERY)].isna().sum()

supplier_t      0
supplier_id     0
buyer_t         1
buyer_id        1
hs6             0
st              0
et              0
bill_count      0
quantity_sum    0
weight_sum      0
amount_sum      0
dtype: int64

In [117]:
def generate_battery_suppliers(index, method_name=None, count=10, df_print=False):
    """Return the names of the `count` largest suppliers of BATTERY.

    Rows of `index` whose `hs6` equals BATTERY are grouped by `supplier_t`,
    the `method_name` column is summed per supplier, and the suppliers with
    the highest totals are kept.

    index:       DataFrame with at least `hs6`, `supplier_t` and the
                 `method_name` column.
    method_name: column whose per-supplier sum ranks the suppliers (required).
    count:       number of suppliers to return.
    df_print:    when true, print the ranked table for the top `count`.

    Raises ValueError when `method_name` is None and AssertionError when
    fewer than `count` distinct suppliers are available.
    """
    if method_name is None:
        raise ValueError('Must specify a method for defining largest supplier.')

    battery_rows = index[index['hs6'] == BATTERY]
    totals = battery_rows.groupby('supplier_t').agg(
        indicator_value=(method_name, 'sum')
    )
    ranked = totals.sort_values(["indicator_value"], ascending=False)
    leaders = ranked.head(count)

    if df_print:
        print(leaders)

    top_suppliers = set(leaders.index)
    assert len(top_suppliers) == count
    return top_suppliers

In [119]:
# Rank battery suppliers by total bill count and keep the default top ten.
# df_print=True also prints the ranked table before the set is displayed.
battery_suppliers = generate_battery_suppliers(index, method_name="bill_count", df_print=True)
battery_suppliers

                                                 indicator_value
supplier_t                                                      
dell global b.v                                             6060
wistron corporation                                         3845
sevt                                                        2540
hp international sarl                                       2298
yiwu yihuang import export compan                            826
sehc                                                         689
samsung electronics vietnam thai nguyen co.,ltd              658
cong ty tnhh dien tu samsung hcmc ce complex                 631
verdant crest technology dmcc                                508
luxshare precision limited                                   437


{'cong ty tnhh dien tu samsung hcmc ce complex',
 'dell global b.v',
 'hp international sarl',
 'luxshare precision limited',
 'samsung electronics vietnam thai nguyen co.,ltd',
 'sehc',
 'sevt',
 'verdant crest technology dmcc',
 'wistron corporation',
 'yiwu yihuang import export compan'}

# BFS on a supplier

In [227]:
def construct_battery_subgraph(suppliers, tiers, as_nx=False, nx_kwargs=None):
    """Collect the battery-related rows reachable from `suppliers`.

    Only rows of the notebook-level `index` frame whose `hs6` is in
    ``BATTERY_PARTS + [BATTERY]`` are considered. `suppliers` form tier 1;
    the buyers of tier ``t - 1`` companies become the tier ``t`` companies,
    up to `tiers`. A company keeps the first (lowest) tier at which it is
    reached.

    suppliers: iterable of `supplier_t` names to start from; a single string
               is treated as one name.
    tiers:     number of tiers to expand, at least 1.
    as_nx, nx_kwargs: currently unused; a DataFrame is always returned.

    Returns a DataFrame with columns supplier_t, buyer_t, hs6, bill_count and
    `tier` (the tier of the row's supplier), sorted by tier then supplier_t.
    The frame is empty, but still has a `tier` column, when none of the
    companies appears as a supplier.

    Raises AssertionError when `tiers` < 1 or when a selected row has a
    missing or empty supplier_t / buyer_t.
    """
    assert tiers >= 1, "tiers must be at least 1"

    # set('name') would split a bare string into single characters.
    if isinstance(suppliers, str):
        suppliers = [suppliers]

    bom_df = index.loc[
        index['hs6'].isin(BATTERY_PARTS + [BATTERY]),
        ['supplier_t', 'buyer_t', 'hs6', 'bill_count'],
    ]

    # Expand tier by tier: `frontier` holds the companies found in the
    # previous tier, `tier_of` maps every company found so far to its tier.
    frontier = set(suppliers)
    tier_of = dict.fromkeys(frontier, 1)
    for t in range(2, tiers + 1):
        buyers = bom_df.loc[bom_df['supplier_t'].isin(frontier), 'buyer_t']
        # Missing buyers compare False against `> 0`, so they are dropped
        # together with empty names.
        frontier = set(buyers[buyers.str.len() > 0]).difference(tier_of)
        if not frontier:
            # Nothing new was reached; deeper tiers cannot add companies.
            break
        tier_of.update(dict.fromkeys(frontier, t))

    subgraph_df = bom_df[bom_df['supplier_t'].isin(list(tier_of))].copy()
    assert (subgraph_df['supplier_t'].str.len() > 0).all(), "row with empty supplier_t"
    assert (subgraph_df['buyer_t'].str.len() > 0).all(), "row with missing or empty buyer_t"

    # Vectorised lookup; unlike a row-wise apply it also yields a proper
    # (empty) column when no rows were selected.
    subgraph_df['tier'] = subgraph_df['supplier_t'].map(tier_of).astype('int64')
    return subgraph_df.sort_values(['tier', 'supplier_t'], ascending=True)

In [232]:
# Row count of the 5-tier subgraph for each top supplier. The set is iterated
# in sorted order so the printed lines come out in the same order on every
# run (iteration order of a set of strings is not stable across kernels).
for bat in sorted(battery_suppliers):
    print(bat, len(construct_battery_subgraph([bat], 5)))
construct_battery_subgraph(['dell global b.v'], 5)

sehc 764
cong ty tnhh dien tu samsung hcmc ce complex 808
wistron corporation 25
dell global b.v 46
hp international sarl 46
verdant crest technology dmcc 30
samsung electronics vietnam thai nguyen co.,ltd 27
luxshare precision limited 274
sevt 174
yiwu yihuang import export compan 2


Unnamed: 0,supplier_t,buyer_t,hs6,bill_count,tier
38190,dell global b.v,bordertrade management inc,850760,13,1
38191,dell global b.v,bridges worldwide pvt ltd,850760,1,1
38192,dell global b.v,dell international services india private limited,850760,5990,1
38193,dell global b.v,fao dell global b.v,850760,23,1
38194,dell global b.v,gie solomon,850760,3,1
38196,dell global b.v,intel technology india private limited,761699,2,1
38199,dell global b.v,intel technology india private limited,850440,1,1
38201,dell global b.v,intel technology india private limited,854442,2,1
38203,dell global b.v,petroleum general distribution services joint ...,850760,1,1
38204,dell global b.v,vietnam post,850760,1,1


# Analysis

In [None]:
# Q1: as tier increases, when does the graph die? (trend: tier vs num_new_row)

In [None]:
# Q2: 'hs6' vs. mean tier, with tier taken as tree depth (TODO). If the BOM is correct, the empirical plot should be consistent with it.

In [None]:
# Q3: compare and contrast different suppliers