# Obtain data from JSON files

In [1]:
import pandas as pd

In [2]:
from process_json import get_data

In [3]:
# --- Input / output locations ---
# Folder holding the raw JSON files to be processed.
data_directory = "../data/json_files"
# Folder that receives the generated result files.
output_directory = "../results"

# --- Base names of the result files ('.csv' is appended when they are read back) ---
facilities_filename = "osh_facilities"      # information about facilities
contributors_filename = "osh_contributors"  # information about contributors

In [None]:
# Collect the facility and contributor data and save both
# as csv files in the output directory.
get_data(
    data_directory,
    output_directory,
    facilities_filename,
    contributors_filename,
)

## Display data about facilities

In [4]:
# Read the tab-separated facilities file and replace missing values with ''.
df = pd.read_csv(
    f"{output_directory}/{facilities_filename}.csv",
    sep="\t",
).fillna("")

In [5]:
# Preview the first five rows of the facilities table.
df.head(n=5)

Unnamed: 0,os_id,facility_name,address,country_code,country_name,lat,lng,is_closed
0,VN20222495PXV5H,HUU THANH HOUSEHOLD,"A5/140B, Hamlet 1, Tan Nhut commune, Binh Chan...",VN,Vietnam,10.738181,106.536036,
1,CN20213347PTKW6,"\t\nRushan HempFortex Industries Co., Ltd.","TOWN WEIHAI CITY, DAGUSHAN, Shandong, 264507",CN,China,36.919551,121.636611,
2,BD20222851G47VJ,\tAl-Karam Towel Industries (Pvt.) Ltd. Unit-II,\t\nپلاٹ#ڈی-18 سائٹ سپر ہائی وے اسکیم #3,BD,Bangladesh,23.684994,90.356331,
3,CN2021252KMAV3E,"\tFUJIAN CHANGYUAN TEXTILE CO., LTD.","Hunan District, Airport Industrial Zone, Hunan...",CN,China,41.714914,123.449714,
4,VN2022293X36514,\tMaxport No 5 - Nam Dinh Branch,"Highway 10, Loc Vuong, Nam Dinh",VN,Vietnam,20.444519,106.159501,


## Display data about contributors

In [6]:
# Read the tab-separated contributors file and replace missing values with ''.
df = pd.read_csv(
    f"{output_directory}/{contributors_filename}.csv",
    sep="\t",
).fillna("")

In [7]:
# Preview the first five rows of the contributors table.
df.head(n=5)

Unnamed: 0,contributor_id,contributor_name,os_id,supplier_name,contribution_date,address,number_of_workers_min,number_of_workers_max,facility_type,processing_type,parent_company,product_type
0,848.0,HONG SHENG SHOES COMPANY LTD,CN2021256J1YK18,HONG SHENG SHOES COMPANY LTD,2022-10-28,"GEHAI INDUSTRIAL ZONE YANBU TOWN, NANHAI DISTR...",,,,,,
1,848.0,HONG SHENG SHOES COMPANY LTD,CN2021256J1YK18,HONG SHENG SHOES COMPANY LTD,2022-10-28,"GEHAI INDUSTRIAL ZONE YANBU TOWN, NANHAI DISTR...",,,,,,
2,2483.0,M.C.K INTERNATIONAL LIMITED,CN2019085FN2EDB,Hangzhou Luke Shoes Co. Ltd.,2022-10-28,"Jingyou Village, Puyang Town,Xiaoshan District...",,,,,,
3,2483.0,M.C.K INTERNATIONAL LIMITED,CN2020009GJW6QS,Wenzhou Aoliwei Shoes Co. Ltd.,2022-02-08,"NO.58,Xingping Road, Puzhong Street, Longwan D...",,,,,,
4,2483.0,M.C.K INTERNATIONAL LIMITED,CN2020148BR342E,Hangzhou Zhongpu Shoes Co. Ltd.,2022-02-08,"Anshan Village,Puyang Town,Xiaoshan District,H...",,,,,,


# Obtain data via API

In [8]:
import json
import os
import urllib.parse

import numpy as np
import pandas as pd
import requests

In [9]:
from process_api import get_data

In [10]:
# Name of the file the downloaded data is stored under.
filename = "summary_os_ids.tsv.gz"
# Folder in which that file is saved.
output_directory = '../results'

In [11]:
# Request settings. These values are taken from the OpenSupplyHub website,
# so check that they are still up to date before running the code.
contributor_types = [
    "Academic / Researcher / Journalist / Student",
    "Auditor / Certification Scheme / Service Provider",
    "Brand / Retailer",
    "Civil Society Organization",
    "Facility / Factory / Manufacturing Group / Supplier / Vendor",
    "Multi-Stakeholder Initiative",
    "Union",
    "Other",
]

# Request parameters; 100 seems to be the largest possible pageSize.
parms = dict(detail="true", format="json", page=1, pageSize=100)

# Base address of the facilities download endpoint.
url = 'https://opensupplyhub.org/api/facilities-downloads/?'

# Session cookie passed to get_data together with the request settings.
# Session ids expire. To obtain a fresh one, log in to OpenSupplyHub and look
# at the cookies of any request in the headers tab of the browser tools.
# Provide the value through the OSH_SESSION_ID environment variable so that a
# live credential never has to be written into (and committed with) the
# notebook. The literal below is the placeholder from the original notebook,
# which is expected to have expired; it is only used when the variable is unset.
cookies = {
    'sessionid': os.environ.get('OSH_SESSION_ID',
                                'dxx8c5e1i844rcl4x0yuiz79hd4lvrv5'),
}

In [None]:
# Download the data via the API for the configured contributor types.
# The target folder must be `output_directory`: the display cell below reads
# the result from output_directory + '/' + filename, and no `data_folder`
# variable is defined anywhere in this notebook (it raised NameError on a
# fresh kernel).
get_data(contributor_types, parms, url, cookies, output_directory, filename)

## Display data

In [14]:
# Read the tab-separated summary file and replace missing values with ''.
df = pd.read_csv(
    f"{output_directory}/{filename}",
    sep="\t",
    low_memory=False,
).fillna("")

In [15]:
# Preview the first five rows of the downloaded summary.
df.head(n=5)

Unnamed: 0,os_id,contribution_date,name,address,country_code,country_name,lat,lng,sector,contributor (list),number_of_workers,parent_company,processing_type_facility_type_raw,facility_type,processing_type,product_type,is_closed,contributor_type
0,BD2020212VEDNJC,2020-07-30,2T's Creation,"Plot 1241 (3rd Floor), Begum Rokeya Sarani, Ea...",BD,Bangladesh,23.8006254,90.371022,Apparel,PPE: Mapped in Bangladesh (PPE: Mapped in Bang...,,,,,,,False,Academic / Researcher / Journalist / Student
1,BD2020212VEDNJC,2022-05-16,,,,,,,Apparel,BRAC University (Mapped in Bangladesh: Export ...,,,,,,,,Academic / Researcher / Journalist / Student
2,BD2020212VEDNJC,2022-05-16,,,,,,,Apparel,An Academic / Researcher / Journalist / Studen...,,,,,,,,Academic / Researcher / Journalist / Student
3,BD2020212VEDNJC,2021-11-29,,,,,,,Apparel,BRAC University (API),,,,,,,,Academic / Researcher / Journalist / Student
4,BD2019248GNVQ6X,2019-09-05,3-A Fashions Ltd.,"Madrasha Road, Khejur Bagan, Ashulia, Savar, D...",BD,Bangladesh,23.8909633,90.329906,Apparel,BRAC University (Mapped in Bangladesh: Export-...,,,,,,,False,Academic / Researcher / Journalist / Student
