In [6]:
import json
import os
import urllib.parse

import numpy as np
import pandas as pd
import requests

In [7]:
# Folder the exported file is written to; the save cell at the end of the
# notebook places "summary_os_ids.tsv.gz" inside it.
data_folder = "../"

In [8]:
# Query configuration for the OpenSupplyHub facilities download endpoint.
# The contributor types come from the website; make sure that they are up to date
# before running the code.
contributor_types = ['Academic / Researcher / Journalist / Student', 
                    'Auditor / Certification Scheme / Service Provider', 
                    'Brand / Retailer', 
                    'Civil Society Organization', 
                    'Facility / Factory / Manufacturing Group / Supplier / Vendor', 
                    'Multi-Stakeholder Initiative', 
                    'Union', 
                    'Other']

# Template of query-string parameters; the download loop adds the contributor type.
parms = {"detail": "true",
        "format": "json",
        "page": 1,
        "pageSize": 100} #100 seems to be the largest possible pageSize
        
url = "https://opensupplyhub.org/api/facilities-downloads/?"

# Session cookie used to authenticate the requests.
# The session ID is a credential and expires, so it is read from the environment
# instead of being stored in the notebook. Find your own session ID by logging in
# to OpenSupplyHub and looking at the cookies of any request in the headers tab,
# then export it as OPENSUPPLYHUB_SESSIONID before starting the kernel.
cookies = {
    'sessionid': os.environ.get("OPENSUPPLYHUB_SESSIONID", ""),
}

In [4]:
def parse_request(r, cont_type):
    """
    Parses a JSON response from the OpenSupplyHub API into a Pandas DataFrame.

    Parameters
    ----------
    r : requests.Response
        A response object returned by a HTTP request. Only its ``ok``, ``text``,
        ``status_code`` and ``url`` attributes are used.
    cont_type : str
        A string representing the type of contributor; stored in every row of
        the result under the "contributor_type" column.

    Returns
    -------
    tuple
        - A Pandas DataFrame containing the parsed data.
        - The value of the "next" field of the response (the next page URL).

    Raises
    ------
    RuntimeError
        If the HTTP request was not successful (``r.ok`` is false).
    KeyError
        If the JSON payload lacks the "results", "headers", "rows" or "next" fields.
    json.JSONDecodeError
        If the response body is not valid JSON.

    Notes
    -----
    This function assumes that the JSON response contains a "results" object with
    "headers" and "rows" sub-objects. Empty strings in the DataFrame are replaced
    with NaN values, and any columns that are entirely composed of NaN values are
    dropped before the "contributor_type" column is added.
    """
    # Fail fast with a real exception. Raising a bare string is itself a TypeError
    # in Python 3 and would hide the actual problem.
    if not r.ok:
        raise RuntimeError(f"Problem with request: HTTP {r.status_code} for {r.url}")

    response = json.loads(r.text)
    headers = response["results"]["headers"]
    data = response["results"]["rows"]

    df = pd.DataFrame(data, columns=headers).replace("", np.nan).dropna(how="all", axis=1)
    df["contributor_type"] = cont_type
    return df, response["next"]
        


In [9]:
# Get all data: page through the download endpoint once per contributor type.

# (connect, read) timeouts in seconds, so a stalled connection raises instead of
# hanging the notebook indefinitely.
request_timeout = (10, 120)

df_all = []
for cont_type in contributor_types:
    print(f"\nStarting {cont_type}")
    # Build the first-page query on a copy so the shared `parms` template stays untouched.
    query = {**parms, "contributor_types": cont_type, "page": 1}
    url_form = url + urllib.parse.urlencode(query)
    i = 0
    # parse_request returns the response's "next" field; the loop stops when it is None
    # (presumably on the last page - confirm against the API).
    while url_form is not None:
        # track progress: one counter per downloaded page
        print(i, end=":")
        i += 1
        # get json
        r = requests.get(url_form, cookies=cookies, timeout=request_timeout)
        # parse json; a distinct name avoids shadowing the combined `df` built below
        df_page, url_form = parse_request(r, cont_type)
        df_all.append(df_page)

df = pd.concat(df_all)
# Only rows identical in every column (including contributor_type) are removed here.
df = df.drop_duplicates()
df.head()


Starting Other
0:1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16:17:18:19:20:21:22:23:24:25:26:27:28:29:30:31:32:33:34:35:36:37:38:39:40:41:42:43:44:45:46:47:48:49:50:51:52:53:54:55:56:57:58:59:60:61:62:63:64:65:66:67:68:69:70:71:72:73:74:75:76:77:78:79:80:81:82:83:84:85:86:87:88:89:90:91:92:93:94:95:96:97:98:99:100:101:102:103:104:105:106:107:108:109:110:111:112:113:114:115:116:117:118:119:120:121:122:123:124:125:126:127:128:129:130:131:132:133:134:135:136:137:138:139:140:141:142:143:144:145:146:147:148:149:150:151:152:153:154:155:156:157:158:159:160:161:162:163:164:165:166:167:168:169:170:171:172:173:174:175:176:177:178:179:180:181:182:183:184:185:186:187:188:189:190:191:192:193:194:195:196:197:198:199:200:201:202:203:204:205:206:207:208:209:210:211:212:213:214:215:216:217:218:219:220:221:222:223:224:225:226:227:228:229:230:231:232:233:234:235:236:237:238:239:240:241:242:243:244:245:246:247:248:249:250:251:252:253:254:255:256:257:258:259:260:261:262:263:264:265:266:267:268:269:270:271:272:27

Unnamed: 0,os_id,contribution_date,name,address,country_code,country_name,lat,lng,sector,contributor (list),number_of_workers,parent_company,processing_type_facility_type_raw,facility_type,processing_type,product_type,is_closed,contributor_type
0,BD2020212VEDNJC,2020-07-30,2T's Creation,"Plot 1241 (3rd Floor), Begum Rokeya Sarani, Ea...",BD,Bangladesh,23.800625,90.371022,Apparel,PPE: Mapped in Bangladesh (PPE: Mapped in Bang...,,,,,,,False,Academic / Researcher / Journalist / Student
1,BD2020212VEDNJC,2022-05-16,,,,,,,Apparel,BRAC University (Mapped in Bangladesh: Export ...,,,,,,,,Academic / Researcher / Journalist / Student
2,BD2020212VEDNJC,2022-05-16,,,,,,,Apparel,An Academic / Researcher / Journalist / Studen...,,,,,,,,Academic / Researcher / Journalist / Student
3,BD2020212VEDNJC,2022-05-16,,,,,,,Apparel,An Academic / Researcher / Journalist / Studen...,,,,,,,,Academic / Researcher / Journalist / Student
4,BD2020212VEDNJC,2021-11-29,,,,,,,Apparel,BRAC University (API),,,,,,,,Academic / Researcher / Journalist / Student


In [17]:
# Sanity check: total number of rows versus number of distinct facility IDs
n_rows = df.shape[0]
n_ids = df["os_id"].unique().size
print(n_rows, n_ids)

# Inspect every row recorded for one example facility
df[df["os_id"].eq("ET2021328W50W5D")]

982522 137757


Unnamed: 0,os_id,contribution_date,name,address,country_code,country_name,lat,lng,sector,contributor (list),number_of_workers,parent_company,processing_type_facility_type_raw,facility_type,processing_type,product_type,is_closed,contributor_type
483,ET2021328W50W5D,2021-11-24,Best International Garments PLC,"Shed #38 & 41, Hawassa Industrial Park, Hawass...",ET,Ethiopia,7.050374,38.495504,Apparel,A Brand / Retailer (List),,,,,,,False,Multi-Stakeholder Initiative
484,ET2021328W50W5D,2022-12-05,,,,,,,Apparel,A Brand / Retailer (List),3628.0,MARQUIS IMPEX PTE LTD,,,,,,Multi-Stakeholder Initiative
485,ET2021328W50W5D,2022-10-21,,,,,,,Unspecified,Fair Factories Clearinghouse (FFC Factory List...,,,,,,,,Multi-Stakeholder Initiative
486,ET2021328W50W5D,2022-08-02,,,,,,,Apparel,A Brand / Retailer (List),,,,,,,,Multi-Stakeholder Initiative
487,ET2021328W50W5D,2021-12-30,,,,,,,Apparel,A Brand / Retailer (List),,,,,,,,Multi-Stakeholder Initiative


In [18]:
# Rows whose (os_id, contribution_date) pair occurs more than once, grouped together
(
    df[df.duplicated(subset=["os_id", "contribution_date"], keep=False)]
    .sort_values(["os_id", "contribution_date"])
    .head(10)
)

Unnamed: 0,os_id,contribution_date,name,address,country_code,country_name,lat,lng,sector,contributor (list),number_of_workers,parent_company,processing_type_facility_type_raw,facility_type,processing_type,product_type,is_closed,contributor_type
590,AE2019085TSEJJ4,2019-03-26,MILLENNIUM FASHIONS IND,"C2-7, Saif Zone, Sharjah, 8250",AE,United Arab Emirates,25.332358,55.483166,Apparel,A Brand / Retailer (List),,,,,,,False,Brand / Retailer
373,AE2019085TSEJJ4,2019-03-26,MILLENNIUM FASHIONS IND,"C2-7, Saif Zone, Sharjah, 8250",AE,United Arab Emirates,25.332358,55.483166,Apparel,A Brand / Retailer (List),,,,,,,False,Civil Society Organization
598,AE2019085TSEJJ4,2020-11-03,,,,,,,Apparel,A Brand / Retailer (List),,,,,,,,Brand / Retailer
381,AE2019085TSEJJ4,2020-11-03,,,,,,,Apparel,A Brand / Retailer (List),,,,,,,,Civil Society Organization
597,AE2019085TSEJJ4,2021-04-27,,,,,,,Apparel,A Brand / Retailer (List),,,,,,,,Brand / Retailer
380,AE2019085TSEJJ4,2021-04-27,,,,,,,Apparel,A Brand / Retailer (List),,,,,,,,Civil Society Organization
596,AE2019085TSEJJ4,2021-11-10,,,,,,,Apparel,A Brand / Retailer (List),,,,,,,,Brand / Retailer
379,AE2019085TSEJJ4,2021-11-10,,,,,,,Apparel,A Brand / Retailer (List),,,,,,,,Civil Society Organization
595,AE2019085TSEJJ4,2022-03-22,,,,,,,Apparel,The WikiRate Project e.V. (API),98.0,,,,,,,Brand / Retailer
378,AE2019085TSEJJ4,2022-03-22,,,,,,,Apparel,The WikiRate Project e.V. (API),98.0,,,,,,,Civil Society Organization


In [20]:
# save data as a gzip-compressed, tab-separated file without the index column.
# os.path.join copes with a trailing slash in data_folder and with an empty folder
# (plain concatenation would then point at the filesystem root).
output_path = os.path.join(data_folder, "summary_os_ids.tsv.gz")
df.to_csv(output_path, compression="gzip", sep="\t", index=False)