In [None]:
import pandas as pd
import json

In [None]:
import glob
import pickle

In [None]:
#Get list of files from the data directory containing the json files.
#glob returns paths in a file-system-dependent order; sorting makes the run
#reproducible, since later cells keep only the first occurrence of each os_id.
files = sorted(glob.glob('../data/json_files/*'))

In [None]:
def add_basic_data_fields(data, facility, contributor):
    """Add the basic fields that exist for anonymous and non-anonymous contributors.

    Args:
        data: Row dictionary for one (brand, supplier) pair; updated in place.
        facility: One feature from the json export. Its 'properties' and
            'geometry' entries are read.
        contributor: One entry of the facility's contributor list. Only its
            'name' is read here.

    Returns:
        None. The new keys are written into ``data``.
    """
    #Brand name comes from the contributor, everything else from the facility
    basic_fields = {'brand': contributor['name']}

    properties = facility['properties']
    #OS ID, then name / address / country code / country name of the supplier
    for key in ('os_id', 'name', 'address', 'country_code', 'country_name'):
        basic_fields[key] = properties[key]

    #Coordinates are read as [longitude, latitude]
    coordinates = facility['geometry']['coordinates']
    basic_fields['lat'] = coordinates[1]
    basic_fields['lng'] = coordinates[0]

    #Is the supplier now closed?
    basic_fields['is_closed'] = properties['is_closed']

    #Apply everything at once so data is untouched if a lookup above fails
    data.update(basic_fields)

def _contributor_entries(data, facility, field):
    """Yield the entries of one extended field that were submitted by data['contributor_id'].

    A missing 'extended_fields' mapping, or a missing/empty list for ``field``,
    yields nothing: this information is optional.
    """
    extended_fields = facility['properties'].get('extended_fields') or {}
    for entry in extended_fields.get(field) or []:
        if entry['contributor_id'] == data['contributor_id']:
            yield entry


def add_extended_data_fields(data, facility, contributor):
    """Add the optional extended fields submitted by this row's contributor.

    This information can only be provided by non-anonymous contributors.
    Some suppliers have provided it, some haven't, and some have provided
    conflicting information. Only entries whose 'contributor_id' equals
    data['contributor_id'] are used; if a contributor has several entries for
    the same field, the last one in the list wins.

    Args:
        data: Row dictionary for one (brand, supplier) pair; must already hold
            'contributor_id'. Updated in place.
        facility: One feature from the json export; its
            properties -> extended_fields mapping is read.
        contributor: Not used here; kept so the signature matches
            add_basic_data_fields.

    Returns:
        None. Keys are only added to ``data`` for fields the contributor provided.
    """
    #Date when the entry for this supplier was last updated by this brand.
    #Using the date when the brand last updated the supplier's name
    for entry in _contributor_entries(data, facility, 'name'):
        data['contribution_date'] = entry['updated_at'].split('T')[0]

    #Minimum and maximum number of workers
    for entry in _contributor_entries(data, facility, 'number_of_workers'):
        data['number_of_workers_min'] = entry['value']['min']
        data['number_of_workers_max'] = entry['value']['max']

    #Parent company of the supplier
    for entry in _contributor_entries(data, facility, 'parent_company'):
        if 'name' in entry['value']:
            data['parent_company'] = entry['value']['name']

    #Facility type of the supplier (raw values stored as provided)
    for entry in _contributor_entries(data, facility, 'facility_type'):
        if 'raw_values' in entry['value']:
            data['facility_type'] = entry['value']['raw_values']

    #Processing type of the supplier (raw values stored as provided)
    for entry in _contributor_entries(data, facility, 'processing_type'):
        if 'raw_values' in entry['value']:
            data['processing_type'] = entry['value']['raw_values']

    #Product type of the supplier, joined into one comma-separated string
    for entry in _contributor_entries(data, facility, 'product_type'):
        if 'raw_values' in entry['value']:
            data['product_type'] = ', '.join(entry['value']['raw_values'])

In [None]:
#List containing all entries of the dataframe.
#Each row of the dataframe contains data for a unique (brand, supplier) pair.
#This variant keeps anonymous contributors too (with basic fields only).
alldata = []

#List of os_ids. Each os_id corresponds to one supplier.
os_ids = []
#Set mirror of os_ids so the duplicate check does not rescan the whole list
seen_os_ids = set()

#Go through all json files in the directory
for filename in files:

    #Read one json file; the with-block closes the handle even if parsing fails
    with open(filename, 'r', encoding='utf-8') as f:
        manyfacilities = json.load(f)

    #Read one entry in the json file
    for facility in manyfacilities['features']:
        os_id = facility['properties']['os_id']

        # export may contain duplicates: keep only the first occurrence of a supplier
        if os_id in seen_os_ids:
            continue
        seen_os_ids.add(os_id)
        os_ids.append(os_id)

        #List of contributors (brands) which provided data for this supplier (corresponding to this os_id)
        contributor_ids = []

        for contributor in facility['properties']['contributors']:

            #Anonymous contributors without an assigned id: basic fields only
            if 'id' not in contributor:
                data = {}
                add_basic_data_fields(data, facility, contributor)
                alldata.append(data)
                continue

            #Prevent brands being listed multiple times for the same supplier.
            #Skipping here (instead of appending after the branches) avoids
            #re-appending the previous contributor's row for a repeated id.
            if contributor['id'] in contributor_ids:
                continue
            contributor_ids.append(contributor['id'])

            #Non-anonymous contributors which have an assigned id
            data = {
                'contributor_id': contributor['id'], #Not asked for, but useful later in code
            }
            add_basic_data_fields(data, facility, contributor) # Add basic data fields
            add_extended_data_fields(data, facility, contributor) #Add data from extended fields

            #Append entry for this (brand, supplier) pair to the master list
            alldata.append(data)

In [None]:
#List containing all entries of the dataframe.
#Each row of the dataframe contains data for a unique (brand, supplier) pair.
#This variant keeps only non-anonymous contributors.
alldata = []

#List of os_ids. Each os_id corresponds to one supplier.
os_ids = []
#Set mirror of os_ids so the duplicate check does not rescan the whole list
seen_os_ids = set()

#Go through all json files in the directory
for filename in files:

    #Read one json file; the with-block closes the handle even if parsing fails
    with open(filename, 'r', encoding='utf-8') as f:
        manyfacilities = json.load(f)

    #Read one entry in the json file
    for facility in manyfacilities['features']:
        os_id = facility['properties']['os_id']

        # export may contain duplicates: keep only the first occurrence of a supplier
        if os_id in seen_os_ids:
            continue
        seen_os_ids.add(os_id)
        os_ids.append(os_id)

        #List of contributors (brands) which provided data for this supplier (corresponding to this os_id)
        contributor_ids = []

        for contributor in facility['properties']['contributors']:

            #Choose only non-anonymous contributors which have an assigned id number
            if 'id' not in contributor:
                continue

            #Prevent brands being listed multiple times for the same supplier
            if contributor['id'] in contributor_ids:
                continue
            contributor_ids.append(contributor['id'])

            #Seed brand and contributor_id first so the column order of the
            #resulting dataframe stays brand, contributor_id, os_id, ...
            data = {
                'brand': contributor['name'], #Brand name
                'contributor_id': contributor['id'], #Not asked for, but useful later in code
            }

            #Basic supplier fields (os_id, name, address, country, lat/lng, is_closed)
            add_basic_data_fields(data, facility, contributor)

            #Extracting more information from extended fields.
            #This seems to be optional information:
            #Some suppliers have provided this information, some haven't,
            #some have provided conflicting information.
            add_extended_data_fields(data, facility, contributor)

            #These variables in the desired data template don't exist in the json files:
            #'sector': 'Apparel' (?),
            #'contributor_list': facility['properties']['contributors']['list_name'](?),
            #'processing_type_facility_type_raw': facility['properties']['facility_type']['values']['raw_value'] (?),

            #Append entry for this (brand, supplier) pair to the master list
            alldata.append(data)

In [None]:
#Build the dataframe from the collected rows; fields a contributor did not
#provide come out as NaN and are replaced by empty strings
df = pd.DataFrame(alldata).fillna('')
#Order the rows by the 'brand' column
df_sorted = df.sort_values(by='brand')

In [None]:
#Save the brand-sorted dataframe to disk without the index column.
#A tab is used as delimiter because the address field contains commas.
df_sorted.to_csv(path_or_buf='../results/osh.csv', sep='\t', index=False)