In [85]:
import pandas as pd
import json
data_loc = '/Users/Sage/PycharmProjects/Data-Projects/Coffee FlavorsxOrigin/data/coffee_data.json'

def load_data(data_loc, data):
    with open(data_loc, 'r') as infile:
        data = json.load(infile)
            

def save_data(data):
    with open(data_loc, 'w') as outfile:
        json.dump(data, outfile, sort_keys=True, indent=4)

Let's start by doing some light data cleaning leading into some light EDA. 
This will be done by taking a look at the scraped coffee names and inserting country data into the data JSON documents.

In [50]:
with open(data_loc, 'r') as f:
    data = json.load(f)

In [76]:
countries = [x.split()[0] for x in data.keys()]

In [77]:
countries = list(set(countries))
countries

['Java',
 'Flores',
 'Guatemala',
 'Papua',
 'Roasted',
 'Indonesia',
 'Sumatra',
 'El',
 'Espresso',
 'Yemen',
 'Colombia',
 'Brazil',
 'Zambia',
 'Winter',
 'Sulawesi',
 'Timor',
 'Timorindo',
 'India',
 'Congo',
 'Panama',
 'ROASTED',
 'Tanzania',
 'Cameroon',
 'Nicaragua',
 'Burundi',
 'Peru',
 'Sweet',
 'Ethiopia',
 'Rwanda',
 'Costa',
 'Kenya',
 'Mexico',
 'Honduras']

Since this list of distinct first words mostly contains countries (with a few issues), the next step is to clean up this list so we can insert the countries.

In [78]:
countries = [x for x in countries if x not in ('ROASTED', 'Sweet', 'Winter','Roasted')]

In [79]:
countries.append('El Salvador')
countries.remove('El')
countries.append('Costa Rica')
countries.remove('Costa')

In [80]:
print(countries)

['Java', 'Flores', 'Guatemala', 'Papua', 'Indonesia', 'Sumatra', 'Espresso', 'Yemen', 'Colombia', 'Brazil', 'Zambia', 'Sulawesi', 'Timor', 'Timorindo', 'India', 'Congo', 'Panama', 'Tanzania', 'Cameroon', 'Nicaragua', 'Burundi', 'Peru', 'Ethiopia', 'Rwanda', 'Kenya', 'Mexico', 'Honduras', 'El Salvador', 'Costa Rica']


Looks much better. Next step is to insert these into each document where available.

In [81]:
for k, v in data.items():
    country_guess = k.split()[:2]
    
    if country_guess[0] in countries:
        v['origin'] = country_guess[0]
    elif (new_guess := ' '.join(country_guess)) in countries:
        v['origin'] = new_guess
    else:
        v['origin'] = None

In [82]:
origins = []
for k, v in data.items():
    if v['origin'] not in origins:
        origins.append(v['origin'])
print(origins)

['Brazil', 'Burundi', 'Cameroon', 'Colombia', 'Congo', 'Costa Rica', 'El Salvador', 'Espresso', 'Ethiopia', 'Flores', 'Guatemala', 'Honduras', 'India', 'Indonesia', 'Java', 'Kenya', 'Mexico', 'Nicaragua', 'Panama', 'Papua', 'Peru', None, 'Rwanda', 'Sulawesi', 'Sumatra', 'Tanzania', 'Timor', 'Timorindo', 'Yemen', 'Zambia']


Looks good. Let's take a look at those None values and see if there is anything we can do about them at this time...

In [96]:
for k, v in data.items():
    if v['origin'] is None:
        print(k)

Sweet Maria's Ethiopiques Version 2.0
Sweet Maria's Half-Caff Blend
Sweet Maria's Moka Java SWP Decaf Blend
Sweet Maria's Moka Kadir Blend
Sweet Maria's Polar Expresso Holiday Blend


Since we are specifically looking at green coffees, not roasted coffees, let's remove the 5 roasted entires from the dataset. We also already have an Ethiopia Honey Genji entry, so we can delete that as well.

We can leave the Sweet Maria's blend coffees with a None value for the origin, so we only need to manually deal with the Winter Special entry and the espresso workshop entries.

In [89]:
del data['ROASTED COFFEE El Salvador Apaneca Finca Miravalle']
del data['ROASTED COFFEE Zambia Dry Process Kateshi Estate ']
del data['ROASTED ESPRESSO Liquid Amber Blend']
del data['ROASTED ESPRESSO Workshop The Skullet']
del data['Roasted Coffee Subscription']
del data['Winter Special: Ethiopia Honey Genji']
del data['Espresso Workshop #49 Balayage']
del data['Espresso Workshop #50 - The Skullet']

Next, let's create a boolean value for Decaf in case we need it later.

In [87]:
for k, v in data.items():
    if 'Decaf' in k:
        v['decaf'] = True
    else:
        v['decaf'] = False

Finally, let's save the data and start looking into cupping and flavor scores.

In [99]:
save_data(data)