In [None]:
!pip install openfoodfacts

In [10]:
import json
import os
import sys
from collections import Counter
from urllib.parse import quote

import boto3
import openfoodfacts
import pandas as pd
import psycopg2

## Extract

In [4]:
# Fetch each product from Open Food Facts and collect the selected fields
# into one DataFrame (one row per product).
ids = [737628064502,3017620422003,5449000131805,3175680011534,8000500310427,3228857000166,3229820782560,5410188031072,5010477348630,3068320114453,3088543506255,3033490506629,7622210476104,5000112611878,3228021170022,5411188119098,3073781115345,3252210390014,20724696,8076809513753,87157239,7622300441937,5053990156009,20916435]
ids = [str(x) for x in ids]

# Product fields to keep; they double as the DataFrame column names.
# NOTE(review): the "Allergens" column is empty in the sample output below -
# the API field is presumably lowercase ("allergens"); confirm before relying on it.
categories = ["id", "categories_hierarchy", "generic_name", "nutriscore_score", "quantity", "origins", "Allergens"]

rows = []
missing_ids = []
for product_id in ids:
    response = openfoodfacts.products.get_product(product_id)
    product = response.get("product")
    if not product:
        # No "product" payload in the response: skip it instead of failing
        # with AttributeError on `None.get`.
        missing_ids.append(product_id)
        continue
    rows.append([product.get(field) for field in categories])

if missing_ids:
    print(f"Skipped {len(missing_ids)} product(s) without data: {missing_ids}")

# Build the frame once instead of appending row by row.
df = pd.DataFrame(rows, columns=categories)

df.head()

Unnamed: 0,id,categories_hierarchy,generic_name,nutriscore_score,quantity,origins,Allergens
0,737628064502,"[en:plant-based-foods-and-beverages, en:plant-...",Rice Noodles,4,155 g,Thailand,
1,3017620422003,"[en:Petit-déjeuners, en:Produits à tartiner, e...",,26,400 g,,
2,5449000131805,"[en:beverages, en:carbonated-drinks, en:artifi...","Boisson rafraîchissante aux extraits végétaux,...",1,330 ml,,
3,3175680011534,"[en:snacks, en:breakfasts, en:sweet-snacks, en...",,9,230 g,"France,European Union,Non European Union",
4,8000500310427,"[en:snacks, en:sweet-snacks, en:biscuits-and-c...",,25,304g,,


In [5]:
# Count how often each category occurs across all products and store the
# result as a two-column frame (category, amount).
category_counter = Counter(df['categories_hierarchy'].explode())
category_counter_df = (
    pd.DataFrame.from_dict(category_counter, orient='index')
    .reset_index()
    .rename(columns={"index": "category", 0: "amount"})
)
category_counter_df.head()

Unnamed: 0,category,amount
0,en:plant-based-foods-and-beverages,8
1,en:plant-based-foods,7
2,en:cereals-and-potatoes,4
3,en:cereals-and-their-products,3
4,en:pastas,1


In [6]:
# Minimum and maximum nutriscore per category (one row per category).
aggr_df = (
    df.explode("categories_hierarchy")
    .groupby("categories_hierarchy")["nutriscore_score"]
    .agg(['min', 'max'])
    .reset_index()
)

# Attach the per-category counts, then switch to the column names used in the
# exported CSV / database table.
result_df = (
    aggr_df
    .rename(columns={"categories_hierarchy": "category", "min": "minimum", "max": "maximum"})
    .merge(category_counter_df, on="category")
    .rename(columns={"category": "generic_name", "minimum": "min_nutr_score", "maximum": "max_nutr_score"})
)
result_df.head()

Unnamed: 0,generic_name,min_nutr_score,max_nutr_score,amount
0,de:Kekse mit Nuss-Nugat-Creme-Füllung,25,25,1
1,en:Biscuit-snack-with-chocolate-filling,9,9,1
2,en:Groceries,13,18,2
3,en:Mandeln,-3,-3,1
4,en:Nüsse,-3,-3,1


In [21]:
# Write the aggregated table to disk; create the output directory first so the
# cell also works on a fresh checkout where data/ does not exist yet.
os.makedirs("data", exist_ok=True)
result_df.to_csv("data/food_data.csv", index=False)

## Load into Postgres

In [8]:
def get_secret(secret_name, region_name="us-east-1"):
    """Fetch a secret from AWS Secrets Manager and return it JSON-decoded.

    Args:
        secret_name: Identifier passed as ``SecretId`` to ``get_secret_value``.
        region_name: AWS region of the Secrets Manager client.

    Returns:
        The parsed JSON content of the response's ``SecretString`` field.
    """
    secrets_client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )
    response = secrets_client.get_secret_value(SecretId=secret_name)
    return json.loads(response['SecretString'])

In [9]:
# Load the database credentials from the "wysde" secret.
creds = get_secret("wysde")
USERNAME = creds["RDS_POSTGRES_USERNAME"]
PASSWORD = creds["RDS_POSTGRES_PASSWORD"]
HOST = creds["RDS_POSTGRES_HOST"]
DATABASE = 'sparsh'

# Percent-encode user name and password so reserved URI characters in a
# credential (e.g. "@", ":", "/", "%") cannot corrupt the connection URI.
# Values made only of letters, digits and "_.-~" are left unchanged.
conn_str = 'postgresql://{0}:{1}@{2}/{3}'.format(
    quote(str(USERNAME), safe=""),
    quote(str(PASSWORD), safe=""),
    HOST,
    DATABASE,
)

In [23]:
# Name of the target table; later cells also use it as the stem of the CSV file under data/.
TABLE_NAME = "food_data"

In [26]:
def connect(conn_str):
    """Open a psycopg2 connection to the PostgreSQL server.

    Progress messages are printed. If anything goes wrong while connecting,
    the error is printed and the process exits with status 1.

    Args:
        conn_str: Connection string handed to ``psycopg2.connect``.

    Returns:
        The open connection object.
    """
    try:
        print('Connecting to the PostgreSQL database...')
        connection = psycopg2.connect(conn_str)
    except Exception as error:  # also covers psycopg2.DatabaseError
        print(error)
        sys.exit(1)
    else:
        print("Connection successful")
        return connection

In [41]:
# Open the database connection that the following cells reuse.
conn = connect(conn_str)

Connecting to the PostgreSQL database...
Connection successful


In [24]:
# Reload the aggregated table from the CSV under data/.
# Note: this rebinds `df`, which earlier held the raw product frame.
df = pd.read_csv(f"./data/{TABLE_NAME}.csv")
df

Unnamed: 0,generic_name,min_nutr_score,max_nutr_score,amount
0,de:Kekse mit Nuss-Nugat-Creme-Füllung,25,25,1
1,en:Biscuit-snack-with-chocolate-filling,9,9,1
2,en:Groceries,13,18,2
3,en:Mandeln,-3,-3,1
4,en:Nüsse,-3,-3,1
...,...,...,...,...
124,en:wholemeal-breads,-2,-2,1
125,en:wholemeal-sliced-breads,-2,-2,1
126,fr:Alimentaire,20,20,1
127,fr:fromages-blancs,-2,-2,1


In [35]:
# Build the CREATE TABLE statement for the frame's columns; it is only
# generated and printed here, and executed in the next cell.
ddl_query = pd.io.sql.get_schema(df, name=TABLE_NAME, con=conn_str)
print(ddl_query)


CREATE TABLE food_data (
	generic_name TEXT, 
	min_nutr_score BIGINT, 
	max_nutr_score BIGINT, 
	amount BIGINT
)




In [37]:
# Create the table. The cursor context manager only closes the cursor, so the
# statement is wrapped in `with conn`, which commits on success and rolls back
# on error - the table no longer depends on a commit issued by a later cell.
with conn:
    with conn.cursor() as curs:
        curs.execute(ddl_query)

In [15]:
def write_data_to_db(connection, table, filename):
    """Bulk-load a CSV file with a header row into an existing table.

    The file is streamed with ``COPY ... FROM STDIN`` in CSV format, so the
    server applies CSV parsing rules: quoted fields may contain commas, and
    unquoted empty fields are loaded as NULL. The header line is skipped by
    the ``HEADER`` option.

    Args:
        connection: Open psycopg2 connection. It is committed when the load
            succeeds and rolled back when it fails.
        table: Name of the target table. It is quoted as a single identifier
            (embedded double quotes are escaped), so it is not interpreted as
            ``schema.table``.
        filename: Path of the CSV file; it is read as UTF-8.

    Raises:
        Exception: Any error from opening the file or from the COPY is
            re-raised after the transaction has been rolled back.
    """
    quoted_table = '"' + table.replace('"', '""') + '"'
    copy_sql = f"COPY {quoted_table} FROM STDIN WITH (FORMAT csv, HEADER true)"

    try:
        # Explicit encoding: the data contains non-ASCII category names and
        # the platform default encoding is not guaranteed to be UTF-8.
        with open(filename, 'r', encoding='utf-8') as f:
            with connection.cursor() as cursor:
                cursor.copy_expert(copy_sql, f)
        connection.commit()
    except Exception:
        # Leave the connection usable instead of stuck in an aborted transaction.
        connection.rollback()
        raise

In [38]:
# Load the CSV under data/ into the table of the same name.
write_data_to_db(conn, table=TABLE_NAME, filename=f"data/{TABLE_NAME}.csv")

In [46]:
# Sanity check: fetch and print up to 10 rows from the target table.
with conn.cursor() as curs:
    curs.execute(f"SELECT * FROM {TABLE_NAME} LIMIT 10")
    print(curs.fetchall())

[('de:Kekse mit Nuss-Nugat-Creme-Füllung', 25, 25, 1), ('en:Biscuit-snack-with-chocolate-filling', 9, 9, 1), ('en:Groceries', 13, 18, 2), ('en:Mandeln', -3, -3, 1), ('en:Nüsse', -3, -3, 1), ('en:Nüsse und Nussprodukte', -3, -3, 1), ('en:Petit-déjeuners', 26, 26, 1), ('en:Pflanzliche Lebensmittel', -3, -3, 1), ('en:Pflanzliche Lebensmittel und Getränke', -3, -3, 1), ('en:Produits à tartiner', 26, 26, 1)]
