In [107]:
# Import requests package:
import requests

In [108]:
# Set URL as url:
url = 'https://www.ec.europa.eu/agrifood/api/poultry/egg/prices?'

In [109]:
# Create parameter dictionary:
parameters = {
    'beginDate': '01/01/2021',
    'endDate': '22/01/2024'}

In [110]:
# Send get request and save in variable r:
r = requests.get(url, parameters)

# Print result:
print(r)

<Response [200]>


In [111]:
# Print headers:
r.headers

{'Date': 'Thu, 15 Feb 2024 09:41:54 GMT', 'Content-Type': 'application/json', 'Server': 'Europa', 'Connection': 'close', 'Content-Encoding': 'gzip'}

In [112]:
# Print cookies:
r.cookies

<RequestsCookieJar[]>

In [113]:
# Print encoding:
r.encoding

'utf-8'

In [114]:
# Apply JSON decoder and save in prices_eggs_prelim:
prices_eggs_prelim = r.json()

# Print prices_eggs_prelim:
prices_eggs_prelim

[{'beginDate': '15/01/2024',
  'endDate': '21/01/2024',
  'price': '€266.79',
  'unit': '€/100Kg',
  'farmingMethod': 'Barn',
  'marketingYear': 2024,
  'memberStateCode': 'AT',
  'memberStateName': 'Austria'},
 {'beginDate': '15/01/2024',
  'endDate': '21/01/2024',
  'price': '€334.49',
  'unit': '€/100Kg',
  'farmingMethod': 'Free range',
  'marketingYear': 2024,
  'memberStateCode': 'AT',
  'memberStateName': 'Austria'},
 {'beginDate': '15/01/2024',
  'endDate': '21/01/2024',
  'price': '€515.04',
  'unit': '€/100Kg',
  'farmingMethod': 'Organic',
  'marketingYear': 2024,
  'memberStateCode': 'AT',
  'memberStateName': 'Austria'},
 {'beginDate': '15/01/2024',
  'endDate': '21/01/2024',
  'price': '€292.47',
  'unit': '€/100Kg',
  'farmingMethod': 'Free range',
  'marketingYear': 2024,
  'memberStateCode': 'BE',
  'memberStateName': 'Belgium'},
 {'beginDate': '15/01/2024',
  'endDate': '21/01/2024',
  'price': '€255.78',
  'unit': '€/100Kg',
  'farmingMethod': 'Barn',
  'marketingYea

In [115]:
# Print type of prices_eggs_prelim:
type(prices_eggs_prelim)

list

In [116]:
# Import pandas package:
import pandas as pd

# Transform list to dataframe:
prices_eggs = pd.DataFrame(prices_eggs_prelim)

In [117]:
# Show dataframe:
prices_eggs

Unnamed: 0,beginDate,endDate,price,unit,farmingMethod,marketingYear,memberStateCode,memberStateName
0,15/01/2024,21/01/2024,€266.79,€/100Kg,Barn,2024,AT,Austria
1,15/01/2024,21/01/2024,€334.49,€/100Kg,Free range,2024,AT,Austria
2,15/01/2024,21/01/2024,€515.04,€/100Kg,Organic,2024,AT,Austria
3,15/01/2024,21/01/2024,€292.47,€/100Kg,Free range,2024,BE,Belgium
4,15/01/2024,21/01/2024,€255.78,€/100Kg,Barn,2024,BE,Belgium
...,...,...,...,...,...,...,...,...
11757,04/01/2021,10/01/2021,€241.38,€/100Kg,Organic,2021,SI,Slovenia
11758,04/01/2021,10/01/2021,€88.64,€/100Kg,Cage,2021,ES,Spain
11759,04/01/2021,10/01/2021,€310.10,€/100Kg,Organic,2021,SE,Sweden
11760,04/01/2021,10/01/2021,€218.24,€/100Kg,Free range,2021,SE,Sweden


In [118]:
# Show dataframe info:
prices_eggs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11762 entries, 0 to 11761
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   beginDate        11762 non-null  object
 1   endDate          11762 non-null  object
 2   price            11762 non-null  object
 3   unit             11762 non-null  object
 4   farmingMethod    11762 non-null  object
 5   marketingYear    11762 non-null  int64 
 6   memberStateCode  11762 non-null  object
 7   memberStateName  11762 non-null  object
dtypes: int64(1), object(7)
memory usage: 735.2+ KB


In [119]:
# It appears that there are no null values.
# However, it is possible that there are null values that are not specified as such (and that are instead indicated with
# certain characters such as "." or similar).
# In order to check this, we first list the unique values of the variables that have only a limited number of values:

In [120]:
prices_eggs.unit.unique()

array(['€/100Kg'], dtype=object)

In [121]:
prices_eggs.farmingMethod.unique()

array(['Barn', 'Free range', 'Organic', 'Cage'], dtype=object)

In [122]:
prices_eggs.marketingYear.unique()

array([2024, 2023, 2022, 2021])

In [123]:
prices_eggs.memberStateCode.unique()

array(['AT', 'BE', 'BG', 'HR', 'CY', 'CZ', 'EE', 'FI', 'FR', 'DE', 'EL',
       'HU', 'IE', 'IT', 'LV', 'LT', 'MT', 'NL', 'PL', 'PT', 'RO', 'SK',
       'SI', 'ES', 'SE', 'DK'], dtype=object)

In [124]:
prices_eggs.memberStateName.unique()

array(['Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czechia',
       'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary',
       'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Malta', 'Netherlands',
       'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain',
       'Sweden', 'Denmark'], dtype=object)

In [125]:
# For beginDate, endDate, and price, we sort the data by the respective variable and show the first and last
# cases in order to find out whether there are null values that are not specified as such:

In [126]:
prices_eggs[['beginDate']].sort_values(by='beginDate',ascending=True).head()

Unnamed: 0,beginDate
188,01/01/2024
210,01/01/2024
209,01/01/2024
208,01/01/2024
207,01/01/2024


In [127]:
prices_eggs[['beginDate']].sort_values(by='beginDate',ascending=True).tail()

Unnamed: 0,beginDate
4971,31/10/2022
4970,31/10/2022
4969,31/10/2022
4938,31/10/2022
4917,31/10/2022


In [128]:
prices_eggs[['endDate']].sort_values(by='endDate',ascending=True).head()

Unnamed: 0,endDate
4353,01/01/2023
4361,01/01/2023
4362,01/01/2023
4363,01/01/2023
4364,01/01/2023


In [129]:
prices_eggs[['endDate']].sort_values(by='endDate',ascending=True).tail()

Unnamed: 0,endDate
284,31/12/2023
285,31/12/2023
286,31/12/2023
279,31/12/2023
231,31/12/2023


In [130]:
prices_eggs[['price']].sort_values(by='price',ascending=True).head()

Unnamed: 0,price
9647,€100.23
10826,€100.24
11536,€100.25
9281,€100.43
9751,€100.53


In [131]:
prices_eggs[['price']].sort_values(by='price',ascending=True).tail()

Unnamed: 0,price
9893,€99.22
10103,€99.31
10672,€99.36
10208,€99.55
9935,€99.58


In [132]:
# We conclude that there are no null values in the dataframe.

In [133]:
# Show variables:
prices_eggs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11762 entries, 0 to 11761
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   beginDate        11762 non-null  object
 1   endDate          11762 non-null  object
 2   price            11762 non-null  object
 3   unit             11762 non-null  object
 4   farmingMethod    11762 non-null  object
 5   marketingYear    11762 non-null  int64 
 6   memberStateCode  11762 non-null  object
 7   memberStateName  11762 non-null  object
dtypes: int64(1), object(7)
memory usage: 735.2+ KB


In [134]:
# We do not need the marketingYear variable, especially as we do not know its exact relationship with beginDate and endDate:
prices_eggs = prices_eggs.drop(['marketingYear'], axis=1)

# We do not need the unit variable, as we know that prices are measured in €/100kg:
prices_eggs = prices_eggs.drop(['unit'], axis=1)

# Rename variable names to standard format and to names we can easily work with:
prices_eggs = prices_eggs.rename(columns={'beginDate': 'begin_date'})
prices_eggs = prices_eggs.rename(columns={'endDate': 'end_date'})
prices_eggs = prices_eggs.rename(columns={'farmingMethod': 'farming_method'})
prices_eggs = prices_eggs.rename(columns={'memberStateCode': 'country_code'})
prices_eggs = prices_eggs.rename(columns={'memberStateName': 'country'})

# Show variables again:
prices_eggs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11762 entries, 0 to 11761
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   begin_date      11762 non-null  object
 1   end_date        11762 non-null  object
 2   price           11762 non-null  object
 3   farming_method  11762 non-null  object
 4   country_code    11762 non-null  object
 5   country         11762 non-null  object
dtypes: object(6)
memory usage: 551.5+ KB


In [135]:
# Remove the € symbol in the price values:
prices_eggs['price'] = prices_eggs['price'].str.replace('€', '')

# Check results:
prices_eggs

Unnamed: 0,begin_date,end_date,price,farming_method,country_code,country
0,15/01/2024,21/01/2024,266.79,Barn,AT,Austria
1,15/01/2024,21/01/2024,334.49,Free range,AT,Austria
2,15/01/2024,21/01/2024,515.04,Organic,AT,Austria
3,15/01/2024,21/01/2024,292.47,Free range,BE,Belgium
4,15/01/2024,21/01/2024,255.78,Barn,BE,Belgium
...,...,...,...,...,...,...
11757,04/01/2021,10/01/2021,241.38,Organic,SI,Slovenia
11758,04/01/2021,10/01/2021,88.64,Cage,ES,Spain
11759,04/01/2021,10/01/2021,310.10,Organic,SE,Sweden
11760,04/01/2021,10/01/2021,218.24,Free range,SE,Sweden


In [136]:
# Transform begin_date and end_date to datetime format:
prices_eggs['begin_date'] = pd.to_datetime(prices_eggs['begin_date'], format='%d/%m/%Y')
prices_eggs['end_date'] = pd.to_datetime(prices_eggs['end_date'], format='%d/%m/%Y')

# Transform price to float:
prices_eggs['price'] = prices_eggs['price'].astype(float)

# Check results:
prices_eggs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11762 entries, 0 to 11761
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   begin_date      11762 non-null  datetime64[ns]
 1   end_date        11762 non-null  datetime64[ns]
 2   price           11762 non-null  float64       
 3   farming_method  11762 non-null  object        
 4   country_code    11762 non-null  object        
 5   country         11762 non-null  object        
dtypes: datetime64[ns](2), float64(1), object(3)
memory usage: 551.5+ KB


In [137]:
# Transform prices from the the unit €/100kg to the unit €/kg:
prices_eggs['price'] = prices_eggs['price'] / 100

# Check results:
prices_eggs

Unnamed: 0,begin_date,end_date,price,farming_method,country_code,country
0,2024-01-15,2024-01-21,2.6679,Barn,AT,Austria
1,2024-01-15,2024-01-21,3.3449,Free range,AT,Austria
2,2024-01-15,2024-01-21,5.1504,Organic,AT,Austria
3,2024-01-15,2024-01-21,2.9247,Free range,BE,Belgium
4,2024-01-15,2024-01-21,2.5578,Barn,BE,Belgium
...,...,...,...,...,...,...
11757,2021-01-04,2021-01-10,2.4138,Organic,SI,Slovenia
11758,2021-01-04,2021-01-10,0.8864,Cage,ES,Spain
11759,2021-01-04,2021-01-10,3.1010,Organic,SE,Sweden
11760,2021-01-04,2021-01-10,2.1824,Free range,SE,Sweden


In [138]:
# Now we calculate price premia (i.e., relative price differences) for organic vs. conventional eggs.

In [139]:
# Show the farming methods:
prices_eggs.farming_method.unique()

array(['Barn', 'Free range', 'Organic', 'Cage'], dtype=object)

In [140]:
# Generate dichotomous variable for product type (organic vs. conventional)

# Define function that assigns product type:
def get_product_type(farming_method):
    if farming_method == 'Organic':
        return 'Organic'
    else:
        return 'Conventional'
    
# Generate product_type variable, based on original variable on farming method:
prices_eggs['product_type'] = prices_eggs['farming_method'].apply(get_product_type)

# Check results:
prices_eggs

Unnamed: 0,begin_date,end_date,price,farming_method,country_code,country,product_type
0,2024-01-15,2024-01-21,2.6679,Barn,AT,Austria,Conventional
1,2024-01-15,2024-01-21,3.3449,Free range,AT,Austria,Conventional
2,2024-01-15,2024-01-21,5.1504,Organic,AT,Austria,Organic
3,2024-01-15,2024-01-21,2.9247,Free range,BE,Belgium,Conventional
4,2024-01-15,2024-01-21,2.5578,Barn,BE,Belgium,Conventional
...,...,...,...,...,...,...,...
11757,2021-01-04,2021-01-10,2.4138,Organic,SI,Slovenia,Organic
11758,2021-01-04,2021-01-10,0.8864,Cage,ES,Spain,Conventional
11759,2021-01-04,2021-01-10,3.1010,Organic,SE,Sweden,Organic
11760,2021-01-04,2021-01-10,2.1824,Free range,SE,Sweden,Conventional


In [141]:
# In order to calculate price premia, we need separate variables for
# the prices of organic and conventional eggs, so that we can relate the prices to each other.

In [142]:
# Create new dataframe only for organic eggs:
prices_eggs_organic = prices_eggs[prices_eggs['product_type'] == "Organic"]

# Rename price variable to specify the product type:
prices_eggs_organic = prices_eggs_organic.rename(columns={'price': 'price_organic'})

# Show dataframe:
prices_eggs_organic

Unnamed: 0,begin_date,end_date,price_organic,farming_method,country_code,country,product_type
2,2024-01-15,2024-01-21,5.1504,Organic,AT,Austria,Organic
6,2024-01-15,2024-01-21,3.2715,Organic,BE,Belgium,Organic
12,2024-01-15,2024-01-21,7.1490,Organic,CY,Cyprus,Organic
19,2024-01-15,2024-01-21,5.1217,Organic,EE,Estonia,Organic
23,2024-01-15,2024-01-21,3.7493,Organic,FI,Finland,Organic
...,...,...,...,...,...,...,...
11730,2021-01-04,2021-01-10,1.3227,Organic,EL,Greece,Organic
11735,2021-01-04,2021-01-10,1.5708,Organic,IE,Ireland,Organic
11748,2021-01-04,2021-01-10,3.1348,Organic,PL,Poland,Organic
11757,2021-01-04,2021-01-10,2.4138,Organic,SI,Slovenia,Organic


In [143]:
# Create new dataframe only for conventional eggs,
# based on the average price of conventional eggs in each week and country:
prices_eggs_convent = prices_eggs[prices_eggs['product_type'] == "Conventional"].groupby(['begin_date', 'end_date', 'country_code'])['price'].mean().reset_index()

# Rename price variable to specify the product type:
prices_eggs_convent = prices_eggs_convent.rename(columns={'price': 'price_convent'})

# Show dataframe:
prices_eggs_convent

Unnamed: 0,begin_date,end_date,country_code,price_convent
0,2021-01-04,2021-01-10,AT,2.048950
1,2021-01-04,2021-01-10,BE,1.193500
2,2021-01-04,2021-01-10,BG,1.365667
3,2021-01-04,2021-01-10,CY,1.601800
4,2021-01-04,2021-01-10,CZ,1.096300
...,...,...,...,...
4027,2024-01-15,2024-01-21,PT,2.559167
4028,2024-01-15,2024-01-21,RO,2.331667
4029,2024-01-15,2024-01-21,SE,2.586550
4030,2024-01-15,2024-01-21,SI,2.746867


In [144]:
# Now we step-wise merge the prices for organic and conventional eggs (for each date and country_code) to the prices_eggs dataframe:
prices_eggs = pd.merge(prices_eggs, prices_eggs_organic[['begin_date','end_date','country_code','price_organic']], on=['begin_date','end_date','country_code'], how='left')
prices_eggs = pd.merge(prices_eggs, prices_eggs_convent[['begin_date','end_date','country_code','price_convent']], on=['begin_date','end_date','country_code'], how='left')

# Show dataframe:
prices_eggs.head(20)

Unnamed: 0,begin_date,end_date,price,farming_method,country_code,country,product_type,price_organic,price_convent
0,2024-01-15,2024-01-21,2.6679,Barn,AT,Austria,Conventional,5.1504,3.0064
1,2024-01-15,2024-01-21,3.3449,Free range,AT,Austria,Conventional,5.1504,3.0064
2,2024-01-15,2024-01-21,5.1504,Organic,AT,Austria,Organic,5.1504,3.0064
3,2024-01-15,2024-01-21,2.9247,Free range,BE,Belgium,Conventional,3.2715,2.510933
4,2024-01-15,2024-01-21,2.5578,Barn,BE,Belgium,Conventional,3.2715,2.510933
5,2024-01-15,2024-01-21,2.0503,Cage,BE,Belgium,Conventional,3.2715,2.510933
6,2024-01-15,2024-01-21,3.2715,Organic,BE,Belgium,Organic,3.2715,2.510933
7,2024-01-15,2024-01-21,2.9033,Free range,BG,Bulgaria,Conventional,,2.577267
8,2024-01-15,2024-01-21,2.5182,Barn,BG,Bulgaria,Conventional,,2.577267
9,2024-01-15,2024-01-21,2.3103,Cage,BG,Bulgaria,Conventional,,2.577267


In [145]:
# Price premium for organic vs. conventional eggs in percent:
prices_eggs['price_organic_p'] = ( ( prices_eggs['price_organic'] - prices_eggs['price_convent'] ) / prices_eggs['price_convent'] ) * 100

# Show results:
prices_eggs.head(20)

Unnamed: 0,begin_date,end_date,price,farming_method,country_code,country,product_type,price_organic,price_convent,price_organic_p
0,2024-01-15,2024-01-21,2.6679,Barn,AT,Austria,Conventional,5.1504,3.0064,71.314529
1,2024-01-15,2024-01-21,3.3449,Free range,AT,Austria,Conventional,5.1504,3.0064,71.314529
2,2024-01-15,2024-01-21,5.1504,Organic,AT,Austria,Organic,5.1504,3.0064,71.314529
3,2024-01-15,2024-01-21,2.9247,Free range,BE,Belgium,Conventional,3.2715,2.510933,30.290198
4,2024-01-15,2024-01-21,2.5578,Barn,BE,Belgium,Conventional,3.2715,2.510933,30.290198
5,2024-01-15,2024-01-21,2.0503,Cage,BE,Belgium,Conventional,3.2715,2.510933,30.290198
6,2024-01-15,2024-01-21,3.2715,Organic,BE,Belgium,Organic,3.2715,2.510933,30.290198
7,2024-01-15,2024-01-21,2.9033,Free range,BG,Bulgaria,Conventional,,2.577267,
8,2024-01-15,2024-01-21,2.5182,Barn,BG,Bulgaria,Conventional,,2.577267,
9,2024-01-15,2024-01-21,2.3103,Cage,BG,Bulgaria,Conventional,,2.577267,


In [146]:
# The variables "price_organic" and "price_convent" are not needed anymore:
prices_eggs = prices_eggs.drop(['price_organic','price_convent'], axis=1)

In [147]:
# Now we upload the dataframe to the database on the server.

In [148]:
# Import sql_functions.py because we need some functions from that module:
import sql_functions as sqlf

# We need to restart the kernel and rerun at this point if we changed the module since we first imported it.

In [149]:
# Create a variable called engine using the get_engine function:
engine = sqlf.get_engine()

In [150]:
# We set the schema to our project name:
schema = 'capstone_organicfood'

# We set table_name to the name of the dataframe:
table_name = 'prices_eggs'

In [151]:
# We need psycopg2 for raising possible error message:
import psycopg2

In [152]:
# Write records stored in dataframe to SQL database:
if engine!=None:
    try:
        prices_eggs.to_sql(name=table_name, # name of SQL table variable
                        con=engine, # engine or connection
                        schema=schema, # our class schema variable
                        if_exists='replace', # Drop the table before inserting new values
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None
else:
    print('No engine')

The prices_eggs table was imported successfully.


In [153]:
# Test: query the newly created table to count the rows (we know from above that the dataframe has 11,968 cases):
sqlf.get_dataframe(f'SELECT COUNT(*) FROM {schema}.prices_eggs;')

Unnamed: 0,count
0,11968


In [154]:
# Worked!