In [33]:
import json
import os
from datetime import datetime, timezone

import pandas as pd
import requests
from dotenv import load_dotenv
from google.cloud import storage

# Setting up the GCP infrastructure

In [2]:
# Pull the settings stored in the local .env file into the process environment
load_dotenv()

# Fail fast if the Google credentials variable is still missing afterwards
credentials_key = "GOOGLE_APPLICATION_CREDENTIALS"
if os.environ.get(credentials_key) is None:
    raise ValueError("GOOGLE_APPLICATION_CREDENTIALS not found in environment variables.")

# Last expression of the cell: echoes the configured value as the cell output
os.environ[credentials_key]

'credentials/farm-screener-842209070941.json'

In [3]:
# Bucket that receives the raw farm records. Set GOOGLE_RAW_DATA_BUCKET (for
# example in the .env file) to target another bucket; an unset or empty value
# falls back to the default name.
raw_data_bucket = os.environ.get("GOOGLE_RAW_DATA_BUCKET") or "farm-screener-raw"

In [4]:
# Create a Cloud Storage client; it is used by the bucket-listing cell.
# NOTE(review): presumably authenticates via GOOGLE_APPLICATION_CREDENTIALS,
# which is checked earlier in this notebook - confirm against the library docs.
client = storage.Client()

In [5]:

# Print the name of every bucket the client can enumerate
print("Buckets:")
visible_buckets = client.list_buckets()
for bucket in visible_buckets:
    print(bucket.name)

Buckets:
farm-screener-raw


In [6]:
def upload_to_gcs(data, bucket_name, file_name, client=None):
    """Serialize ``data`` to JSON and upload it as an object in a GCS bucket.

    The upload is best-effort: any failure (serialization, authentication,
    network, ...) is reported on stdout and signalled through the return
    value instead of being raised.

    Args:
        data: JSON-serializable object to store.
        bucket_name: Name of the destination bucket.
        file_name: Object name (path) inside the bucket.
        client: Optional storage client to reuse across calls. When omitted,
            a new ``storage.Client()`` is created for this call.

    Returns:
        bool: ``True`` if the object was uploaded, ``False`` otherwise.
    """
    try:
        if client is None:
            client = storage.Client()
        blob = client.bucket(bucket_name).blob(file_name)
        # Declare the payload as JSON; upload_from_string defaults to text/plain.
        blob.upload_from_string(json.dumps(data), content_type="application/json")
    except Exception as e:
        print(f"Error uploading data to GCS: {e}")
        return False
    print(f"Data uploaded to GCS: gs://{bucket_name}/{file_name}")
    return True

In [7]:
import requests

def download_data(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the decoded JSON body.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The parsed JSON payload when the server answers with status 200 and
        a valid JSON body; ``None`` on any other status code, on a request
        error, or when the body is not valid JSON. Failures are reported on
        stdout.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to download data. Status code: {response.status_code}")
            return None
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error downloading data: {e}")
        return None
    except ValueError as e:
        # Some requests versions raise a plain ValueError from .json() when
        # the body is not valid JSON; treat that like any other failed download.
        print(f"Error decoding JSON response: {e}")
        return None


In [8]:

# Fetch one page of farm listings (offset=24, size=24) and archive every hit
# as its own JSON object in the raw-data bucket.
url = "https://www.hofsuche.schweizerbauern.ch/api/de/farms?offset=24&project=vomhof&size=24"
data = download_data(url)
if data:
    print("Data downloaded successfully!")

    # One UTC timestamp for the whole batch, so every record written by this
    # run carries the same retrieval time.
    fetched_at = datetime.now(timezone.utc).isoformat()

    for item in data["hits"]:
        # Tag the record with the retrieval time (replaces the former
        # placeholder string 'value').
        item['timestamp'] = fetched_at

        # Upload the modified item to GCS, named after its '_id' field
        file_name = f"{item['_id']}.json"
        upload_to_gcs(item, raw_data_bucket, file_name)

else:
    print("Failed to download data. Please check the URL and try again.")


Data downloaded successfully!
Data uploaded to GCS: gs://farm-screener-raw/14959.json
Data uploaded to GCS: gs://farm-screener-raw/14953.json
Data uploaded to GCS: gs://farm-screener-raw/14923.json
Data uploaded to GCS: gs://farm-screener-raw/14944.json
Data uploaded to GCS: gs://farm-screener-raw/14839.json
Data uploaded to GCS: gs://farm-screener-raw/14940.json
Data uploaded to GCS: gs://farm-screener-raw/14941.json
Data uploaded to GCS: gs://farm-screener-raw/14950.json
Data uploaded to GCS: gs://farm-screener-raw/14929.json
Data uploaded to GCS: gs://farm-screener-raw/14921.json
Data uploaded to GCS: gs://farm-screener-raw/14928.json
Data uploaded to GCS: gs://farm-screener-raw/14937.json
Data uploaded to GCS: gs://farm-screener-raw/14932.json
Data uploaded to GCS: gs://farm-screener-raw/14915.json
Data uploaded to GCS: gs://farm-screener-raw/14918.json
Data uploaded to GCS: gs://farm-screener-raw/14911.json
Data uploaded to GCS: gs://farm-screener-raw/14912.json
Data uploaded to G

In [9]:
#data["hits"][0]["_source"]["vomhof"]["offers"]

In [31]:
# Flatten each farm's offers into
# (farm_id, product_id, product_name, product_availability) tuples.
product_offers = []

for item in data["hits"]:
    farm_id = item["_id"]
    print(farm_id)  # one line per farm, as a progress marker
    offers = item["_source"]["vomhof"]["offers"]
    for offer in offers:
        product = offer["product"]
        availability = offer["availability"]
        # Truthiness check: an empty list and a None availability both fall
        # back to the placeholder string instead of raising on len(None).
        if availability:
            product_availability = [month["name"] for month in availability]
        else:
            product_availability = "Not specified"

        product_offers.append(
            (farm_id, product["id"], product["name"], product_availability)
        )

14959
{'id': '147', 'name': 'Äpfel'}
Not specified
{'id': '148', 'name': 'Birnen'}
Not specified
{'id': '38', 'name': 'Blumenkohl'}
['Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November']
{'id': '40', 'name': 'Broccoli'}
['Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November']
{'id': '158', 'name': 'Brombeeren'}
Not specified
{'id': '2957', 'name': 'Dekoartikel'}
Not specified
{'id': '2722', 'name': 'Eingemachtes Gemüse'}
Not specified
{'id': '159', 'name': 'Erdbeeren'}
['April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober']
{'id': '18', 'name': 'Essig'}
Not specified
{'id': '537', 'name': 'Geschenkkorb'}
Not specified
{'id': '52', 'name': 'Gurken'}
['Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober']
{'id': '160', 'name': 'Heidelbeeren'}
Not specified
{'id': '161', 'name': 'Himbeeren'}
Not specified
{'id': '258', 'name': 'Honig'}
Not specified
{'id': '13', 'name': 'Hühnereier'}
Not specified
{'id': '55', 'name': 'Karotten'}
Not specified
{'id

In [32]:
# Inspect the collected (farm_id, product_id, product_name, product_availability) tuples
product_offers

[('14959', '147', 'Äpfel', 'Not specified'),
 ('14959', '148', 'Birnen', 'Not specified'),
 ('14959',
  '38',
  'Blumenkohl',
  ['Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November']),
 ('14959',
  '40',
  'Broccoli',
  ['Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November']),
 ('14959', '158', 'Brombeeren', 'Not specified'),
 ('14959', '2957', 'Dekoartikel', 'Not specified'),
 ('14959', '2722', 'Eingemachtes Gemüse', 'Not specified'),
 ('14959',
  '159',
  'Erdbeeren',
  ['April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober']),
 ('14959', '18', 'Essig', 'Not specified'),
 ('14959', '537', 'Geschenkkorb', 'Not specified'),
 ('14959',
  '52',
  'Gurken',
  ['Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober']),
 ('14959', '160', 'Heidelbeeren', 'Not specified'),
 ('14959', '161', 'Himbeeren', 'Not specified'),
 ('14959', '258', 'Honig', 'Not specified'),
 ('14959', '13', 'Hühnereier', 'Not specified'),
 ('14959', '55', 'Karotten', 'Not specifi

In [35]:
# Tabulate the flattened offers: one row per tuple in product_offers
columns = [
    "farm_id",
    "product_id",
    "product_name",
    "product_availability",
]
df = pd.DataFrame.from_records(data=product_offers, columns=columns)
df

Unnamed: 0,farm_id,product_id,product_name,product_availability
0,14959,147,Äpfel,Not specified
1,14959,148,Birnen,Not specified
2,14959,38,Blumenkohl,"[Mai, Juni, Juli, August, September, Oktober, ..."
3,14959,40,Broccoli,"[Mai, Juni, Juli, August, September, Oktober, ..."
4,14959,158,Brombeeren,Not specified
...,...,...,...,...
181,14875,26,Rindfleisch,Not specified
182,14875,193,Teigwaren,Not specified
183,14875,16,Wurstwaren,Not specified
184,14875,185,Zopf,Not specified


In [20]:
# Walk the offers of the first hit and print each product with its availability
for offers in data["hits"][0]["_source"]["vomhof"]["offers"]:
    product, availability = offers["product"], offers["availability"]
    if len(availability) == 0:
        # No availability entries listed for this offer
        product_availability = "Not specified"
    else:
        product_availability = [month["name"] for month in availability]

    print(product)
    print(product_availability)

{'id': '147', 'name': 'Äpfel'}
Not specified
{'id': '148', 'name': 'Birnen'}
Not specified
{'id': '38', 'name': 'Blumenkohl'}
['Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November']
{'id': '40', 'name': 'Broccoli'}
['Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November']
{'id': '158', 'name': 'Brombeeren'}
Not specified
{'id': '2957', 'name': 'Dekoartikel'}
Not specified
{'id': '2722', 'name': 'Eingemachtes Gemüse'}
Not specified
{'id': '159', 'name': 'Erdbeeren'}
['April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober']
{'id': '18', 'name': 'Essig'}
Not specified
{'id': '537', 'name': 'Geschenkkorb'}
Not specified
{'id': '52', 'name': 'Gurken'}
['Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober']
{'id': '160', 'name': 'Heidelbeeren'}
Not specified
{'id': '161', 'name': 'Himbeeren'}
Not specified
{'id': '258', 'name': 'Honig'}
Not specified
{'id': '13', 'name': 'Hühnereier'}
Not specified
{'id': '55', 'name': 'Karotten'}
Not specified
{'id': '3'