In [1]:
import requests
import os
import json
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import datetime as dt
from dotenv import load_dotenv
from google.cloud import storage
from geopy.geocoders import Nominatim

# Setting up the GCP infrastructure

In [2]:
# Load environment variables from .env file
load_dotenv()

# Check if the environment variable is set
if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ:
    raise ValueError("GOOGLE_APPLICATION_CREDENTIALS not found in environment variables.")

#os.environ['GOOGLE_APPLICATION_CREDENTIALS'] 

In [3]:
raw_data_bucket = os.environ["GOOGLE_RAW_DATA_BUCKET"] 
clean_data_bucket = os.environ["GOOGLE_CLEAN_DATA_BUCKET"] 
#raw_data_bucket = "farm-screener-raw"

In [4]:
datestamp = dt.datetime.now().date()
datestamp

datetime.date(2024, 3, 16)

In [5]:
# Create a client
client = storage.Client()

In [6]:

# List buckets
print("Buckets:")
for bucket in client.list_buckets():
    print(bucket.name)

Buckets:
farm-screener-clean
farm-screener-raw


In [7]:
def upload_to_gcs(data, bucket_name, file_name):
    try:
        client = storage.Client()
        bucket = client.bucket(bucket_name)
        blob = bucket.blob(file_name)
        blob.upload_from_string(json.dumps(data))
        print(f"Data uploaded to GCS: gs://{bucket_name}/{file_name}")
    except Exception as e:
        print(f"Error uploading data to GCS: {e}")

In [8]:
import requests

def download_data(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.json()
        else:
            print(f"Failed to download data. Status code: {response.status_code}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Error downloading data: {e}")
        return None


In [None]:
size = 100
offset = size

while True:
    #url = "https://www.hofsuche.schweizerbauern.ch/api/de/farms?offset=24&project=vomhof&size=24"
    url = f"https://www.hofsuche.schweizerbauern.ch/api/de/farms?offset={offset}&project=vomhof&size={size}"
    data = download_data(url)
    print(f"Offset: {offset}")
    if data:
        print("Data downloaded successfully!")
        if len(data["hits"] == 0):
             break
        else:
            for item in data["hits"]:
                    # Add additional fields here as needed
                    item['timestamp'] = str(datestamp)
                    
                    # Upload the modified data item to GCS
                    file_name = f"{item['_id']}.json"  # Using the 'id' field as the filename
                    upload_to_gcs(item, raw_data_bucket, file_name)

    else:
        print("Failed to download data. Please check the URL and try again.")
    offset = offset + size


In [None]:
def get_farm_data(item):
    farm ={}
    farm["farm_id"] = item["_id"]
    farm["farm_name"] = item["_source"]["farmname"]
    farm["first_name"] = item["_source"]["first_name"]
    farm["last_name"] = item["_source"]["last_name"]
    farm["street"] = item["_source"]["street"]
    farm["zip"] = item["_source"]["zip"]
    farm["city"] = item["_source"]["city"]
    farm["canton"] = item["_source"]["canton"]["short"]
    farm["telephone"] = item["_source"]["telephone"]
    farm["mobile"] = item["_source"]["mobile"]
    farm["email"] = item["_source"]["email"]
    farm["website"] = item["_source"]["website"]
    farm["facebook_link"] = item["_source"]["facebook_link"]
    farm["timestamp"] = item["timestamp"]
    
    return farm

## Access json files in GCP bucket

In [9]:
# List JSON files in the GCS bucket
storage_client = storage.Client()
bucket = storage_client.get_bucket(raw_data_bucket)


In [None]:
# farm_data = []
# product_offers = list()

# blobs = bucket.list_blobs()
# for blob in blobs:
#     if blob.name.endswith('.json'):
#         uri = f"gs://{raw_data_bucket}/{blob.name}"
#         json_data_string = blob.download_as_string().decode("utf-8")
#         json_data = json.loads(json_data_string)

#         # extract data for each farm
#         farm_id = json_data["_id"]
#         try:
#             farm_data.append(get_farm_data(json_data))
#         except:
#             farm_data.append(farm_id)
#         offers = item["_source"]["vomhof"]["offers"]
#         for offer in offers:
#             product = offer["product"]
#             product_id = product["id"]
#             product_name = product["name"]
#             availability = offer["availability"]
#             if len(availability) > 0:
#                 product_availability = [month["name"] for month in availability]
#             else:
#                 product_availability = "Not specified"

#             product_offers.append((farm_id, product_id, product_name, product_availability))
        

In [None]:

farm_data = []
product_offers = []

blobs = bucket.list_blobs()
for blob in blobs:
    if blob.name.endswith('.json'):
        uri = f"gs://{raw_data_bucket}/{blob.name}"
        json_data_string = blob.download_as_string().decode("utf-8")
        json_data = json.loads(json_data_string)

        # Extract data for each farm
        farm_id = json_data["_id"]
        try:
            farm_data.append(get_farm_data(json_data))
        except Exception as e:
            
            print(f"Error processing farm data for farm ID {farm_id}: {e}")
            farm_data.append(farm_id)


        # 
        # farm_data.append(farm_id)  # Append farm_id regardless of the success of get_farm_data

        # try:
        #     farm_info = get_farm_data(json_data)
        #     farm_data.append(farm_info)
        # except Exception as e:
        #     print(f"Error processing farm data for farm ID {farm_id}: {e}")

        #offers = json_data["_source"]["vomhof"]["offers"]  # Access offers directly from json_data

        # Extract offers if present
        source = json_data.get("_source")
        if source is None:
            print("'_source' key not found in JSON data.")
            continue

        vomhof = source.get("vomhof")
        if vomhof is None:
            print("'vomhof' key not found in JSON data.")
            continue

        offers = vomhof.get("offers")
        if offers is None:
            print("'offers' key not found in JSON data.")
            continue


        for offer in offers:
            product = offer["product"]
            product_id = product["id"]
            product_name = product["name"]
            availability = offer["availability"]
            if len(availability) > 0:
                product_availability = [month["name"] for month in availability]
            else:
                product_availability = "Not specified"

            product_offers.append((farm_id, product_id, product_name, product_availability))



In [None]:
# farm_data = []
# product_offers = list()

# for i, item in enumerate(data["hits"]):
#     farm_id = item["_id"]
#     try:
#         farm_data.append(get_farm_data(item))
#     except:
#         farm_data.append(farm_id)
#     offers = item["_source"]["vomhof"]["offers"]
#     for offer in offers:
#         product = offer["product"]
#         product_id = product["id"]
#         product_name = product["name"]
#         availability = offer["availability"]
#         if len(availability) > 0:
#             product_availability = [month["name"] for month in availability]
#         else:
#             product_availability = "Not specified"

#         product_offers.append((farm_id, product_id, product_name, product_availability))
#         #print(product)
#         #print(product_availability)

In [None]:
columns = ["farm_id", "product_id", "product_name", "product_availability"]
product_df = pd.DataFrame.from_records(product_offers, columns=columns)
product_df

In [None]:
product_df.farm_id.value_counts()

In [None]:
product_df["farm_id"] = product_df["farm_id"].astype("int")
product_df["product_id"] = product_df["product_id"].astype("int")
product_df["product_availability"] = product_df["product_availability"].astype("str") # needed for conversion to parquet
product_df.info()

In [None]:
farm_df = pd.DataFrame.from_records(farm_data)
farm_df

In [None]:
farm_df["farm_id"]= farm_df["farm_id"].astype("int")
#farm_df["timestamp"]= date = pd.to_datetime(farm_df["timestamp"])

In [None]:
farm_df.info()

In [None]:
farm_df.head()

## Saving clean data to GCP bucket in parquet

In [10]:
farms_parquet_file_path = "farms.parquet"
offers_parquet_file_path = "offers.parquet"

In [None]:
# Write the DataFrame to a Parquet file
with storage_client.bucket(clean_data_bucket).blob(farms_parquet_file_path).open("wb") as file:
    pq.write_table(pa.Table.from_pandas(farm_df), file, compression='snappy')

print(f"DataFrame saved to Parquet file: gs://{clean_data_bucket}/{farms_parquet_file_path}")



In [None]:

with storage_client.bucket(clean_data_bucket).blob(offers_parquet_file_path).open("wb") as file:
    pq.write_table(pa.Table.from_pandas(product_df), file, compression='snappy')

print(f"DataFrame saved to Parquet file: gs://{clean_data_bucket}/{offers_parquet_file_path}")

## EDA on collected data

In [31]:
# Read the Parquet file from GCS
with storage_client.bucket(clean_data_bucket).blob(farms_parquet_file_path).open("rb") as file:
    farms_parquet_table = pq.read_table(file)

# Convert the Parquet table to a pandas DataFrame
new_farms_df = farms_parquet_table.to_pandas()

In [32]:
new_farms_df

Unnamed: 0,farm_id,farm_name,first_name,last_name,street,zip,city,canton,telephone,mobile,email,website,facebook_link,timestamp
0,10002,Hof Wolfenberg,Barbara & Martin,Venzin,Wolfenbergstrasse,8426,Lufingen,ZH,044 803 12 73,079 836 03 48,martin.venzin@wolfenberg.ch,http://www.wolfenberg.ch,,2024-03-09
1,10007,Alpkäserei Risch /under Münenberg,Ueli & Martha,Bieri-Wicki,Under Münenberg 1,6162,Rengg,LU,041 480 37 75,,marbi2@bluewin.ch,http://www.alpkaeserisch.ch,,2024-03-09
2,10009,HO DELice du Pré-Mermoud,Chantal & Daniel,Hodel,Pré-Mermoud 1,1580,Avenches,VD,,079 360 54 24,chantal.daniel@hotmail.com,https://www.hodelice.ch,,2024-03-09
3,10011,Scheidweg,Käthi & Röbi,Messmer,Hagenbuch 5,8577,Schönholzerswilen,TG,,,rkmessmer@bluewin.ch,,,2024-03-09
4,10013,Zebuhof,Nadia & Kari,Bürgi-Schelbert,Bergstrasse 12,6424,Lauerz,SZ,041 811 18 56,079 225 29 89,info@zebuhof.ch,http://www.zebuhof.ch,https://www.facebook.com/Zebubuur/,2024-03-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2934,9990,Laueli / Alp Wittenlauenen,Erika & Theo,Emmenegger-Bucher,Laueli 1,6174,Sörenberg,LU,041 488 00 23,078 926 06 05,,http://alpgruss.ch,https://www.facebook.com/Alpgruss-101220415403839,2024-03-09
2935,9991,Zimbel,Cornelia & Martin,Keiser-Schneider,Zimbel,6340,Baar,ZG,,079 614 79 03,,,,2024-03-09
2936,9995,Gantlihof,"Fidel, Elisabeth & Christian",Kenel,Sonnenbergstrasse 2,6415,Arth,SZ,041 855 41 34,079 344 91 29,,http://www.gantlihof.ch,,2024-03-09
2937,9999,Vetsch Hattenhausen,Thomas,Vetsch,Fischbachstrasse 7,8564,Hattenhausen,TG,052 763 38 32,079 318 83 65,monikavetsch@bluewin.ch,,,2024-03-09


## Adding geolocation data

In [19]:
#new_farms_df = new_farms_df.head(10).copy()

In [37]:
# new_farms_df["latitude"] = None
# new_farms_df["longitude"] = None

# making an instance of Nominatim class
geolocator = Nominatim(user_agent="my_request")


location_data = list()
start = 0
n_items = 50

while start + n_items < len(new_farms_df):
    # loop through the rows using iterrows()
    for index, row in new_farms_df.iloc[start:start+n_items].iterrows():

        location = f"{row['street']}, {row['zip']} {row['city']}"
        print(location)

        try:
            # applying geocode method to get the location
            geolocation = geolocator.geocode(location)
            location_data.append((index, geolocation.latitude, geolocation.longitude))
            print(f"Index {index} Lat: {geolocation.latitude} / Lon: {geolocation.longitude}" )
        except:
            location_data.append((index, None, None))
            print(f"Could not retrieve geolocation for {location}")
    start = start+n_items
    print("Starting next iteration")

Wolfenbergstrasse , 8426 Lufingen
Index 0 Lat: 47.4764027 / Lon: 8.5883555
Under Münenberg 1, 6162 Rengg
Could not retrieve geolocation for Under Münenberg 1, 6162 Rengg
Pré-Mermoud 1, 1580 Avenches
Index 2 Lat: 46.8745671 / Lon: 7.0364476
Hagenbuch 5 , 8577 Schönholzerswilen
Index 3 Lat: 47.5173978 / Lon: 9.117653621560137
Bergstrasse 12 , 6424 Lauerz
Index 4 Lat: 47.0219488 / Lon: 8.5948429
Fringeliweg 315 / Haselhof, 4252 Bärschwil
Could not retrieve geolocation for Fringeliweg 315 / Haselhof, 4252 Bärschwil
Hinterdorfstrasse 83, 8753 Mollis
Index 6 Lat: 47.09907175 / Lon: 9.073350641747869
Dorfstrasse 12, 3416  Affoltern
Index 7 Lat: 47.0653657 / Lon: 7.734627041525423
Strada del Pian 5, 6563 Pian San Giacomo
Index 8 Lat: 46.4172762 / Lon: 9.2260048
Schönegg 1, 6343 Rotkreuz
Index 9 Lat: 47.1322967 / Lon: 8.4416391
Boltshausen 9, 8561 Ottoberg
Index 10 Lat: 47.5814944 / Lon: 9.0868656
Route de Chésery, 1875 Morgins
Index 11 Lat: 46.2310271 / Lon: 6.8433275
Taubenloch 3, 3273 Kappel

In [38]:
columns = ["loop_index", "lat", "lon"]

location_df = pd.DataFrame(location_data, columns=columns)
location_df.tail()

Unnamed: 0,loop_index,lat,lon
2895,2895,47.474318,9.444752
2896,2896,47.120798,8.445052
2897,2897,47.474679,8.990519
2898,2898,47.404003,9.600118
2899,2899,47.219616,9.009598


## Merge the farms_df with geolocation data

In [39]:
# Join the DataFrames on their indices
merged_df = new_farms_df.join(location_df, how="inner")
len(merged_df)
merged_df.tail()

Unnamed: 0,farm_id,farm_name,first_name,last_name,street,zip,city,canton,telephone,mobile,email,website,facebook_link,timestamp,latitude,longitude,loop_index,lat,lon
2895,9902,Meggenmüli,Simon,Hanimann,Meggenmüli,9402,Mörschwil,SG,071 866 10 74,079 688 99 62,hanimannsimon@gmail.com,http://brennholz-hanimann.ch,,2024-03-09,,,2895,47.474318,9.444752
2896,9904,Erlebnisbauernhof Gerbe,Andreas,Knüsel,Gerbe 3,6344,Meierskappel,LU,,076 433 45 34,andreas@swiss-bauernhof.ch,http://www.swiss-bauernhof.ch,,2024-03-09,,,2896,47.120798,8.445052
2897,9905,Bruggmanns-OberHofen,Thomas & Beatrice,Bruggmann,Kettstrasse 7,9542,Münchwilen,TG,,,info@bruggmanns-oberhofen.ch,http://www.bruggmanns-oberhofen.ch,,2024-03-09,,,2897,47.474679,8.990519
2898,9906,"Krüsis Hofladen, Familie Sturzenegger",Beat & Elisabeth,Sturzenegger-Krüsi,Hauptstr. 78,9436,Balgach,SG,071 722 59 49,078 602 80 98,lisakruesi@bluewin.ch,https://kruesis-hofladen.ch,https://www.facebook.com/Kr%C3%BCsis-Hofladen-...,2024-03-09,,,2898,47.404003,9.600118
2899,9907,Hof Gjuch,Monika & Cornel,Brunner,Gjuch 341,8722,Kaltbrunn,SG,,,mona_steiner@bluewin.ch,,,2024-03-09,,,2899,47.219616,9.009598


In [40]:
# List of columns to drop
columns_to_drop = ["latitude", "longitude", "loop_index"]

# Check if the columns exist in the DataFrame
existing_columns = set(merged_df.columns)
columns_to_drop = [col for col in columns_to_drop if col in existing_columns]

# Drop the columns if they exist
if columns_to_drop:
    merged_df = merged_df.drop(columns=columns_to_drop)
merged_df.tail()

Unnamed: 0,farm_id,farm_name,first_name,last_name,street,zip,city,canton,telephone,mobile,email,website,facebook_link,timestamp,lat,lon
2895,9902,Meggenmüli,Simon,Hanimann,Meggenmüli,9402,Mörschwil,SG,071 866 10 74,079 688 99 62,hanimannsimon@gmail.com,http://brennholz-hanimann.ch,,2024-03-09,47.474318,9.444752
2896,9904,Erlebnisbauernhof Gerbe,Andreas,Knüsel,Gerbe 3,6344,Meierskappel,LU,,076 433 45 34,andreas@swiss-bauernhof.ch,http://www.swiss-bauernhof.ch,,2024-03-09,47.120798,8.445052
2897,9905,Bruggmanns-OberHofen,Thomas & Beatrice,Bruggmann,Kettstrasse 7,9542,Münchwilen,TG,,,info@bruggmanns-oberhofen.ch,http://www.bruggmanns-oberhofen.ch,,2024-03-09,47.474679,8.990519
2898,9906,"Krüsis Hofladen, Familie Sturzenegger",Beat & Elisabeth,Sturzenegger-Krüsi,Hauptstr. 78,9436,Balgach,SG,071 722 59 49,078 602 80 98,lisakruesi@bluewin.ch,https://kruesis-hofladen.ch,https://www.facebook.com/Kr%C3%BCsis-Hofladen-...,2024-03-09,47.404003,9.600118
2899,9907,Hof Gjuch,Monika & Cornel,Brunner,Gjuch 341,8722,Kaltbrunn,SG,,,mona_steiner@bluewin.ch,,,2024-03-09,47.219616,9.009598


In [41]:
farms_geo_parquet_file_path = "farms_geo.parquet"

# Write the DataFrame to a Parquet file
with storage_client.bucket(clean_data_bucket).blob(farms_geo_parquet_file_path).open(
    "wb"
) as file:
    pq.write_table(pa.Table.from_pandas(merged_df), file, compression="snappy")

print(
    f"DataFrame saved to Parquet file: gs://{clean_data_bucket}/{farms_geo_parquet_file_path}"
)

DataFrame saved to Parquet file: gs://farm-screener-clean/farms_geo.parquet


In [14]:
# Read the Parquet file from GCS
with storage_client.bucket(clean_data_bucket).blob(offers_parquet_file_path).open("rb") as file:
    offers_parquet_table = pq.read_table(file)

# Convert the Parquet table to a pandas DataFrame
new_offers_df = offers_parquet_table.to_pandas()

In [15]:
new_offers_df

Unnamed: 0,farm_id,product_id,product_name,product_availability
0,10002,344,Cheminée-Holz,Not specified
1,10002,13,Hühnereier,Not specified
2,10002,222,Lammfleisch,['Februar']
3,10007,21,Apéro / Catering / Partyservice,Not specified
4,10007,31,Butter,Not specified
...,...,...,...,...
22752,14875,26,Rindfleisch,Not specified
22753,14875,193,Teigwaren,Not specified
22754,14875,16,Wurstwaren,Not specified
22755,14875,185,Zopf,Not specified


In [16]:
new_offers_df.farm_id.value_counts

<bound method IndexOpsMixin.value_counts of 0        10002
1        10002
2        10002
3        10007
4        10007
         ...  
22752    14875
22753    14875
22754    14875
22755    14875
22756    14875
Name: farm_id, Length: 22757, dtype: int64>

In [17]:
new_offers_df["product_name"].value_counts()

product_name
Hühnereier       1093
Rindfleisch       814
Apfelsaft         692
Konfitüren        645
Kartoffeln        557
                 ... 
Häute               1
Estragon            1
Gojibeeren          1
Pferdefleisch       1
Preiselbeeren       1
Name: count, Length: 318, dtype: int64

In [18]:
merged_df = pd.merge(new_farms_df, new_offers_df, on='farm_id', how='inner')

In [19]:
merged_df.tail()

Unnamed: 0,farm_id,farm_name,first_name,last_name,street,zip,city,canton,telephone,mobile,email,website,facebook_link,timestamp,product_id,product_name,product_availability
22780,9995,Gantlihof,"Fidel, Elisabeth & Christian",Kenel,Sonnenbergstrasse 2,6415,Arth,SZ,041 855 41 34,079 344 91 29,,http://www.gantlihof.ch,,2024-03-09,150,Kirschen,Not specified
22781,9995,Gantlihof,"Fidel, Elisabeth & Christian",Kenel,Sonnenbergstrasse 2,6415,Arth,SZ,041 855 41 34,079 344 91 29,,http://www.gantlihof.ch,,2024-03-09,503,Konfitüren,Not specified
22782,9995,Gantlihof,"Fidel, Elisabeth & Christian",Kenel,Sonnenbergstrasse 2,6415,Arth,SZ,041 855 41 34,079 344 91 29,,http://www.gantlihof.ch,,2024-03-09,156,Zwetschgen,Not specified
22783,9999,Vetsch Hattenhausen,Thomas,Vetsch,Fischbachstrasse 7,8564,Hattenhausen,TG,052 763 38 32,079 318 83 65,monikavetsch@bluewin.ch,,,2024-03-09,13,Hühnereier,Not specified
22784,9999,Vetsch Hattenhausen,Thomas,Vetsch,Fischbachstrasse 7,8564,Hattenhausen,TG,052 763 38 32,079 318 83 65,monikavetsch@bluewin.ch,,,2024-03-09,26,Rindfleisch,Not specified


In [20]:
merged_df.tail()

Unnamed: 0,farm_id,farm_name,first_name,last_name,street,zip,city,canton,telephone,mobile,email,website,facebook_link,timestamp,product_id,product_name,product_availability
22780,9995,Gantlihof,"Fidel, Elisabeth & Christian",Kenel,Sonnenbergstrasse 2,6415,Arth,SZ,041 855 41 34,079 344 91 29,,http://www.gantlihof.ch,,2024-03-09,150,Kirschen,Not specified
22781,9995,Gantlihof,"Fidel, Elisabeth & Christian",Kenel,Sonnenbergstrasse 2,6415,Arth,SZ,041 855 41 34,079 344 91 29,,http://www.gantlihof.ch,,2024-03-09,503,Konfitüren,Not specified
22782,9995,Gantlihof,"Fidel, Elisabeth & Christian",Kenel,Sonnenbergstrasse 2,6415,Arth,SZ,041 855 41 34,079 344 91 29,,http://www.gantlihof.ch,,2024-03-09,156,Zwetschgen,Not specified
22783,9999,Vetsch Hattenhausen,Thomas,Vetsch,Fischbachstrasse 7,8564,Hattenhausen,TG,052 763 38 32,079 318 83 65,monikavetsch@bluewin.ch,,,2024-03-09,13,Hühnereier,Not specified
22784,9999,Vetsch Hattenhausen,Thomas,Vetsch,Fischbachstrasse 7,8564,Hattenhausen,TG,052 763 38 32,079 318 83 65,monikavetsch@bluewin.ch,,,2024-03-09,26,Rindfleisch,Not specified


In [21]:
merged_df.iloc[100:130, :]

Unnamed: 0,farm_id,farm_name,first_name,last_name,street,zip,city,canton,telephone,mobile,email,website,facebook_link,timestamp,product_id,product_name,product_availability
100,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,253,Apfelsaft,Not specified
101,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,280,Hand- und Kunsthandwerk,Not specified
102,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,55,Karotten,Not specified
103,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,3,Kartoffeln,Not specified
104,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,515,Kirsch,"['Januar', 'Februar', 'März', 'April', 'Mai', ..."
105,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,150,Kirschen,Not specified
106,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,57,Knoblauch,Not specified
107,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,144,Kräutermischungen,Not specified
108,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,62,Kürbis,Not specified
109,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,151,Melonen,Not specified


In [22]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22785 entries, 0 to 22784
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   farm_id               22785 non-null  int64 
 1   farm_name             22785 non-null  object
 2   first_name            22785 non-null  object
 3   last_name             22785 non-null  object
 4   street                22785 non-null  object
 5   zip                   22785 non-null  object
 6   city                  22785 non-null  object
 7   canton                22785 non-null  object
 8   telephone             22785 non-null  object
 9   mobile                22785 non-null  object
 10  email                 22785 non-null  object
 11  website               22785 non-null  object
 12  facebook_link         9079 non-null   object
 13  timestamp             22785 non-null  object
 14  product_id            22785 non-null  int64 
 15  product_name          22785 non-null

In [None]:
honig_farm = merged_df.loc[(merged_df['product_name'] == 'Honig') & (merged_df['canton'] == 'TG')]

In [None]:
canton = "VD"
product = "Honig"#, "Milch" #"Honig" # "Speiseöl"
product_id = 13

In [None]:
farm_ids = new_offers_df.loc[new_offers_df['product_name'] == product, 'farm_id']
len(farm_ids)

In [None]:
data_filter = ((merged_df["canton"]==canton) & (merged_df["product_name"]== product))
data_filter_id = ((merged_df["canton"]==canton) & (merged_df["product_id"]== product_id))

In [None]:
merged_df[data_filter]

In [None]:
merged_df[(merged_df["canton"]==canton) & (merged_df["product_name"]== product)]
#merged_df[(merged_df["canton"]==canton) & (merged_df["product_name"]== product)]

In [None]:
merged_df.loc[merged_df["farm_id"]==9980, "product_name"]