In [1]:
import requests
import os
import json
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import datetime as dt
from dotenv import load_dotenv
from google.cloud import storage

# Setting up the GCP infrastructure

In [2]:
# Load environment variables from .env file
load_dotenv()

# Check if the environment variable is set
if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ:
    raise ValueError("GOOGLE_APPLICATION_CREDENTIALS not found in environment variables.")

#os.environ['GOOGLE_APPLICATION_CREDENTIALS'] 

In [3]:
raw_data_bucket = os.environ["GOOGLE_RAW_DATA_BUCKET"] 
clean_data_bucket = os.environ["GOOGLE_CLEAN_DATA_BUCKET"] 
#raw_data_bucket = "farm-screener-raw"

In [4]:
datestamp = dt.datetime.now().date()
datestamp

datetime.date(2024, 3, 10)

In [5]:
# Create a client
client = storage.Client()

In [6]:

# List buckets
print("Buckets:")
for bucket in client.list_buckets():
    print(bucket.name)

Buckets:
farm-screener-clean
farm-screener-raw


In [7]:
def upload_to_gcs(data, bucket_name, file_name):
    try:
        client = storage.Client()
        bucket = client.bucket(bucket_name)
        blob = bucket.blob(file_name)
        blob.upload_from_string(json.dumps(data))
        print(f"Data uploaded to GCS: gs://{bucket_name}/{file_name}")
    except Exception as e:
        print(f"Error uploading data to GCS: {e}")

In [8]:
import requests

def download_data(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.json()
        else:
            print(f"Failed to download data. Status code: {response.status_code}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Error downloading data: {e}")
        return None


In [50]:
size = 100
offset = size

while True:
    #url = "https://www.hofsuche.schweizerbauern.ch/api/de/farms?offset=24&project=vomhof&size=24"
    url = f"https://www.hofsuche.schweizerbauern.ch/api/de/farms?offset={offset}&project=vomhof&size={size}"
    data = download_data(url)
    print(f"Offset: {offset}")
    if data:
        print("Data downloaded successfully!")
        if len(data["hits"] == 0):
             break
        else:
            for item in data["hits"]:
                    # Add additional fields here as needed
                    item['timestamp'] = str(datestamp)
                    
                    # Upload the modified data item to GCS
                    file_name = f"{item['_id']}.json"  # Using the 'id' field as the filename
                    upload_to_gcs(item, raw_data_bucket, file_name)

    else:
        print("Failed to download data. Please check the URL and try again.")
    offset = offset + size


Offset: 100
Data downloaded successfully!
Data uploaded to GCS: gs://farm-screener-raw/14482.json
Data uploaded to GCS: gs://farm-screener-raw/14419.json
Data uploaded to GCS: gs://farm-screener-raw/14349.json
Data uploaded to GCS: gs://farm-screener-raw/14441.json
Data uploaded to GCS: gs://farm-screener-raw/14432.json
Data uploaded to GCS: gs://farm-screener-raw/14425.json
Data uploaded to GCS: gs://farm-screener-raw/14408.json
Data uploaded to GCS: gs://farm-screener-raw/14423.json
Data uploaded to GCS: gs://farm-screener-raw/14410.json
Data uploaded to GCS: gs://farm-screener-raw/14400.json
Data uploaded to GCS: gs://farm-screener-raw/14409.json
Data uploaded to GCS: gs://farm-screener-raw/14392.json
Data uploaded to GCS: gs://farm-screener-raw/14404.json
Data uploaded to GCS: gs://farm-screener-raw/14402.json
Data uploaded to GCS: gs://farm-screener-raw/14401.json
Data uploaded to GCS: gs://farm-screener-raw/14394.json
Data uploaded to GCS: gs://farm-screener-raw/14397.json
Data u

KeyboardInterrupt: 

In [9]:
def get_farm_data(item):
    farm ={}
    farm["farm_id"] = item["_id"]
    farm["farm_name"] = item["_source"]["farmname"]
    farm["first_name"] = item["_source"]["first_name"]
    farm["last_name"] = item["_source"]["last_name"]
    farm["street"] = item["_source"]["street"]
    farm["zip"] = item["_source"]["zip"]
    farm["city"] = item["_source"]["city"]
    farm["canton"] = item["_source"]["canton"]["short"]
    farm["telephone"] = item["_source"]["telephone"]
    farm["mobile"] = item["_source"]["mobile"]
    farm["email"] = item["_source"]["email"]
    farm["website"] = item["_source"]["website"]
    farm["facebook_link"] = item["_source"]["facebook_link"]
    farm["timestamp"] = item["timestamp"]
    
    return farm

## Access json files in GCP bucket

In [10]:
# List JSON files in the GCS bucket
storage_client = storage.Client()
bucket = storage_client.get_bucket(raw_data_bucket)


In [11]:
# farm_data = []
# product_offers = list()

# blobs = bucket.list_blobs()
# for blob in blobs:
#     if blob.name.endswith('.json'):
#         uri = f"gs://{raw_data_bucket}/{blob.name}"
#         json_data_string = blob.download_as_string().decode("utf-8")
#         json_data = json.loads(json_data_string)

#         # extract data for each farm
#         farm_id = json_data["_id"]
#         try:
#             farm_data.append(get_farm_data(json_data))
#         except:
#             farm_data.append(farm_id)
#         offers = item["_source"]["vomhof"]["offers"]
#         for offer in offers:
#             product = offer["product"]
#             product_id = product["id"]
#             product_name = product["name"]
#             availability = offer["availability"]
#             if len(availability) > 0:
#                 product_availability = [month["name"] for month in availability]
#             else:
#                 product_availability = "Not specified"

#             product_offers.append((farm_id, product_id, product_name, product_availability))
        

In [85]:

farm_data = []
product_offers = []

blobs = bucket.list_blobs()
for blob in blobs:
    if blob.name.endswith('.json'):
        uri = f"gs://{raw_data_bucket}/{blob.name}"
        json_data_string = blob.download_as_string().decode("utf-8")
        json_data = json.loads(json_data_string)

        # Extract data for each farm
        farm_id = json_data["_id"]
        try:
            farm_data.append(get_farm_data(json_data))
        except Exception as e:
            
            print(f"Error processing farm data for farm ID {farm_id}: {e}")
            farm_data.append(farm_id)


        # 
        # farm_data.append(farm_id)  # Append farm_id regardless of the success of get_farm_data

        # try:
        #     farm_info = get_farm_data(json_data)
        #     farm_data.append(farm_info)
        # except Exception as e:
        #     print(f"Error processing farm data for farm ID {farm_id}: {e}")

        #offers = json_data["_source"]["vomhof"]["offers"]  # Access offers directly from json_data

        # Extract offers if present
        source = json_data.get("_source")
        if source is None:
            print("'_source' key not found in JSON data.")
            continue

        vomhof = source.get("vomhof")
        if vomhof is None:
            print("'vomhof' key not found in JSON data.")
            continue

        offers = vomhof.get("offers")
        if offers is None:
            print("'offers' key not found in JSON data.")
            continue


        for offer in offers:
            product = offer["product"]
            product_id = product["id"]
            product_name = product["name"]
            availability = offer["availability"]
            if len(availability) > 0:
                product_availability = [month["name"] for month in availability]
            else:
                product_availability = "Not specified"

            product_offers.append((farm_id, product_id, product_name, product_availability))



'vomhof' key not found in JSON data.
'vomhof' key not found in JSON data.
'vomhof' key not found in JSON data.
'vomhof' key not found in JSON data.
'vomhof' key not found in JSON data.
'vomhof' key not found in JSON data.


In [86]:
# farm_data = []
# product_offers = list()

# for i, item in enumerate(data["hits"]):
#     farm_id = item["_id"]
#     try:
#         farm_data.append(get_farm_data(item))
#     except:
#         farm_data.append(farm_id)
#     offers = item["_source"]["vomhof"]["offers"]
#     for offer in offers:
#         product = offer["product"]
#         product_id = product["id"]
#         product_name = product["name"]
#         availability = offer["availability"]
#         if len(availability) > 0:
#             product_availability = [month["name"] for month in availability]
#         else:
#             product_availability = "Not specified"

#         product_offers.append((farm_id, product_id, product_name, product_availability))
#         #print(product)
#         #print(product_availability)

In [87]:
columns = ["farm_id", "product_id", "product_name", "product_availability"]
product_df = pd.DataFrame.from_records(product_offers, columns=columns)
product_df

Unnamed: 0,farm_id,product_id,product_name,product_availability
0,10002,344,Cheminée-Holz,Not specified
1,10002,13,Hühnereier,Not specified
2,10002,222,Lammfleisch,[Februar]
3,10007,21,Apéro / Catering / Partyservice,Not specified
4,10007,31,Butter,Not specified
...,...,...,...,...
22752,14875,26,Rindfleisch,Not specified
22753,14875,193,Teigwaren,Not specified
22754,14875,16,Wurstwaren,Not specified
22755,14875,185,Zopf,Not specified


In [88]:
product_df.farm_id.value_counts()

farm_id
9966     89
857      82
11097    78
13170    74
1967     74
         ..
13515     1
1179      1
11797     1
13506     1
8981      1
Name: count, Length: 2864, dtype: int64

In [113]:
product_df["farm_id"] = product_df["farm_id"].astype("int")
product_df["product_id"] = product_df["product_id"].astype("int")
product_df["product_availability"] = product_df["product_availability"].astype("str") # needed for conversion to parquet
product_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22757 entries, 0 to 22756
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   farm_id               22757 non-null  int64 
 1   product_id            22757 non-null  int64 
 2   product_name          22757 non-null  object
 3   product_availability  22757 non-null  object
dtypes: int64(2), object(2)
memory usage: 711.3+ KB


In [114]:
farm_df = pd.DataFrame.from_records(farm_data)
farm_df

Unnamed: 0,farm_id,farm_name,first_name,last_name,street,zip,city,canton,telephone,mobile,email,website,facebook_link,timestamp
0,10002,Hof Wolfenberg,Barbara & Martin,Venzin,Wolfenbergstrasse,8426,Lufingen,ZH,044 803 12 73,079 836 03 48,martin.venzin@wolfenberg.ch,http://www.wolfenberg.ch,,2024-03-09
1,10007,Alpkäserei Risch /under Münenberg,Ueli & Martha,Bieri-Wicki,Under Münenberg 1,6162,Rengg,LU,041 480 37 75,,marbi2@bluewin.ch,http://www.alpkaeserisch.ch,,2024-03-09
2,10009,HO DELice du Pré-Mermoud,Chantal & Daniel,Hodel,Pré-Mermoud 1,1580,Avenches,VD,,079 360 54 24,chantal.daniel@hotmail.com,https://www.hodelice.ch,,2024-03-09
3,10011,Scheidweg,Käthi & Röbi,Messmer,Hagenbuch 5,8577,Schönholzerswilen,TG,,,rkmessmer@bluewin.ch,,,2024-03-09
4,10013,Zebuhof,Nadia & Kari,Bürgi-Schelbert,Bergstrasse 12,6424,Lauerz,SZ,041 811 18 56,079 225 29 89,info@zebuhof.ch,http://www.zebuhof.ch,https://www.facebook.com/Zebubuur/,2024-03-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2934,9990,Laueli / Alp Wittenlauenen,Erika & Theo,Emmenegger-Bucher,Laueli 1,6174,Sörenberg,LU,041 488 00 23,078 926 06 05,,http://alpgruss.ch,https://www.facebook.com/Alpgruss-101220415403839,2024-03-09
2935,9991,Zimbel,Cornelia & Martin,Keiser-Schneider,Zimbel,6340,Baar,ZG,,079 614 79 03,,,,2024-03-09
2936,9995,Gantlihof,"Fidel, Elisabeth & Christian",Kenel,Sonnenbergstrasse 2,6415,Arth,SZ,041 855 41 34,079 344 91 29,,http://www.gantlihof.ch,,2024-03-09
2937,9999,Vetsch Hattenhausen,Thomas,Vetsch,Fischbachstrasse 7,8564,Hattenhausen,TG,052 763 38 32,079 318 83 65,monikavetsch@bluewin.ch,,,2024-03-09


In [115]:
farm_df["farm_id"]= farm_df["farm_id"].astype("int")
#farm_df["timestamp"]= date = pd.to_datetime(farm_df["timestamp"])

In [116]:
farm_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2939 entries, 0 to 2938
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   farm_id        2939 non-null   int64 
 1   farm_name      2939 non-null   object
 2   first_name     2939 non-null   object
 3   last_name      2939 non-null   object
 4   street         2939 non-null   object
 5   zip            2939 non-null   object
 6   city           2939 non-null   object
 7   canton         2939 non-null   object
 8   telephone      2939 non-null   object
 9   mobile         2939 non-null   object
 10  email          2939 non-null   object
 11  website        2939 non-null   object
 12  facebook_link  995 non-null    object
 13  timestamp      2939 non-null   object
dtypes: int64(1), object(13)
memory usage: 321.6+ KB


In [117]:
farm_df.head()

Unnamed: 0,farm_id,farm_name,first_name,last_name,street,zip,city,canton,telephone,mobile,email,website,facebook_link,timestamp
0,10002,Hof Wolfenberg,Barbara & Martin,Venzin,Wolfenbergstrasse,8426,Lufingen,ZH,044 803 12 73,079 836 03 48,martin.venzin@wolfenberg.ch,http://www.wolfenberg.ch,,2024-03-09
1,10007,Alpkäserei Risch /under Münenberg,Ueli & Martha,Bieri-Wicki,Under Münenberg 1,6162,Rengg,LU,041 480 37 75,,marbi2@bluewin.ch,http://www.alpkaeserisch.ch,,2024-03-09
2,10009,HO DELice du Pré-Mermoud,Chantal & Daniel,Hodel,Pré-Mermoud 1,1580,Avenches,VD,,079 360 54 24,chantal.daniel@hotmail.com,https://www.hodelice.ch,,2024-03-09
3,10011,Scheidweg,Käthi & Röbi,Messmer,Hagenbuch 5,8577,Schönholzerswilen,TG,,,rkmessmer@bluewin.ch,,,2024-03-09
4,10013,Zebuhof,Nadia & Kari,Bürgi-Schelbert,Bergstrasse 12,6424,Lauerz,SZ,041 811 18 56,079 225 29 89,info@zebuhof.ch,http://www.zebuhof.ch,https://www.facebook.com/Zebubuur/,2024-03-09


## Saving clean data to GCP bucket in parquet

In [118]:
farms_parquet_file_path = "farms.parquet"
offers_parquet_file_path = "offers.parquet"

In [119]:
# Write the DataFrame to a Parquet file
with storage_client.bucket(clean_data_bucket).blob(farms_parquet_file_path).open("wb") as file:
    pq.write_table(pa.Table.from_pandas(farm_df), file, compression='snappy')

print(f"DataFrame saved to Parquet file: gs://{clean_data_bucket}/{farms_parquet_file_path}")



DataFrame saved to Parquet file: gs://farm-screener-clean/farms.parquet


In [120]:

with storage_client.bucket(clean_data_bucket).blob(offers_parquet_file_path).open("wb") as file:
    pq.write_table(pa.Table.from_pandas(product_df), file, compression='snappy')

print(f"DataFrame saved to Parquet file: gs://{clean_data_bucket}/{offers_parquet_file_path}")

DataFrame saved to Parquet file: gs://farm-screener-clean/offers.parquet


## EDA on collected data

In [121]:
# Read the Parquet file from GCS
with storage_client.bucket(clean_data_bucket).blob(farms_parquet_file_path).open("rb") as file:
    farms_parquet_table = pq.read_table(file)

# Convert the Parquet table to a pandas DataFrame
new_farms_df = farms_parquet_table.to_pandas()

In [122]:
new_farms_df

Unnamed: 0,farm_id,farm_name,first_name,last_name,street,zip,city,canton,telephone,mobile,email,website,facebook_link,timestamp
0,10002,Hof Wolfenberg,Barbara & Martin,Venzin,Wolfenbergstrasse,8426,Lufingen,ZH,044 803 12 73,079 836 03 48,martin.venzin@wolfenberg.ch,http://www.wolfenberg.ch,,2024-03-09
1,10007,Alpkäserei Risch /under Münenberg,Ueli & Martha,Bieri-Wicki,Under Münenberg 1,6162,Rengg,LU,041 480 37 75,,marbi2@bluewin.ch,http://www.alpkaeserisch.ch,,2024-03-09
2,10009,HO DELice du Pré-Mermoud,Chantal & Daniel,Hodel,Pré-Mermoud 1,1580,Avenches,VD,,079 360 54 24,chantal.daniel@hotmail.com,https://www.hodelice.ch,,2024-03-09
3,10011,Scheidweg,Käthi & Röbi,Messmer,Hagenbuch 5,8577,Schönholzerswilen,TG,,,rkmessmer@bluewin.ch,,,2024-03-09
4,10013,Zebuhof,Nadia & Kari,Bürgi-Schelbert,Bergstrasse 12,6424,Lauerz,SZ,041 811 18 56,079 225 29 89,info@zebuhof.ch,http://www.zebuhof.ch,https://www.facebook.com/Zebubuur/,2024-03-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2934,9990,Laueli / Alp Wittenlauenen,Erika & Theo,Emmenegger-Bucher,Laueli 1,6174,Sörenberg,LU,041 488 00 23,078 926 06 05,,http://alpgruss.ch,https://www.facebook.com/Alpgruss-101220415403839,2024-03-09
2935,9991,Zimbel,Cornelia & Martin,Keiser-Schneider,Zimbel,6340,Baar,ZG,,079 614 79 03,,,,2024-03-09
2936,9995,Gantlihof,"Fidel, Elisabeth & Christian",Kenel,Sonnenbergstrasse 2,6415,Arth,SZ,041 855 41 34,079 344 91 29,,http://www.gantlihof.ch,,2024-03-09
2937,9999,Vetsch Hattenhausen,Thomas,Vetsch,Fischbachstrasse 7,8564,Hattenhausen,TG,052 763 38 32,079 318 83 65,monikavetsch@bluewin.ch,,,2024-03-09


In [123]:
# Read the Parquet file from GCS
with storage_client.bucket(clean_data_bucket).blob(offers_parquet_file_path).open("rb") as file:
    offers_parquet_table = pq.read_table(file)

# Convert the Parquet table to a pandas DataFrame
new_offers_df = offers_parquet_table.to_pandas()

In [124]:
new_offers_df

Unnamed: 0,farm_id,product_id,product_name,product_availability
0,10002,344,Cheminée-Holz,Not specified
1,10002,13,Hühnereier,Not specified
2,10002,222,Lammfleisch,['Februar']
3,10007,21,Apéro / Catering / Partyservice,Not specified
4,10007,31,Butter,Not specified
...,...,...,...,...
22752,14875,26,Rindfleisch,Not specified
22753,14875,193,Teigwaren,Not specified
22754,14875,16,Wurstwaren,Not specified
22755,14875,185,Zopf,Not specified


In [125]:
new_offers_df.farm_id.value_counts

<bound method IndexOpsMixin.value_counts of 0        10002
1        10002
2        10002
3        10007
4        10007
         ...  
22752    14875
22753    14875
22754    14875
22755    14875
22756    14875
Name: farm_id, Length: 22757, dtype: int64>

In [126]:
new_offers_df["product_name"].value_counts()

product_name
Hühnereier       1093
Rindfleisch       814
Apfelsaft         692
Konfitüren        645
Kartoffeln        557
                 ... 
Häute               1
Estragon            1
Gojibeeren          1
Pferdefleisch       1
Preiselbeeren       1
Name: count, Length: 318, dtype: int64

In [127]:
merged_df = pd.merge(new_farms_df, new_offers_df, on='farm_id', how='inner')

In [128]:
merged_df.tail()

Unnamed: 0,farm_id,farm_name,first_name,last_name,street,zip,city,canton,telephone,mobile,email,website,facebook_link,timestamp,product_id,product_name,product_availability
22780,14875,Guldenberg,Anita & Michael,Lienhard,Guldenberg 120,8424,Embrach,ZH,044 865 23 46,,anita.lienhard@guldenberg.ch,,https://www.facebook.com/profile.php?id=100067...,value,26,Rindfleisch,Not specified
22781,14875,Guldenberg,Anita & Michael,Lienhard,Guldenberg 120,8424,Embrach,ZH,044 865 23 46,,anita.lienhard@guldenberg.ch,,https://www.facebook.com/profile.php?id=100067...,value,193,Teigwaren,Not specified
22782,14875,Guldenberg,Anita & Michael,Lienhard,Guldenberg 120,8424,Embrach,ZH,044 865 23 46,,anita.lienhard@guldenberg.ch,,https://www.facebook.com/profile.php?id=100067...,value,16,Wurstwaren,Not specified
22783,14875,Guldenberg,Anita & Michael,Lienhard,Guldenberg 120,8424,Embrach,ZH,044 865 23 46,,anita.lienhard@guldenberg.ch,,https://www.facebook.com/profile.php?id=100067...,value,185,Zopf,Not specified
22784,14875,Guldenberg,Anita & Michael,Lienhard,Guldenberg 120,8424,Embrach,ZH,044 865 23 46,,anita.lienhard@guldenberg.ch,,https://www.facebook.com/profile.php?id=100067...,value,156,Zwetschgen,Not specified


In [129]:
merged_df.tail()

Unnamed: 0,farm_id,farm_name,first_name,last_name,street,zip,city,canton,telephone,mobile,email,website,facebook_link,timestamp,product_id,product_name,product_availability
22780,14875,Guldenberg,Anita & Michael,Lienhard,Guldenberg 120,8424,Embrach,ZH,044 865 23 46,,anita.lienhard@guldenberg.ch,,https://www.facebook.com/profile.php?id=100067...,value,26,Rindfleisch,Not specified
22781,14875,Guldenberg,Anita & Michael,Lienhard,Guldenberg 120,8424,Embrach,ZH,044 865 23 46,,anita.lienhard@guldenberg.ch,,https://www.facebook.com/profile.php?id=100067...,value,193,Teigwaren,Not specified
22782,14875,Guldenberg,Anita & Michael,Lienhard,Guldenberg 120,8424,Embrach,ZH,044 865 23 46,,anita.lienhard@guldenberg.ch,,https://www.facebook.com/profile.php?id=100067...,value,16,Wurstwaren,Not specified
22783,14875,Guldenberg,Anita & Michael,Lienhard,Guldenberg 120,8424,Embrach,ZH,044 865 23 46,,anita.lienhard@guldenberg.ch,,https://www.facebook.com/profile.php?id=100067...,value,185,Zopf,Not specified
22784,14875,Guldenberg,Anita & Michael,Lienhard,Guldenberg 120,8424,Embrach,ZH,044 865 23 46,,anita.lienhard@guldenberg.ch,,https://www.facebook.com/profile.php?id=100067...,value,156,Zwetschgen,Not specified


In [130]:
merged_df.iloc[100:130, :]

Unnamed: 0,farm_id,farm_name,first_name,last_name,street,zip,city,canton,telephone,mobile,email,website,facebook_link,timestamp,product_id,product_name,product_availability
100,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,253,Apfelsaft,Not specified
101,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,280,Hand- und Kunsthandwerk,Not specified
102,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,55,Karotten,Not specified
103,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,3,Kartoffeln,Not specified
104,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,515,Kirsch,"['Januar', 'Februar', 'März', 'April', 'Mai', ..."
105,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,150,Kirschen,Not specified
106,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,57,Knoblauch,Not specified
107,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,144,Kräutermischungen,Not specified
108,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,62,Kürbis,Not specified
109,10038,La Ferme des meules,Olivier,Amaudruz,Chemin de Jorattez 12,1052,Le Mont-sur-Lausanne,VD,,079 229 14 46,oamaudruz@hotmail.com,,,2024-03-09,151,Melonen,Not specified


In [131]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22785 entries, 0 to 22784
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   farm_id               22785 non-null  int64 
 1   farm_name             22785 non-null  object
 2   first_name            22785 non-null  object
 3   last_name             22785 non-null  object
 4   street                22785 non-null  object
 5   zip                   22785 non-null  object
 6   city                  22785 non-null  object
 7   canton                22785 non-null  object
 8   telephone             22785 non-null  object
 9   mobile                22785 non-null  object
 10  email                 22785 non-null  object
 11  website               22785 non-null  object
 12  facebook_link         9079 non-null   object
 13  timestamp             22785 non-null  object
 14  product_id            22785 non-null  int64 
 15  product_name          22785 non-null

In [132]:
honig_farm = merged_df.loc[(merged_df['product_name'] == 'Honig') & (merged_df['canton'] == 'TG')]

In [141]:
canton = "VD"
product = "Honig"#, "Milch" #"Honig" # "Speiseöl"
product_id = 13

In [142]:
farm_ids = new_offers_df.loc[new_offers_df['product_name'] == product, 'farm_id']
len(farm_ids)

549

In [143]:
data_filter = ((merged_df["canton"]==canton) & (merged_df["product_name"]== product))
data_filter_id = ((merged_df["canton"]==canton) & (merged_df["product_id"]== product_id))

In [144]:
merged_df[data_filter]

Unnamed: 0,farm_id,farm_name,first_name,last_name,street,zip,city,canton,telephone,mobile,email,website,facebook_link,timestamp,product_id,product_name,product_availability
250,10073,Agriboutix,Xavier,Bignens,Bauloz 39,1188,Gimel,VD,,079 351 58 37,x.bignens@gmail.com,http://www.agriboutix.ch,https://www.facebook.com/Agriboutix/,2024-03-09,258,Honig,Not specified
693,10203,La Cahute-famille Nicod Self de produits ferm...,Laurent & Françoise,Nicod,Route du Village 1,1085,Vulliens,VD,,078 601 61 78,lotch@bluewin.ch,,https://www.facebook.com/lacahutevulliens/,2024-03-09,258,Honig,Not specified
1837,10576,Clos Des Grillières,Philippe & Franck,Nicole,Rue du Château 10,1354,Montcherand,VD,,079 425 11 50,site@vins-nicole.ch,http://www.vins-nicole.ch,,2024-03-09,258,Honig,Not specified
2393,10773,Marché Cuendet,Mathieu,Cuendet,Route de Bussigny 66,1121,Bremblens,VD,021 803 04 98,079 777 31 14,info@marche-cuendet.ch,http://www.marche-cuendet.ch,,2024-03-09,258,Honig,Not specified
2506,10830,Schwander-fruits,Pierre-Alain,Schwander,Route de Cheseaux 12,1400,Cheseaux-Noréaz,VD,024 426 07 91,078 764 30 05,pschwander@sunrise.ch,http://www.schwander-fruits.ch,,2024-03-09,258,Honig,Not specified
4008,11302,Ferme Arc-en-Ciel,Christine & Cédric,Chezeaux,Rue du Merelez 1,1326,Juriens,VD,024 453 10 67,079 261 44 49,info@fermearcenciel.ch,http://www.fermearcenciel.ch/,https://www.facebook.com/FermeArcenCielJuriens/,2024-03-09,258,Honig,Not specified
5407,11763,Pascal et Rosalba Saveurs Rufer,Pascal & Rosalba,Rufer,Ruelle de l'Eglise 4,1114,Colombier,VD,,079 775 89 49,singarella@hotmail.com,http://www.les-saveurs-epicuriennes.ch,https://www.facebook.com/lessaveursepicuriennes/,2024-03-09,258,Honig,Not specified
7655,12554,Domaine des Rosses,Alain et Fabienne,Rochat,Chemin de la Croix-aux-Femmes 1,1136,Bussy-chardonney,VD,,079 385 19 75,domainedesrosses@bluewin.ch,http://www.domainedesrosses.ch,https://m.facebook.com/pages/category/Communit...,2024-03-09,258,Honig,Not specified
7674,12569,"""En Haut de la Dérupe""","Christian, Christelle & Joël",Buffat-Débieux,Chemin du Méregniau 21,1041,Poliez-Pittet,VD,,079 404 95 54,christelle.debieux@gmail.com,,,2024-03-09,258,Honig,"['Juni', 'August']"
8831,13102,Ferme de la Coudre,David & Maude,Chollet,Route De La Coudre 58,1613,Maracon,VD,,079 772 11 75,info@fermedelacoudre.ch,http://www.fermedelacoudre.ch,https://www.facebook.com/Ferme-de-la-Coudre-28...,2024-03-09,258,Honig,Not specified


In [38]:
merged_df[(merged_df["canton"]==canton) & (merged_df["product_name"]== product)]
#merged_df[(merged_df["canton"]==canton) & (merged_df["product_name"]== product)]

Unnamed: 0,farm_id,product_id,product_name,product_availability,farm_name,first_name,last_name,street,zip,city,canton,telephone,mobile,email,website,facebook_link,timestamp


In [39]:
merged_df.loc[merged_df["farm_id"]==9980, "product_name"]

Series([], Name: product_name, dtype: object)