# Kafka Producer Setup for Airbnb Data Flow

## Import Necessary Libraries

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
import requests
import json
from kafka import KafkaProducer
import time

## User Defined Function to Fetch API Data
This user-defined function calls the API to fetch one page of Airbnb listing data, using `limit` and `offset` for pagination.

In [None]:
def fetch_data(api_url, limit, offset, timeout=30):
    """Fetch one page of records from a paginated JSON API.

    Args:
        api_url: Endpoint URL to query.
        limit: Value sent as the ``limit`` query parameter.
        offset: Value sent as the ``offset`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    # The context manager closes the session so its pooled connections are
    # released instead of leaking on every call.
    with requests.Session() as session:
        # Let requests build and encode the query string; the timeout keeps
        # a stalled server from blocking the caller forever.
        response = session.get(
            api_url,
            params={"limit": limit, "offset": offset},
            timeout=timeout,
        )

        if response.status_code == 200:
            return response.json()
        return None

## Constants

In [None]:
# Endpoint that serves the Airbnb listings records.
api_url = "https://public.opendatasoft.com/api/explore/v2.1/catalog/datasets/airbnb-listings/records"
# Page size requested per API call and the starting position of the first page.
limit = 100
offset = 0
# Kafka topic that receives the fetched records.
topic_name = "airbnb-test"

## User Defined Function to Push Messages to Kafka
The function takes a list of Airbnb records and pushes each record in the list to Kafka as a separate message.

In [None]:
# Kafka producer used for publishing; each message value is JSON-encoded
# and sent as UTF-8 bytes.
producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=lambda payload: json.dumps(payload).encode('utf-8'),
)

def send_messages_to_kafka(results):
    """Publish every item of ``results`` to the Kafka topic ``topic_name``.

    Args:
        results: Iterable of JSON-serializable records; each one is sent as
            a separate message through the module-level ``producer``.

    Returns:
        None. All queued messages have been flushed when the function returns.
    """
    for result in results:
        producer.send(topic_name, value=result)
    # Flush once per batch rather than once per message: send() only queues
    # the record, and a flush inside the loop blocks on every single message,
    # which defeats the producer's batching.
    producer.flush()

## Iterative API Calls
Call the Airbnb API in a loop until all 10,000 records have been fetched. The API is called in batches of 100 records, with a 4-second sleep after each call.

In [None]:
# Page through the API in steps of `limit` and publish each page to Kafka.
while offset < 10000:
    resp = fetch_data(api_url, limit, offset)
    # fetch_data returns None on a non-200 response; guard before reading the
    # payload so a failed call stops the loop cleanly instead of raising
    # TypeError on `None['results']`.
    airbnb_list = resp.get('results') if resp else None
    if not airbnb_list:
        print("No more data to fetch, or an error occurred.")
        break
    print(f"pushing {len(airbnb_list)} messages to Kafka topic from offset {offset} to {offset+limit}.")
    send_messages_to_kafka(airbnb_list)
    offset += limit  # Increase the offset for the next batch
    time.sleep(4)  # Pause between API calls