In [35]:
!pip install minio
!pip install boto3
!pip install pyspark requests
!pip install pyspark kafka-python
!pip install confluent_kafka
!pip install newsdataapi

Collecting newsdataapi
  Downloading newsdataapi-0.1.18-py3-none-any.whl (7.8 kB)
Installing collected packages: newsdataapi
Successfully installed newsdataapi-0.1.18


In [6]:
from pyspark.sql import SparkSession

minio_url = "http://minio:9000"
access_key = "minio"
secret_key = "SU2orange!"
kafka_topic = "news"
spark = SparkSession.builder \
    .appName("MinioSpark") \
    .config("spark.hadoop.fs.s3a.endpoint", minio_url) \
    .config("spark.hadoop.fs.s3a.access.key", access_key)\
    .config("spark.hadoop.fs.s3a.secret.key", secret_key) \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.path.style.access", True)\
    .config("spark.hadoop.fs.s3a.fast.upload", True)\
    .config("spark.sql.streaming.kafka.useDeprecatedOffsetFetching", "true") \
    .config("spark.sql.streaming.checkpointLocation", "/tmp/spark/checkpoint") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2")\
    .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [49]:
import requests
import json
from kafka import KafkaProducer, KafkaConsumer
import time, re
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StringType, StructType, StructField
from kafka.admin import KafkaAdminClient, NewTopic
from minio import Minio
from minio.error import S3Error
from datetime import datetime
from io import BytesIO

In [8]:
producer = KafkaProducer(
    bootstrap_servers='broker:29092',
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)

admin_client = KafkaAdminClient(bootstrap_servers = "broker:29092")
topic = NewTopic(name=kafka_topic, num_partitions=1, replication_factor=1)
try:
    admin_client.create_topics(new_topics=[topic], validate_only=False)
    print(f"Topic '{kafka_topic}' created successfully")
except Exception as e:
    print(f"Topic '{kafka_topic}' may already exist or an error occurred: {e}")

Topic 'news' may already exist or an error occurred: [Error 36] TopicAlreadyExistsError: Request 'CreateTopicsRequest_v3(create_topic_requests=[(topic='news', num_partitions=1, replication_factor=1, replica_assignment=[], configs=[])], timeout=30000, validate_only=False)' failed with response 'CreateTopicsResponse_v3(throttle_time_ms=0, topic_errors=[(topic='news', error_code=36, error_message="Topic 'news' already exists.")])'.


In [9]:
news_api_key = "9a09fc4de8ac40779d994d79451cfae3"

In [10]:
schema = StructType([
    StructField("title", StringType(), True),
    StructField("description", StringType(), True),
    StructField("content", StringType(), True),
    StructField("url", StringType(), True),
    StructField("publishedAt", StringType(), True)
])

In [11]:
def fetch_news(api_key,producer,kafka_topic,news_topic):
    url =f"https://newsapi.org/v2/everything?q={news_topic}&apiKey={api_key}"
    response = requests.get(url)
    if response.status_code == 200:
        articles = response.json().get('articles', [])
        if articles:
            for article in articles:
                producer.send(kafka_topic, value=article)
            producer.flush()
            print(f"Sent {len(articles)} articles to Kafka topic '{kafka_topic}'")
        else:
            print(f"No articles found for topic: {news_topic}")
    else:
        print(f"Error fetching news: {response.status_code}")
        

In [51]:
if __name__ == "__main__":
    while True:
        fetch_news(news_api_key, producer, "news","olympics")
        time.sleep(360)  

Sent 100 articles to Kafka topic 'news'


KeyboardInterrupt: 

In [52]:
consumer_olympics = KafkaConsumer(
    "news",
    bootstrap_servers="broker:29092",
    value_deserializer=lambda x: json.loads(x.decode('utf-8')),
    auto_offset_reset='earliest',
    enable_auto_commit=True,
    group_id='olympics'     
)

In [53]:
minio_client = Minio(
    "minio:9000",
    access_key=access_key,
    secret_key=secret_key,
    secure=False
)

In [54]:
def upload_to_minio(article):
    published_date = datetime.fromisoformat(article['publishedAt'][:-1])
    bucket_name = published_date.strftime("%Y%m%d")
    try:
        if not minio_client.bucket_exists(bucket_name):
            minio_client.make_bucket(bucket_name)
            print(f"Bucket '{bucket_name}' created.")
    except S3Error as e:
        print(f"Error creating bucket: {e}")
    title = re.sub(r'[^\w\s]', '_', article['title']).replace(' ', '_')
    
    object_name = f"{title}-{published_date.strftime('%H%M%S')}.json"
    article_json = json.dumps(article)
    article_bytes = article_json.encode('utf-8')
    article_stream = BytesIO(article_bytes)
    
    minio_client.put_object(
        bucket_name,
        object_name,
        article_stream,
        length=len(article_bytes),
        content_type='application/json'
    )
    print(f"Uploaded article to {bucket_name}/{object_name}")
    

In [55]:
if __name__ == "__main__":
    print(f"Listening for messages on topic '{kafka_topic}'...")
    try:
        for message in consumer_olympics:
            article = message.value
            upload_to_minio(article)
    except KeyboardInterrupt:
        print("Stopping consumer...")
    finally:
        consumer_olympics.close()

Listening for messages on topic 'news'...
Bucket '20240730' created.
Uploaded article to 20240730/The_Purple_Track_at_the_2024_Summer_Olympic_Games_Has_a_Secret_Ingredient-135632.json
Bucket '20240717' created.
Uploaded article to 20240717/The_Paris_Olympics_Will_Show_Us_the_Future_of_Sports_on_TV-113000.json
Bucket '20240726' created.
Uploaded article to 20240726/How_to_Watch_the_2024_Summer_Olympics_Opening_Ceremony-130059.json
Uploaded article to 20240724/France_to_host_2030_Winter_Olympics___Paralympics-092127.json
Uploaded article to 20240724/Olympics_2024__Get_me_to_Paris-003118.json
Uploaded article to 20240726/French_High_Speed_Rail_Network_Sabotaged_Ahead_of_the_Olympics-151500.json
Bucket '20240728' created.
Uploaded article to 20240728/GB_medal_hopefuls_Eccles___Giles_suffer_heartbreak-132118.json
Uploaded article to 20240728/France_rail_repairs_completed_after_arson_attacks-104046.json
Bucket '20240725' created.
Uploaded article to 20240725/Dressel_has_doping_fears_overs_Ol