In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
from transformers import pipeline
from pymongo import MongoClient

In [2]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("Streaming FAANG Stocks news from HDFS") \
    .getOrCreate()

# Initialize sentiment analysis pipeline
sentiment_analyzer = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [3]:
# FAANG stocks list
faang_stocks = ["AAPL", "META", "GOOG", "AMZN", "NFLX"]

# Function to filter FAANG stocks
def filter_faang_stocks(tickers):
    ticker_list = tickers.split(',')
    faang_list = [ticker for ticker in ticker_list if ticker in faang_stocks]
    return ','.join(faang_list) if faang_list else None

# Register UDF for filtering FAANG stocks
filter_faang_udf = udf(filter_faang_stocks, StringType())


In [4]:
# Define a UDF for sentiment analysis
def analyze_sentiment(title):
    if title:
        result = sentiment_analyzer(title)
        return result[0]['label']  # 'LABEL_0' for negative, 'LABEL_1' for positive
    return None

# Register UDF for sentiment analysis
sentiment_udf = udf(analyze_sentiment, StringType())


In [5]:
# Define the schema for incoming data
schema = "title STRING, relatedTickers STRING, publisher STRING, date TIMESTAMP"

# Read the CSV file from HDFS as a stream
streaming_df = spark.readStream \
    .format("csv") \
    .schema(schema) \
    .option("path", "/user/student") \
    .option("header", "true") \
    .load()


In [6]:
# Apply filtering and transformation
filtered_df = streaming_df.withColumn('relatedTickers', filter_faang_udf(col('relatedTickers'))) \
    .filter(col('relatedTickers').isNotNull()) \
    .drop('publisher')

# Apply sentiment analysis
final_df = filtered_df.withColumn('sentiment', sentiment_udf(col('title')))


In [7]:

# Write the processed data to MongoDB
def write_to_mongodb(df, epoch_id):
    pdf = df.toPandas()
    if not pdf.empty:
        client = MongoClient("mongodb+srv://shorok:shorok147@cluster0.mdxcz.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
        db = client["stock_market"]
        collection = db["stocknews"]
        records = pdf.to_dict(orient='records')
        collection.insert_many(records)
        print("Data inserted into MongoDB")
    else:
        print("No data to insert")


In [8]:
# Set up streaming query to write to MongoDB
query = final_df.writeStream \
    .foreachBatch(write_to_mongodb) \
    .outputMode("append") \
    .start()

# Await termination
query.awaitTermination()


2024-09-21 23:17:05,488 WARN streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-b7c8a3e0-feb5-4a04-b298-addf3a712acb. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
                                                                                

Data inserted into MongoDB


                                                                                

Data inserted into MongoDB


                                                                                

No data to insert
No data to insert


                                                                                

No data to insert
No data to insert
No data to insert


                                                                                

Data inserted into MongoDB


                                                                                

Data inserted into MongoDB


                                                                                

Data inserted into MongoDB


KeyboardInterrupt: 