In [2]:
import pyspark  

import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql import functions as fn

In [3]:
# Build (or reuse) a session bound to the Spark master at spark-master:7077.
# Only the application name and master URL are configured here; no
# platform-specific options are set.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("SparkAppName")
    .getOrCreate()
)

# Restrict Spark's own logging to ERROR so cell output stays readable.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

print("Spark Session initialized successfully.")

Spark Session initialized successfully.


In [None]:
# Path of the CSV file to analyse.
csv_path = "/app/data/FULL_STOCKS.csv"

# The first row supplies the column names; column types are inferred from the
# data rather than declared up front.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(csv_path)
)

print(f"\nLoaded CSV file: {csv_path}")


Loaded CSV file: /app/data/FULL_STOCKS.csv
Total records: 10000
Total columns: 29
Total records: 10000
Total columns: 29


In [6]:
# Preview the first ten rows of the loaded DataFrame.
print("\nFirst 10 records:")
df.show(n=10)


First 10 records:
+--------------+----------+-----------+------------+----------------+--------+------------------+-----------------+------------------+---------------------+--------+----------+----------+--------------------+------------+----------------+----------+----------+------------+-----------+-------------+-------------------------+-----------------+------------------+------------------------+-------------------+---------------+-----------------+------------------+
|transaction_id| timestamp|customer_id|stock_ticker|transaction_type|quantity|average_trade_size|      stock_price|total_trade_amount|customer_account_type|day_name|is_weekend|is_holiday|stock_liquidity_tier|stock_sector|  stock_industry|day_Friday|day_Monday|day_Thursday|day_Tuesday|day_Wednesday|industry_Asset Management|industry_Hardware|industry_Oil & Gas|industry_Pharmaceuticals|industry_Renewables|industry_Retail|industry_Software|industry_Utilities|
+--------------+----------+-----------+------------+-------

In [None]:
# Question 1: What is the total trading volume for each stock ticker?
# Sums `quantity` per ticker and lists the tickers from largest to smallest total.
print("\n1. Total trading volume for each stock ticker:")
q1_result = (
    df.groupBy("stock_ticker")
    .agg(fn.sum("quantity").alias("total_volume"))
    .orderBy("total_volume", ascending=False)
)
q1_result.show()

In [None]:
# Question 2: What is the average stock price by sector?
# Averages `stock_price` within each sector, highest average first.
print("\n2. Average stock price by sector:")
q2_result = (
    df.groupBy("stock_sector")
    .agg(fn.avg("stock_price").alias("avg_stock_price"))
    .orderBy(fn.col("avg_stock_price").desc())
)
q2_result.show()

In [None]:
# Question 3: How many buy vs sell transactions occurred on weekends?
# Keeps only rows whose `is_weekend` flag equals 1, then counts the non-null
# transaction ids per transaction type.
print("\n3. Buy vs Sell transactions on weekends:")
q3_result = (
    df.where(fn.col("is_weekend") == 1)
    .groupBy("transaction_type")
    .agg(fn.count("transaction_id").alias("transaction_count"))
    .sort("transaction_type")
)
q3_result.show()

In [None]:
# Question 4: Which customers have made more than 10 transactions?
# Counts transactions per customer, keeps customers with a count above 10,
# and lists them from most to least active.
print("\n4. Customers with more than 10 transactions:")
q4_result = (
    df.groupBy("customer_id")
    .agg(fn.count("transaction_id").alias("transaction_count"))
    .where(fn.col("transaction_count") > 10)
    .orderBy("transaction_count", ascending=False)
)
# Note: count() and show() are separate Spark actions, so the aggregation
# above is evaluated once for each of them.
print(f"Total customers with >10 transactions: {q4_result.count()}")
q4_result.show()

In [None]:
# Question 5: What is the total trade amount per day of the week, ordered from highest to lowest?
# The aggregated column reuses the name of the input column it sums.
print("\n5. Total trade amount per day of the week (highest to lowest):")
q5_result = (
    df.groupBy("day_name")
    .agg(fn.sum("total_trade_amount").alias("total_trade_amount"))
    .sort(fn.desc("total_trade_amount"))
)
q5_result.show()

## Spark SQL Analysis Questions

In [None]:
# Expose the DataFrame to Spark SQL as the temporary view `trades`
# (replacing any existing temporary view with that name).
df.createOrReplaceTempView(name="trades")

In [None]:
# SQL Question 1: What are the top 5 most traded stock tickers by total quantity?
# Sums `quantity` per ticker and keeps the five largest totals.
print("SQL 1. Top 5 most traded stock tickers by total quantity:")
sql1_query = """
    SELECT stock_ticker, 
           SUM(quantity) as total_quantity
    FROM trades
    GROUP BY stock_ticker
    ORDER BY total_quantity DESC
    LIMIT 5
"""
sql1_result = spark.sql(sql1_query)
sql1_result.show()

In [None]:
# SQL Question 2: What is the average trade amount by customer account type?
# Also reports how many rows fall under each account type; highest average first.
print("SQL 2. Average trade amount by customer account type:")
sql2_query = """
    SELECT customer_account_type,
           AVG(total_trade_amount) as avg_trade_amount,
           COUNT(*) as transaction_count
    FROM trades
    GROUP BY customer_account_type
    ORDER BY avg_trade_amount DESC
"""
sql2_result = spark.sql(sql2_query)
sql2_result.show()

In [None]:
# SQL Question 3: How many transactions occurred during holidays vs non-holidays?
# Each row is labelled first and the aggregation then groups on that label, so
# every row not flagged with is_holiday = 1 (including NULL flags) lands in a
# single 'Non-Holiday' bucket. Grouping on the raw is_holiday column instead
# would emit one output row per distinct raw value, which can repeat the
# 'Non-Holiday' label.
# Ordering by the label keeps 'Holiday' ahead of 'Non-Holiday', matching the
# previous "is_holiday DESC" ordering for 0/1 data.
print("SQL 3. Transactions during holidays vs non-holidays:")
sql3_result = spark.sql("""
    WITH labelled AS (
        SELECT
            CASE
                WHEN is_holiday = 1 THEN 'Holiday'
                ELSE 'Non-Holiday'
            END AS period_type,
            transaction_id
        FROM trades
    )
    SELECT period_type,
           COUNT(transaction_id) AS transaction_count
    FROM labelled
    GROUP BY period_type
    ORDER BY period_type
""")
sql3_result.show()