# Finance Data Analysis with PySpark

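This notebook demonstrates basic financial data analysis using Apache Spark (PySpark): it loads sample transaction and stock CSV files, aggregates them by category and by symbol, and plots the resulting summaries with matplotlib.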

## Setup

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sum, count, round as spark_round
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Obtain the Spark session used by every cell below. `getOrCreate()` returns
# an already-active session if there is one, otherwise it starts a new one.
spark = (
    SparkSession.builder
    .appName("FinanceAnalyticsNotebook")
    .getOrCreate()
)

# Limit Spark's log output to WARN and above so it does not drown cell output.
spark.sparkContext.setLogLevel("WARN")

## Load Transaction Data

In [None]:
# Read the sample transactions CSV into a Spark DataFrame.
#   header=True      -> the first row supplies the column names
#   inferSchema=True -> Spark scans the data to pick column types
transactions_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../data/sample_transactions.csv")
)

# Quick sanity check: the inferred schema and the first five rows.
transactions_df.printSchema()
transactions_df.show(5)

## Transaction Analysis

In [None]:
# Summarise transactions per category: row count, total amount and mean
# amount (both rounded to 2 decimals), ordered from largest total to smallest.
# Note: `sum` and `avg` are the pyspark.sql.functions column aggregates
# imported in the Setup cell; `sum` therefore is not the Python builtin here.
category_summary = (
    transactions_df
    .groupBy("category")
    .agg(
        count("*").alias("count"),
        spark_round(sum("amount"), 2).alias("total"),
        spark_round(avg("amount"), 2).alias("average"),
    )
    .orderBy(col("total").desc())
)

category_summary.show()

In [None]:
# Collect the per-category summary to the driver as a pandas DataFrame so
# matplotlib can plot it.
category_pd = category_summary.toPandas()

# Bar chart with one bar per category; bar height is the rounded total amount.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(category_pd['category'], category_pd['total'])
ax.set_xlabel('Category')
ax.set_ylabel('Total Amount ($)')
ax.set_title('Total Spending by Category')
# Angle the category names so neighbouring labels do not overlap.
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

## Load Stock Data

In [None]:
# Read the sample stock CSV into a Spark DataFrame.
#   header=True      -> the first row supplies the column names
#   inferSchema=True -> Spark scans the data to pick column types
stocks_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../data/sample_stocks.csv")
)

# Quick sanity check: the inferred schema and the first five rows.
stocks_df.printSchema()
stocks_df.show(5)

## Stock Price Analysis

In [None]:
# Summarise the stock rows per symbol: number of rows (labelled as trading
# days), mean close rounded to 2 decimals, and mean volume rounded to a whole
# number. Output is sorted alphabetically by symbol.
stock_summary = (
    stocks_df
    .groupBy("symbol")
    .agg(
        count("*").alias("trading_days"),
        spark_round(avg("close"), 2).alias("avg_close"),
        spark_round(avg("volume"), 0).alias("avg_volume"),
    )
    .orderBy("symbol")
)

stock_summary.show()

In [None]:
# Collect the per-symbol summary to the driver as a pandas DataFrame so
# matplotlib can plot it.
stock_pd = stock_summary.toPandas()

# Bar chart with one bar per symbol; bar height is the rounded mean close.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(stock_pd['symbol'], stock_pd['avg_close'])
ax.set_xlabel('Stock Symbol')
ax.set_ylabel('Average Close Price ($)')
ax.set_title('Average Closing Prices by Stock')
fig.tight_layout()
plt.show()

## Cleanup

In [None]:
# Stop the Spark session now that the analysis is complete.
# Once stopped, `spark` can no longer run queries; re-run the session-creation
# cell in the Setup section before executing any Spark cell again.
spark.stop()