### **Importing necessary modules**

In [None]:
import os
import warnings

# Make notebook cells echo every top-level expression, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Suppress all Python warnings in the notebook output
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Local install locations of Java, Hadoop and Spark -- adjust each path to match your machine
os.environ.update({
    "JAVA_HOME": "D:/Programs/Java",
    "HADOOP_HOME": "D:/Programs/hadoop",
    "SPARK_HOME": "D:/Programs/spark/spark-3.5.6-bin-hadoop3",
})

# Initialise findspark with the same Spark installation as SPARK_HOME,
# before the pyspark imports below
import findspark
findspark.init(os.environ["SPARK_HOME"])

# PySpark names used by the cells below
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType, IntegerType

### **Setting up multiple executors using Spark's Standalone cluster**
Run the following commands in command prompt to start master and worker nodes

1) Open command prompt in the bin directory of spark \
   `cd D:\Programs\spark\spark-3.5.6-bin-hadoop3\bin`

2) Creating master \
    `spark-class2.cmd org.apache.spark.deploy.master.Master` \
    Visit `http://localhost:8080/` to see your master \
    Copy the master's URL from there, e.g. `spark://192.168.171.138:7077`
    
3) Now create the workers. Open the same bin directory in a command prompt; for each executor you have to open a separate command prompt. Create each worker and connect it to the master by passing the same URL as the master \
    -c 1: means 1 core (specify the number accordingly) \
    -m 1G: means 1GB memory (specify the number accordingly) \
    `spark-class2.cmd org.apache.spark.deploy.worker.Worker -c 1 -m 1G spark://192.168.171.138:7077` \
    `spark-class2.cmd org.apache.spark.deploy.worker.Worker -c 2 -m 2G spark://192.168.171.138:7077` \
    `spark-class2.cmd org.apache.spark.deploy.worker.Worker -c 1 -m 2G spark://192.168.171.138:7077` \
    `spark-class2.cmd org.apache.spark.deploy.worker.Worker -c 2 -m 1G spark://192.168.171.138:7077`

Now that you have created multiple executors, you can set up the Spark session.

### **Setting up spark session**

In [None]:
# Build (or reuse) the Spark session connected to the standalone cluster.
# Pass the master's URL to .master() -- replace it with the URL of your own master.
# The chain is parenthesised instead of using backslash continuations, so no
# trailing "\" can accidentally join the builder expression to the next line.
spark = (
    SparkSession.builder
    .appName("VSCodeSparkSession")
    .master("spark://192.168.171.138:7077")
    .getOrCreate()
)

# Display the session object as the cell output
spark

In [None]:
# Keep a handle on the session's SparkContext and set its log level to ERROR
sc = spark.sparkContext
sc.setLogLevel("ERROR")

### **Execute the Job**

In [None]:
# Change transactions_file to wherever your data resides
transactions_file = "D:/Internship/spark-project-main/pyspark_test_project/data/transactions.parquet"
# Change output_dir to wherever the results should be stored
output_dir = "output/repartition_4"

df_transactions = spark.read.parquet(transactions_file)

# Average transaction amount per city, considering only transactions with amt > 10.
# "amt" is cast to double first so the comparison and the average are numeric.
df_transformed = (
    df_transactions
    .withColumn("amt", F.col("amt").cast(DoubleType()))
    .filter(F.col("amt") > 10)
    .groupBy("city")
    .agg(F.avg("amt").alias("avg_amt"))
)

# Write the result as CSV, overwriting any previous output in output_dir
df_transformed.write.mode("overwrite").csv(output_dir)