## Conda environment
- `conda create -n [name] python=[python_version]`
- `conda activate [name]`

## Local File Connection

### Initialization of Spark

In [None]:
!pip install jupyterlab
!pip install pyspark --no-cache-dir

In [None]:
import os

# Point PySpark at the local JDK. A raw string is used so the Windows
# backslashes are taken literally instead of being parsed as (invalid)
# escape sequences such as "\P" and "\J".
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk-1.8"
# Interpreter name Spark uses to launch its Python workers.
os.environ["PYSPARK_PYTHON"] = "python"

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext

In [None]:
# Driver-side configuration: run Spark in local mode and add every JAR under
# C:/pyspark/ to the driver classpath (presumably where the PostgreSQL JDBC
# driver used later in this notebook lives; verify on your machine).
conf = (
    SparkConf()
    .setAppName("Spark Sample")
    .setMaster("local")
    .set("spark.driver.extraClassPath", "C:/pyspark/*")
)

# Reuse an already-running context if there is one, then wrap it in a session.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

### Read the local csv

In [None]:
# Load the comma-delimited sales CSV; the first row supplies the column names.
df = (
    spark.read
    .option("delimiter", ",")
    .option("header", True)
    .csv("./sales_generated.csv")
)

In [None]:
# Preview the first five rows without truncating long cell values.
df.show(5, truncate=False)

In [None]:
# Expose the DataFrame to Spark SQL under the name "national_sales".
df.createOrReplaceTempView("national_sales")

In [None]:
# Keep at most ten rows whose market_area is 'Jawa Tengah', then display them.
output = spark.sql(
    "SELECT * FROM national_sales WHERE market_area = 'Jawa Tengah' LIMIT 10"
)
output.show()

In [None]:
"""Write data into Postgres"""
# Destination table (schema-qualified, quoted identifier) for the JDBC write.
dest_tbl = 'public."pyspark_sales_data"'

# Connection settings are read from the environment so real credentials never
# have to be committed with the notebook. The fallbacks are the original local
# development values, so behaviour is unchanged when the variables are unset.
database = os.environ.get("PGDATABASE", "postgres")
password = os.environ.get("PGPASSWORD", "password")
user = os.environ.get("PGUSER", "postgres")

In [None]:
# Write the query result to PostgreSQL over JDBC, replacing the target table.
(
    output.write
    .mode("overwrite")
    .format("jdbc")
    .options(
        url=f"jdbc:postgresql://localhost:5432/{database}",
        dbtable=dest_tbl,
        user=user,
        password=password,
        driver="org.postgresql.Driver",
    )
    .save()
)

In [None]:
# Shut down this Spark session before the next section builds a new one.
spark.stop()

## Local Hadoop Connections

In [None]:
import os

# Interpreter name Spark uses to launch its Python workers.
os.environ["PYSPARK_PYTHON"] = "python"
# Point PySpark at the local JDK. A raw string is used so the Windows
# backslashes are taken literally instead of being parsed as (invalid)
# escape sequences such as "\P" and "\J".
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk-1.8"

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) a session whose default filesystem is the HDFS endpoint at
# hdfs://localhost:9000, with the YARN submit-file replication factor set to 1.
spark = (
    SparkSession.builder
    .appName("Write ORC to HDFS")
    .config("spark.yarn.submit.file.replication", "1")
    .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000")
    .getOrCreate()
)

In [None]:
# Small in-memory sample: one (Name, Age) tuple per row.
data = [
    ("John", 28),
    ("Doe", 23),
    ("Alice", 34),
]
columns = ["Name", "Age"]
df = spark.createDataFrame(data, schema=columns)


In [None]:
# Save the DataFrame as ORC under /orc_files on HDFS, overwriting existing output.
df.write.mode("overwrite").orc("hdfs://localhost:9000/orc_files")

In [None]:
# Shut down the Spark session.
spark.stop()