In [1]:
import os
from uuid import uuid4

import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import DoubleType, StringType

In [2]:
# JDBC connection settings for the PostgreSQL target.
# Every value can be overridden through an environment variable so that real
# credentials do not have to be stored in the notebook; the fallbacks are the
# previous hard-coded values, so an unconfigured environment behaves as before.
url = os.environ.get("FUEL_DB_URL", "jdbc:postgresql://db:5432/fuel_analysis")
properties = {
    "user": os.environ.get("FUEL_DB_USER", "root"),
    "password": os.environ.get("FUEL_DB_PASSWORD", "root"),
    "driver": "org.postgresql.Driver",
}

In [50]:
def create_uuid():
    """Generate a new random UUID (via ``uuid4``) and return it as a string."""
    fresh_id = uuid4()
    return str(fresh_id)

# Spark UDF returning a new UUID string per invocation.
# create_uuid() yields a different value on every call, so the UDF is flagged
# as non-deterministic; Spark treats UDFs as deterministic by default and may
# otherwise eliminate or duplicate invocations while optimising the plan.
uuid_udf = f.udf(create_uuid, StringType()).asNondeterministic()

In [4]:
# Build (or reuse) the Spark session used by the rest of the notebook.
# It runs with the local[*] master and adds the PostgreSQL JDBC driver jar
# through the spark.jars setting.
spark = (
    SparkSession.builder
    .appName("spark")
    .master("local[*]")
    .config("spark.jars", "/usr/local/spark/jars/postgresql-42.7.3.jar")
    .getOrCreate()
)


In [63]:
def extract_data(path):
    """Read a semicolon-separated CSV file with a header row into a DataFrame.

    Column types are inferred by Spark instead of being declared up front.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The DataFrame returned by the module-level ``spark`` session's CSV reader.
    """
    csv_options = {"sep": ";", "inferSchema": True, "header": True}
    return spark.read.csv(path, **csv_options)

def transform_data(df):
    """Clean the raw frame and add calendar columns derived from its date.

    Steps applied, in order:
      * ``data`` is cast to string and parsed as a ``dd/MM/yyyy`` date.
      * ``valor`` has every comma replaced by a dot and is cast to double.
      * ``dia``, ``mes``, ``ano`` and ``dia_semana`` are added from ``data``
        (day of month, month, year and day of week respectively).

    Args:
        df: DataFrame that has at least the columns ``data`` and ``valor``.

    Returns:
        A new DataFrame with the converted and added columns.
    """
    # Cast to string before parsing so to_date always receives text.
    parsed_date = f.to_date(f.col("data").cast(StringType()), 'dd/MM/yyyy')
    # Swap the comma for a dot first so the cast to double can read the number.
    numeric_valor = f.regexp_replace('valor', ',', '.').cast(DoubleType())

    result = df.withColumn("data", parsed_date).withColumn('valor', numeric_valor)

    # These are added after `data` has been replaced, so they read the parsed date.
    date_col = f.col('data')
    calendar_parts = (
        ('dia', f.dayofmonth),
        ('mes', f.month),
        ('ano', f.year),
        ('dia_semana', f.dayofweek),
    )
    for column_name, extractor in calendar_parts:
        result = result.withColumn(column_name, extractor(date_col))

    return result


In [64]:
# Resolve the input CSV under the data directory and load it as-is.
base_path = '/home/jovyan/data/'
path = f"{base_path}dolar-data-jan-2004-to-dez-2023.csv"
df = extract_data(path)

In [65]:
# Parse `data` as a date, convert `valor` to double and add the
# dia / mes / ano / dia_semana columns (see transform_data).
dollar_info = transform_data(df)

In [66]:
# Append the transformed rows to the `dollar_info` table over JDBC.
# NOTE(review): with append mode, running this cell again inserts the same rows again.
dollar_info.write.mode("append").jdbc(url=url, table="dollar_info", properties=properties)