# Tidsserier med Spark

I denne notebooken vises noen enkle eksempler på hvordan du kan jobbe med tidsserier i PySpark. Denne notebooken er kjørt i prodmiljøet på **Dapla** med `Pyspark (local)`-kernel.

In [1]:
# Importer biblioteker
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import date_format, explode, expr, sequence
from pyspark.sql.types import DateType, DoubleType, StructField, StructType

In [2]:
# Initialize a SparkSession
spark = SparkSession.builder.master("local[1]") \
                    .appName('SparkByExamples.com') \
                    .getOrCreate()

In [3]:
# Generate a sequence of dates
dates_df = spark.range(1).select(
    explode(
        sequence(
            start=expr("date '2000-01-01'"),
            stop=expr("date '2023-08-01'"),
            step=expr("interval 1 month"),
        )
    ).alias("Date")
)
dates_df.show()

+----------+
|      Date|
+----------+
|2000-01-01|
|2000-02-01|
|2000-03-01|
|2000-04-01|
|2000-05-01|
|2000-06-01|
|2000-07-01|
|2000-08-01|
|2000-09-01|
|2000-10-01|
|2000-11-01|
|2000-12-01|
|2001-01-01|
|2001-02-01|
|2001-03-01|
|2001-04-01|
|2001-05-01|
|2001-06-01|
|2001-07-01|
|2001-08-01|
+----------+
only showing top 20 rows



In [4]:
# Genererer random walk data
schema = StructType(
    [StructField(f"serie{i:02d}", DoubleType(), True) for i in range(100)]
)

data = [
    tuple((10 + np.random.normal(0, 1, 100)).cumsum().tolist())
    for _ in range(284)  # 284 months from 2000-01 to 2023-08
]

data_df = spark.createDataFrame(data, schema=schema)

data_df.select("serie00", "serie01").show()

+------------------+------------------+
|           serie00|           serie01|
+------------------+------------------+
| 10.28552497082796| 18.80213440932389|
|   9.2989592234365| 19.29443190926363|
| 9.641691011493942| 18.39265495089296|
|10.095600235251702|19.210869145255476|
|10.598149885398882| 21.04002768054827|
| 9.589494891543382|19.444913286211904|
| 8.949769940462296|17.256517957204697|
|10.268826124760595|21.924880828385007|
| 8.433705199373218| 18.27099790039972|
| 9.591527268314453|18.644221035018294|
| 9.285955387695676| 21.07566671691805|
|10.513149406143489|21.896507412493104|
|10.746275077506867| 19.94937133987704|
|10.422822429410505| 21.00460601739491|
| 9.855151113581227|19.900805138755697|
| 9.907391678199144| 20.09316407526652|
|   11.910837422526|20.458605571861405|
|11.900773767457748|22.034612549696465|
| 9.627286028962596| 19.28652948261321|
|10.636674573465507|20.860349444400374|
+------------------+------------------+
only showing top 20 rows



In [5]:
# Legger til row index til DataFrame før join med dates_df
data_df = data_df.withColumn("row_index", expr("monotonically_increasing_id()"))

# Joiner de to datasettene
df = (
    dates_df.withColumn("row_index", expr("monotonically_increasing_id()"))
    .join(data_df, "row_index")
    .drop("row_index")
)

# Legger til år, kvartal og mnd
df = df.withColumn("Year", date_format(df.Date, "yyyy"))
df = df.withColumn("Quarter", expr("quarter(Date)"))
df = df.withColumn("Month", date_format(df.Date, "MM"))

df.select("Date",  "Year", "Quarter", "Month", "serie00", "serie01").show()

+----------+----+-------+-----+------------------+------------------+
|      Date|Year|Quarter|Month|           serie00|           serie01|
+----------+----+-------+-----+------------------+------------------+
|2000-01-01|2000|      1|   01|10.704067839807966|19.806605259386576|
|2000-02-01|2000|      1|   02| 9.993509489905344| 21.02908289357401|
|2000-03-01|2000|      1|   03|10.844481778491335|21.594184969233666|
|2000-04-01|2000|      2|   04| 9.983524978212996| 20.77975958435706|
|2000-05-01|2000|      2|   05|10.662027203131691|21.298218459469076|
|2000-06-01|2000|      2|   06|11.306200406865427| 22.21276903169298|
|2000-07-01|2000|      3|   07|10.273584558625396|19.559994622897797|
|2000-08-01|2000|      3|   08|10.992524143589362| 20.45220088288665|
|2000-09-01|2000|      3|   09|10.282296888132114|20.861802251700826|
|2000-10-01|2000|      4|   10|10.902266221974365| 22.48748166939777|
|2000-11-01|2000|      4|   11| 8.266550146677737|16.863204651845365|
|2000-12-01|2000|   