In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import *
import datetime
# Reuse the already-running Spark session if there is one; otherwise start a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [2]:
# Read the CSV (treating its first row as the header) and register the result
# as the temporary view 'money_dataset' so later cells can fetch it by name.
# No schema is supplied or inferred here, so the columns are loaded as strings.
(
    spark.read
    .option('header', True)
    .csv('/home/jovyan/usd_rub_dataset.csv')
    .createOrReplaceTempView('money_dataset')
)

In [3]:
# Fetch the 'money_dataset' temp view as a Spark DataFrame, then collect every
# row to the driver as a pandas DataFrame so the notebook renders it as a table.
df = spark.table('money_dataset')
df.toPandas()

Unnamed: 0,Date,Price,Open,High,Low,Change %
0,Apr 14 2022,80.9957,79.8675,82.3836,79.8563,1.41%
1,Apr 13 2022,79.8675,79.6800,80.2965,79.4078,0.24%
2,Apr 12 2022,79.6800,79.0650,80.2549,78.6952,0.78%
3,Apr 11 2022,79.0650,76.0800,81.2950,76.0800,3.92%
4,Apr 08 2022,76.0800,75.7500,76.2590,71.3993,0.44%
...,...,...,...,...,...,...
5315,Sep 20 2001,29.4300,29.4550,29.4760,29.4030,-0.08%
5316,Sep 19 2001,29.4530,29.4600,29.4810,29.4300,-0.02%
5317,Sep 18 2001,29.4600,29.4450,29.4750,29.4400,-0.00%
5318,Sep 17 2001,29.4610,29.4600,29.4730,29.4400,-0.04%


In [5]:
# Add typed columns next to the raw string ones:
#   new_date  - 'Date' (formatted like 'Apr 14 2022') parsed as a timestamp
#   new_price - 'Price' cast to double
# to_timestamp parses the string directly, instead of converting to epoch
# seconds with unix_timestamp and casting that number back to a timestamp.
df = (
    df
    .withColumn('new_date', F.to_timestamp('Date', 'MMM dd yyyy'))
    .withColumn('new_price', F.col('Price').cast('double'))
)
df.toPandas()

Unnamed: 0,Date,Price,Open,High,Low,Change %,new_date,new_price
0,Apr 14 2022,80.9957,79.8675,82.3836,79.8563,1.41%,2022-04-14,80.9957
1,Apr 13 2022,79.8675,79.6800,80.2965,79.4078,0.24%,2022-04-13,79.8675
2,Apr 12 2022,79.6800,79.0650,80.2549,78.6952,0.78%,2022-04-12,79.6800
3,Apr 11 2022,79.0650,76.0800,81.2950,76.0800,3.92%,2022-04-11,79.0650
4,Apr 08 2022,76.0800,75.7500,76.2590,71.3993,0.44%,2022-04-08,76.0800
...,...,...,...,...,...,...,...,...
5315,Sep 20 2001,29.4300,29.4550,29.4760,29.4030,-0.08%,2001-09-20,29.4300
5316,Sep 19 2001,29.4530,29.4600,29.4810,29.4300,-0.02%,2001-09-19,29.4530
5317,Sep 18 2001,29.4600,29.4450,29.4750,29.4400,-0.00%,2001-09-18,29.4600
5318,Sep 17 2001,29.4610,29.4600,29.4730,29.4400,-0.04%,2001-09-17,29.4610


In [6]:
# Sanity check: daily prices for February 2022, oldest first.
# month() and year() yield integers, so compare against integer literals rather
# than the strings '02' / '2022', which only match through implicit casting.
# The F.-qualified calls also avoid relying on the star import for these names.
(
    df
    .where(F.month('new_date') == 2)
    .where(F.year('new_date') == 2022)
    .select('new_date', 'new_price')
    .orderBy('new_date')
    .toPandas()
)

Unnamed: 0,new_date,new_price
0,2022-02-01,76.8603
1,2022-02-02,76.0472
2,2022-02-03,76.5108
3,2022-02-04,75.8385
4,2022-02-07,75.4971
5,2022-02-08,75.0091
6,2022-02-09,74.7189
7,2022-02-10,75.0216
8,2022-02-11,77.1905
9,2022-02-14,76.7799


In [7]:
# Summarise new_price per (month, year) of new_date: the average, maximum and
# minimum price, each rounded to two decimal places.
updated_currency = (
    df
    .groupBy(
        F.month('new_date').alias('month'),
        F.year('new_date').alias('year'),
    )
    .agg(
        F.round(F.avg('new_price'), 2).alias('avg_price'),
        F.round(F.max('new_price'), 2).alias('max_price'),
        F.round(F.min('new_price'), 2).alias('min_price'),
    )
)

# Print up to 70 rows, most recent year/month first.
(
    updated_currency
    .orderBy(F.col('year').desc(), F.col('month').desc())
    .show(70)
)

+-----+----+---------+---------+---------+
|month|year|avg_price|max_price|min_price|
+-----+----+---------+---------+---------+
|    4|2022|    80.24|     84.0|    75.75|
|    3|2022|   106.45|    143.0|     83.2|
|    2|2022|    79.94|   106.04|    74.72|
|    1|2022|    76.62|     79.5|    74.47|
|   12|2021|    73.85|    74.65|    73.31|
|   11|2021|    72.94|    75.61|    70.72|
|   10|2021|     71.3|    72.73|    69.54|
|    9|2021|    72.88|    73.46|    72.29|
|    8|2021|    73.59|    74.26|    72.95|
|    7|2021|    73.97|    74.77|    73.14|
|    6|2021|     72.6|    73.51|    71.77|
|    5|2021|     73.9|    74.86|    73.16|
|    4|2021|    76.06|     77.4|    74.41|
|    3|2021|    74.53|    76.58|    72.82|
|    2|2021|    74.39|    76.22|    73.32|
|    1|2021|     74.4|    75.96|    73.22|
|   12|2020|    74.06|     75.9|    72.86|
|   11|2020|    76.75|    80.54|    75.47|
|   10|2020|    77.63|    79.53|    76.12|
|    9|2020|    76.09|    79.07|    73.58|
|    8|2020