In [1]:
!pip install pyspark



In [2]:
from pyspark.sql import SparkSession

In [3]:
spark=SparkSession.builder.getOrCreate()

In [4]:
df = spark.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .load("/content/details/data")

In [5]:
df.show(10)

+-------+----------+----------+--------+--------+---------+--------+
|Company|      Date|Close/Last|  Volume|    Open|     High|     Low|
+-------+----------+----------+--------+--------+---------+--------+
|   AAPL|07/17/2023|   $193.99|50520160| $191.90|  $194.32| $191.81|
|   AAPL|07/14/2023|   $190.69|41616240| $190.23|$191.1799| $189.63|
|   AAPL|07/13/2023|   $190.54|41342340| $190.50|  $191.19| $189.78|
|   AAPL|07-12-2023|   $189.77|60750250| $189.68|  $191.70| $188.47|
|   AAPL|07-11-2023|   $188.08|46638120| $189.16|  $189.30| $186.60|
|   AAPL|07-10-2023|   $188.61|59922160| $189.26|  $189.99|$187.035|
|   AAPL|07-07-2023|   $190.68|46815000| $191.41|  $192.67| $190.24|
|   AAPL|07-06-2023|   $191.81|45156010| $189.84|  $192.02| $189.20|
|   AAPL|07-05-2023|   $191.33|46920260|$191.565|  $192.98| $190.62|
|   AAPL|07-03-2023|   $192.46|31346600| $193.78|  $193.88| $191.76|
+-------+----------+----------+--------+--------+---------+--------+
only showing top 10 rows



In [6]:
df.distinct().count()

25160

In [7]:
df.printSchema()

root
 |-- Company: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Close/Last: string (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)



In [8]:
from pyspark.sql.functions import col,date_format,to_date,max

In [9]:
dups=df.groupBy(df.columns).count().filter(col("count") > 1)
dups.count()

0

In [10]:
df.select("Company").distinct().show()

+-------+
|Company|
+-------+
|   AAPL|
|   CSCO|
|   QCOM|
|   META|
|   TSLA|
|   NFLX|
|    AMD|
|   SBUX|
|   AMZN|
|   MSFT|
+-------+



In [11]:
from pyspark.sql.functions import col, regexp_replace

In [12]:
df = df.withColumn("Date", regexp_replace(col("Date"), "/", "-"))

In [13]:
df.show()

+-------+----------+----------+---------+--------+---------+---------+
|Company|      Date|Close/Last|   Volume|    Open|     High|      Low|
+-------+----------+----------+---------+--------+---------+---------+
|   AAPL|07-17-2023|   $193.99| 50520160| $191.90|  $194.32|  $191.81|
|   AAPL|07-14-2023|   $190.69| 41616240| $190.23|$191.1799|  $189.63|
|   AAPL|07-13-2023|   $190.54| 41342340| $190.50|  $191.19|  $189.78|
|   AAPL|07-12-2023|   $189.77| 60750250| $189.68|  $191.70|  $188.47|
|   AAPL|07-11-2023|   $188.08| 46638120| $189.16|  $189.30|  $186.60|
|   AAPL|07-10-2023|   $188.61| 59922160| $189.26|  $189.99| $187.035|
|   AAPL|07-07-2023|   $190.68| 46815000| $191.41|  $192.67|  $190.24|
|   AAPL|07-06-2023|   $191.81| 45156010| $189.84|  $192.02|  $189.20|
|   AAPL|07-05-2023|   $191.33| 46920260|$191.565|  $192.98|  $190.62|
|   AAPL|07-03-2023|   $192.46| 31346600| $193.78|  $193.88|  $191.76|
|   AAPL|06-30-2023|   $193.97| 85213220| $191.63|  $194.48|  $191.26|
|   AA

In [14]:
df.printSchema()

root
 |-- Company: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Close/Last: string (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)



In [15]:
df = df.withColumn("Date", to_date(col("Date"), "MM-dd-yyyy"))

In [16]:
df.printSchema()

root
 |-- Company: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Close/Last: string (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)



In [36]:
df.distinct().count()

25160

In [17]:
from pyspark.sql.functions import col, min, max
df.agg(min(col("Date"))).first()[0]

datetime.date(2013, 7, 18)

In [18]:
df.agg(max(col("Date"))).first()[0]

datetime.date(2023, 7, 17)

In [19]:
df= df.withColumn("Date", date_format(col("Date"), "MM-yyyy"))

In [20]:
df.show()

+-------+-------+----------+---------+--------+---------+---------+
|Company|   Date|Close/Last|   Volume|    Open|     High|      Low|
+-------+-------+----------+---------+--------+---------+---------+
|   AAPL|07-2023|   $193.99| 50520160| $191.90|  $194.32|  $191.81|
|   AAPL|07-2023|   $190.69| 41616240| $190.23|$191.1799|  $189.63|
|   AAPL|07-2023|   $190.54| 41342340| $190.50|  $191.19|  $189.78|
|   AAPL|07-2023|   $189.77| 60750250| $189.68|  $191.70|  $188.47|
|   AAPL|07-2023|   $188.08| 46638120| $189.16|  $189.30|  $186.60|
|   AAPL|07-2023|   $188.61| 59922160| $189.26|  $189.99| $187.035|
|   AAPL|07-2023|   $190.68| 46815000| $191.41|  $192.67|  $190.24|
|   AAPL|07-2023|   $191.81| 45156010| $189.84|  $192.02|  $189.20|
|   AAPL|07-2023|   $191.33| 46920260|$191.565|  $192.98|  $190.62|
|   AAPL|07-2023|   $192.46| 31346600| $193.78|  $193.88|  $191.76|
|   AAPL|06-2023|   $193.97| 85213220| $191.63|  $194.48|  $191.26|
|   AAPL|06-2023|   $189.59| 46347310| $189.08| 

In [21]:
df.printSchema()

root
 |-- Company: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Close/Last: string (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)



In [22]:
df.createOrReplaceTempView("df1")

In [23]:
df1= spark.sql("SELECT Company, Date, SUM(Volume) AS TotalVolume FROM df1 GROUP BY Company, Date")
df1.show()

+-------+-------+-----------+
|Company|   Date|TotalVolume|
+-------+-------+-----------+
|   MSFT|03-2022|  734347280|
|   CSCO|12-2020|  381559838|
|   META|01-2015|  542679970|
|   AMZN|09-2017| 1178955880|
|   AMZN|12-2014| 1729619840|
|    AMD|01-2018| 1324529750|
|    AMD|10-2015|  205058478|
|   NFLX|09-2015|  496044740|
|   QCOM|05-2018|  236483246|
|   AAPL|08-2020| 4070343860|
|   AAPL|11-2016| 2876917600|
|   SBUX|11-2014|  140191380|
|   META|04-2021|  421280410|
|   META|12-2016|  407956632|
|   TSLA|03-2023| 3312555340|
|   MSFT|06-2020|  765012050|
|   MSFT|10-2016|  613271080|
|   QCOM|06-2016|  209959316|
|   META|02-2022| 1128039520|
|   META|09-2020|  499179590|
+-------+-------+-----------+
only showing top 20 rows



In [24]:
df1.printSchema()

root
 |-- Company: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- TotalVolume: long (nullable = true)



In [25]:
df1 = df1.orderBy(col("Date"))
df1.show()

+-------+-------+-----------+
|Company|   Date|TotalVolume|
+-------+-------+-----------+
|    AMD|01-2014|  770095270|
|   NFLX|01-2014|  600343680|
|   MSFT|01-2014|  930464440|
|   AMZN|01-2014| 1610420640|
|   CSCO|01-2014|  867530980|
|   AAPL|01-2014| 8756058525|
|   QCOM|01-2014|  226257907|
|   SBUX|01-2014|  291127508|
|   TSLA|01-2014| 2755265642|
|   META|01-2014| 1291298300|
|   MSFT|01-2015|  914964080|
|   NFLX|01-2015|  423218741|
|   AMZN|01-2015| 2045892280|
|   CSCO|01-2015|  658881650|
|   TSLA|01-2015| 1346027010|
|   SBUX|01-2015|  266271070|
|   META|01-2015|  542679970|
|   QCOM|01-2015|  274029462|
|   AAPL|01-2015| 5197903240|
|    AMD|01-2015|  443505966|
+-------+-------+-----------+
only showing top 20 rows



In [26]:
df1.printSchema()

root
 |-- Company: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- TotalVolume: long (nullable = true)



In [27]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, dense_rank

In [28]:
df1= df1.withColumn("Rank", dense_rank().over(Window.partitionBy("Date").orderBy(col("TotalVolume").desc())))

In [29]:
df1.show()

+-------+-------+-----------+----+
|Company|   Date|TotalVolume|Rank|
+-------+-------+-----------+----+
|   AAPL|01-2014| 8756058525|   1|
|   TSLA|01-2014| 2755265642|   2|
|   AMZN|01-2014| 1610420640|   3|
|   META|01-2014| 1291298300|   4|
|   MSFT|01-2014|  930464440|   5|
|   CSCO|01-2014|  867530980|   6|
|    AMD|01-2014|  770095270|   7|
|   NFLX|01-2014|  600343680|   8|
|   SBUX|01-2014|  291127508|   9|
|   QCOM|01-2014|  226257907|  10|
|   AAPL|01-2015| 5197903240|   1|
|   AMZN|01-2015| 2045892280|   2|
|   TSLA|01-2015| 1346027010|   3|
|   MSFT|01-2015|  914964080|   4|
|   CSCO|01-2015|  658881650|   5|
|   META|01-2015|  542679970|   6|
|    AMD|01-2015|  443505966|   7|
|   NFLX|01-2015|  423218741|   8|
|   QCOM|01-2015|  274029462|   9|
|   SBUX|01-2015|  266271070|  10|
+-------+-------+-----------+----+
only showing top 20 rows



In [30]:
dfmax = df1.filter(col("Rank") == 1)

In [31]:
dfmax.show(5)

+-------+-------+-----------+----+
|Company|   Date|TotalVolume|Rank|
+-------+-------+-----------+----+
|   AAPL|01-2014| 8756058525|   1|
|   AAPL|01-2015| 5197903240|   1|
|   AAPL|01-2016| 5063363880|   1|
|   AAPL|01-2017| 2232821240|   1|
|   AAPL|01-2018| 2625956840|   1|
+-------+-------+-----------+----+
only showing top 5 rows



In [32]:
dfmax.drop('Rank').show()

+-------+-------+-----------+
|Company|   Date|TotalVolume|
+-------+-------+-----------+
|   AAPL|01-2014| 8756058525|
|   AAPL|01-2015| 5197903240|
|   AAPL|01-2016| 5063363880|
|   AAPL|01-2017| 2232821240|
|   AAPL|01-2018| 2625956840|
|   AAPL|01-2019| 3282131400|
|   TSLA|01-2020| 6124570978|
|   AAPL|01-2021| 2240877030|
|   AAPL|01-2022| 2109283410|
|   TSLA|01-2023| 3897499400|
|   AAPL|02-2014| 5882550042|
|   AAPL|02-2015| 4541143440|
|   AAPL|02-2016| 3233182360|
|   AAPL|02-2017| 2295955960|
|   AAPL|02-2018| 3685038680|
|   TSLA|02-2019| 1904977643|
|   TSLA|02-2020| 7102586599|
|   AAPL|02-2021| 1834022490|
|    AMD|02-2022| 2293819760|
|   TSLA|02-2023| 3625947300|
+-------+-------+-----------+
only showing top 20 rows



In [33]:
from pyspark.sql.functions import split

In [34]:
dfmax = dfmax.withColumn("Month", split(df["Date"], "-")[0])
dfmax = dfmax.withColumn("Year", split(df["Date"], "-")[1])

In [35]:
dfmax.show()

+-------+-------+-----------+----+-----+----+
|Company|   Date|TotalVolume|Rank|Month|Year|
+-------+-------+-----------+----+-----+----+
|   AAPL|01-2014| 8756058525|   1|   01|2014|
|   AAPL|01-2015| 5197903240|   1|   01|2015|
|   AAPL|01-2016| 5063363880|   1|   01|2016|
|   AAPL|01-2017| 2232821240|   1|   01|2017|
|   AAPL|01-2018| 2625956840|   1|   01|2018|
|   AAPL|01-2019| 3282131400|   1|   01|2019|
|   TSLA|01-2020| 6124570978|   1|   01|2020|
|   AAPL|01-2021| 2240877030|   1|   01|2021|
|   AAPL|01-2022| 2109283410|   1|   01|2022|
|   TSLA|01-2023| 3897499400|   1|   01|2023|
|   AAPL|02-2014| 5882550042|   1|   02|2014|
|   AAPL|02-2015| 4541143440|   1|   02|2015|
|   AAPL|02-2016| 3233182360|   1|   02|2016|
|   AAPL|02-2017| 2295955960|   1|   02|2017|
|   AAPL|02-2018| 3685038680|   1|   02|2018|
|   TSLA|02-2019| 1904977643|   1|   02|2019|
|   TSLA|02-2020| 7102586599|   1|   02|2020|
|   AAPL|02-2021| 1834022490|   1|   02|2021|
|    AMD|02-2022| 2293819760|   1|