## Stock price analysis using Spark

In [1]:
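# Optional one-time setup: uncomment to install the supporting libraries.
# NOTE: pyspark (imported in the next cell) is not in this list and must also be available.
#pip install pandas numpy matplotlib 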

In [81]:
import pyspark
from pyspark.sql import SparkSession

In [82]:
# Obtain the notebook's Spark entry point.
# getOrCreate() hands back an already-active session when one exists,
# so re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName("Stock Price Analysis using Spark")
    .getOrCreate()
)

In [83]:
# Load the stock CSV data from the "StockData" path.
# The first line of each file is treated as the column header; no schema is
# supplied or inferred here, so the columns are read as strings.
stocks = spark.read.option("header", True).csv("StockData")


In [84]:
# Preview the first five rows of the loaded data
stocks.show(n=5)

+------+----------+----------+-------+--------+--------+--------+
|Ticker|      Date|Close/Last| Volume|    Open|    High|     Low|
+------+----------+----------+-------+--------+--------+--------+
| BRK-B|05/31/2023|  $321.08 |6175417|$321.12 |$322.41 |$319.39 |
| BRK-B|05/30/2023|  $322.19 |3232461|$321.86 |$322.47 |$319.00 |
| BRK-B|05/26/2023|  $320.60 |3229873|$320.44 |$322.63 |$319.67 |
| BRK-B|05/25/2023|  $319.02 |4251935|$320.56 |$320.56 |$317.71 |
| BRK-B|05/24/2023|  $320.20 |3075393|$322.71 |$323.00 |$319.56 |
+------+----------+----------+-------+--------+--------+--------+
only showing top 5 rows



In [85]:
# Print the column names and types Spark assigned when loading the data.
# The output below shows every column as a string, including the
# "$"-prefixed price columns and Volume.
stocks.printSchema()

root
 |-- Ticker: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Close/Last: string (nullable = true)
 |-- Volume: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)



In [86]:
# Project a single column (Ticker) and preview its first five values
stocks.select(stocks["Ticker"]).show(n=5)

+------+
|Ticker|
+------+
| BRK-B|
| BRK-B|
| BRK-B|
| BRK-B|
| BRK-B|
+------+
only showing top 5 rows



In [87]:
# Project two columns (Ticker and Date) and preview the first five rows
stocks.select("Ticker", "Date").show(n=5)

+------+----------+
|Ticker|      Date|
+------+----------+
| BRK-B|05/31/2023|
| BRK-B|05/30/2023|
| BRK-B|05/26/2023|
| BRK-B|05/25/2023|
| BRK-B|05/24/2023|
+------+----------+
only showing top 5 rows



In [88]:
# List every distinct ticker symbol present in the data set
unique_tickers = stocks.select("Ticker").distinct()
unique_tickers.show()

+------+
|Ticker|
+------+
| BRK-B|
|  MSFT|
|  META|
|  TSLA|
|  AAPL|
|  AMZN|
| GOOGL|
|  NVDA|
|   TSM|
|     V|
|   QQQ|
|   SPY|
+------+



In [80]:
# Filtering data => keep the MSFT and V rows for one trading day (05/31/2023).
# NOTE: "Date" is a plain string column (see the printSchema output above), so the
# condition below is an exact match on the MM/DD/YYYY text, not a "last month"
# date-range filter.
# Imported here mid-notebook; unused in this cell but kept in case later cells rely on F.
from pyspark.sql import functions as F

# Bind the filtered DataFrame itself to the variable. DataFrame.show() only prints
# and returns None, so chaining .show() onto the assignment would leave the
# variable set to None and unusable in later cells.
microsoftStocksLastMonth = stocks.filter(
    ((stocks.Ticker == 'MSFT') | (stocks.Ticker == 'V')) &
    (stocks["Date"] == '05/31/2023')
)
microsoftStocksLastMonth.show(15)

# Other filter forms (different tickers/dates), kept for reference:
#
#   Two chained filters:
#     microsoftStocks = stocks.filter(stocks.Ticker == 'MSFT')
#     microsoftStocks.filter(microsoftStocks["Date"] == '05/23/2023').show()
#
#   Membership test with isin():
#     stocksIn = stocks.filter(
#         (stocks.Ticker.isin(["MSFT", "AMZN", 'V'])) &
#         (stocks["Date"] == '05/30/2023'))
#     stocksIn.show()

CodeCache: size=131072Kb used=39687Kb max_used=40256Kb free=91384Kb
 bounds [0x0000000104cf0000, 0x0000000107480000, 0x000000010ccf0000]
 total_blobs=14286 nmethods=13344 adapters=853
 compilation: disabled (not enough contiguous free space left)
+------+----------+----------+--------+--------+--------+--------+
|Ticker|      Date|Close/Last|  Volume|    Open|    High|     Low|
+------+----------+----------+--------+--------+--------+--------+
|  MSFT|05/31/2023|  $328.39 |45950550|$332.29 |$335.94 |$327.33 |
|     V|05/31/2023|  $221.03 |20460620|$219.96 |$221.53 |$216.14 |
+------+----------+----------+--------+--------+--------+--------+

+------+----------+----------+--------+--------+--------+--------+
|Ticker|      Date|Close/Last|  Volume|    Open|    High|     Low|
+------+----------+----------+--------+--------+--------+--------+
|  MSFT|05/30/2023|  $331.21 |29503070|$335.23 |$335.74 |$330.52 |
|  AMZN|05/30/2023|  $121.66 |64314810|$122.37 |$122.92 |$119.86 |
|     V|05/30/2

