## Stock price analysis using Spark

In [1]:
#pip install pandas numpy matplotlib 

In [40]:
import pyspark
from pyspark.sql import SparkSession
from datetime import datetime
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, DateType, FloatType

In [41]:
# Build (or reuse, if one already exists) the SparkSession used by this notebook
spark = (
    SparkSession.builder
    .appName("Stock Price Analysis using Spark")
    .getOrCreate()
)

In [42]:
# Load the stock data from the "StockData" CSV source, taking column names from the header row
stocks = spark.read.option("header", True).csv("StockData")


In [43]:
# Displaying the first five rows of the data
stocks.show(n=5)

+------+----------+----------+-------+--------+--------+--------+
|Ticker|      Date|Close/Last| Volume|    Open|    High|     Low|
+------+----------+----------+-------+--------+--------+--------+
| BRK-B|05/31/2023|  $321.08 |6175417|$321.12 |$322.41 |$319.39 |
| BRK-B|05/30/2023|  $322.19 |3232461|$321.86 |$322.47 |$319.00 |
| BRK-B|05/26/2023|  $320.60 |3229873|$320.44 |$322.63 |$319.67 |
| BRK-B|05/25/2023|  $319.02 |4251935|$320.56 |$320.56 |$317.71 |
| BRK-B|05/24/2023|  $320.20 |3075393|$322.71 |$323.00 |$319.56 |
+------+----------+----------+-------+--------+--------+--------+
only showing top 5 rows



In [44]:
# Printing the schema of the data (column names, types and nullability);
# the CSV was read without type inference, so every column is a string here
stocks.printSchema()

root
 |-- Ticker: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Close/Last: string (nullable = true)
 |-- Volume: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)



In [45]:
# Project a single column and preview its first five rows
stocks.select(stocks["Ticker"]).show(n=5)

+------+
|Ticker|
+------+
| BRK-B|
| BRK-B|
| BRK-B|
| BRK-B|
| BRK-B|
+------+
only showing top 5 rows



In [46]:
# Project several columns at once and preview the first five rows
stocks.select("Ticker", "Date").show(n=5)

+------+----------+
|Ticker|      Date|
+------+----------+
| BRK-B|05/31/2023|
| BRK-B|05/30/2023|
| BRK-B|05/26/2023|
| BRK-B|05/25/2023|
| BRK-B|05/24/2023|
+------+----------+
only showing top 5 rows



In [47]:
# List every distinct ticker symbol present in the data
stocks.select(stocks["Ticker"]).distinct().show()

+------+
|Ticker|
+------+
| BRK-B|
|  MSFT|
|  META|
|  TSLA|
|  AAPL|
|  AMZN|
| GOOGL|
|  NVDA|
|   TSM|
|     V|
|   QQQ|
|   SPY|
+------+



In [48]:
# Filtering data => select the rows for the MSFT and V tickers on 05/31/2023
from pyspark.sql import functions as F

# microsoftStocks = stocks.filter(stocks.Ticker == 'MSFT')
# microsoftStocksLastMonth = microsoftStocks.filter(microsoftStocks["Date"] == '05/23/2023').show()
#### OR ####

# Keep the filtered DataFrame itself: DataFrame.show() only prints and returns
# None, so chaining it onto the assignment would bind this variable to None.
microsoftStocksLastMonth = stocks.filter(
    ((stocks.Ticker == 'MSFT') | (stocks.Ticker == 'V')) &
    (stocks["Date"] == '05/31/2023')
)
# Print up to 15 of the matching rows
microsoftStocksLastMonth.show(15)

### OR ###

# stocksIn = stocks.filter(
#     (stocks.Ticker.isin(["MSFT", "AMZN", 'V']))  &
#     (stocks["Date"] == '05/30/2023')).show()

+------+----------+----------+--------+--------+--------+--------+
|Ticker|      Date|Close/Last|  Volume|    Open|    High|     Low|
+------+----------+----------+--------+--------+--------+--------+
|  MSFT|05/31/2023|  $328.39 |45950550|$332.29 |$335.94 |$327.33 |
|     V|05/31/2023|  $221.03 |20460620|$219.96 |$221.53 |$216.14 |
+------+----------+----------+--------+--------+--------+--------+



In [49]:
## Create a UDF to change StringType of Date column to DateType
def _parse_us_date(date):
    """Convert an ``MM/DD/YYYY`` string into a ``datetime.date``.

    Args:
        date: Date text such as ``"05/31/2023"``, or ``None`` for a null cell.

    Returns:
        The parsed ``datetime.date``, or ``None`` when the input is ``None``
        (the Date column is nullable, and ``strptime`` would raise on ``None``).

    Raises:
        ValueError: If a non-null string does not match ``%m/%d/%Y``.
    """
    if date is None:
        return None
    # Surrounding whitespace is tolerated; .date() drops the midnight time
    # component so the value matches the declared DateType exactly.
    return datetime.strptime(date.strip(), "%m/%d/%Y").date()


date_parser = udf(_parse_us_date, DateType())

In [50]:
# Append a typed ParsedDate column derived from the raw Date strings
stocks = stocks.withColumn("ParsedDate", date_parser(stocks["Date"]))

In [52]:
# Preview the first five rows, now including the ParsedDate column
stocks.show(n=5)

+------+----------+----------+-------+--------+--------+--------+----------+
|Ticker|      Date|Close/Last| Volume|    Open|    High|     Low|ParsedDate|
+------+----------+----------+-------+--------+--------+--------+----------+
| BRK-B|05/31/2023|  $321.08 |6175417|$321.12 |$322.41 |$319.39 |2023-05-31|
| BRK-B|05/30/2023|  $322.19 |3232461|$321.86 |$322.47 |$319.00 |2023-05-30|
| BRK-B|05/26/2023|  $320.60 |3229873|$320.44 |$322.63 |$319.67 |2023-05-26|
| BRK-B|05/25/2023|  $319.02 |4251935|$320.56 |$320.56 |$317.71 |2023-05-25|
| BRK-B|05/24/2023|  $320.20 |3075393|$322.71 |$323.00 |$319.56 |2023-05-24|
+------+----------+----------+-------+--------+--------+--------+----------+
only showing top 5 rows



In [53]:
# Check the schema: ParsedDate should be listed with the date type
stocks.printSchema()

root
 |-- Ticker: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Close/Last: string (nullable = true)
 |-- Volume: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- ParsedDate: date (nullable = true)



In [54]:
## Remove the dollar symbol and convert strings to float; leave float and int as is; return None otherwise

def num_parser(value):
    """Parse a price-like value into a number.

    Args:
        value: A string such as ``"$321.08 "``, an ``int``/``float``, or any
            other object (including ``None``).

    Returns:
        A ``float`` for a parseable string; ``value`` unchanged when it is
        already an ``int`` or ``float``; ``None`` for an empty or unparseable
        string and for every other type.
    """
    if isinstance(value, str):
        # Trim whitespace first so a "$" preceded by spaces is still removed,
        # then trim again in case spaces sat between the "$" and the digits.
        cleaned = value.strip().strip("$").strip()
        try:
            return float(cleaned)
        except ValueError:
            # Empty or non-numeric text becomes a null rather than an exception,
            # which would otherwise abort the whole job when used as a UDF.
            return None
    if isinstance(value, (float, int)):
        return value
    # Explicit return: the bare `None` expression previously here was a no-op
    return None
print(num_parser('$222.2'))

222.2


In [55]:
# Wrap num_parser as a Spark UDF whose declared return type is FloatType
number_parser = udf(f=num_parser, returnType=FloatType())

In [58]:
# Run number_parser over each price column: Close/Last is written to a new
# "Close" column, while Open, High and Low are overwritten under the same name.
for target_name, source_name in (
    ("Close", "Close/Last"),
    ("Open", "Open"),
    ("High", "High"),
    ("Low", "Low"),
):
    stocks = stocks.withColumn(target_name, number_parser(stocks[source_name]))

In [59]:
# Preview the first five rows after the price columns were converted
stocks.show(n=5)

+------+----------+----------+-------+------+------+------+----------+------+
|Ticker|      Date|Close/Last| Volume|  Open|  High|   Low|ParsedDate| Close|
+------+----------+----------+-------+------+------+------+----------+------+
| BRK-B|05/31/2023|  $321.08 |6175417|321.12|322.41|319.39|2023-05-31|321.08|
| BRK-B|05/30/2023|  $322.19 |3232461|321.86|322.47| 319.0|2023-05-30|322.19|
| BRK-B|05/26/2023|  $320.60 |3229873|320.44|322.63|319.67|2023-05-26| 320.6|
| BRK-B|05/25/2023|  $319.02 |4251935|320.56|320.56|317.71|2023-05-25|319.02|
| BRK-B|05/24/2023|  $320.20 |3075393|322.71| 323.0|319.56|2023-05-24| 320.2|
+------+----------+----------+-------+------+------+------+----------+------+
only showing top 5 rows



In [60]:
# Check the schema: Open, High, Low and the new Close column should be float
stocks.printSchema()

root
 |-- Ticker: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Close/Last: string (nullable = true)
 |-- Volume: string (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- ParsedDate: date (nullable = true)
 |-- Close: float (nullable = true)



In [62]:
## Converting Volume datatype from String to IntegerType
from pyspark.sql.types import IntegerType


def _parse_int(x):
    """Convert a value to ``int``, passing nulls through.

    Args:
        x: Typically the string form of an integer, or ``None`` for a null cell.

    Returns:
        ``int(x)``, or ``None`` when ``x`` is ``None`` (``int(None)`` would
        raise ``TypeError``, and the Volume column is nullable).

    Raises:
        ValueError: If a non-null value cannot be parsed by ``int``.
    """
    if x is None:
        return None
    return int(x)


# NOTE(review): Spark's IntegerType is documented as a 32-bit signed integer;
# confirm all volumes fit, or switch to LongType if larger values are possible.
int_parser = udf(_parse_int, IntegerType())

In [66]:
# Overwrite Volume with its integer-typed equivalent
stocks = stocks.withColumn("Volume", int_parser(stocks["Volume"]))

In [67]:
# Preview the first five rows after the Volume conversion
stocks.show(n=5)

+------+----------+----------+-------+------+------+------+----------+------+
|Ticker|      Date|Close/Last| Volume|  Open|  High|   Low|ParsedDate| Close|
+------+----------+----------+-------+------+------+------+----------+------+
| BRK-B|05/31/2023|  $321.08 |6175417|321.12|322.41|319.39|2023-05-31|321.08|
| BRK-B|05/30/2023|  $322.19 |3232461|321.86|322.47| 319.0|2023-05-30|322.19|
| BRK-B|05/26/2023|  $320.60 |3229873|320.44|322.63|319.67|2023-05-26| 320.6|
| BRK-B|05/25/2023|  $319.02 |4251935|320.56|320.56|317.71|2023-05-25|319.02|
| BRK-B|05/24/2023|  $320.20 |3075393|322.71| 323.0|319.56|2023-05-24| 320.2|
+------+----------+----------+-------+------+------+------+----------+------+
only showing top 5 rows



In [68]:
# Check the schema: Volume should now be listed as integer
stocks.printSchema()

root
 |-- Ticker: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Close/Last: string (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- ParsedDate: date (nullable = true)
 |-- Close: float (nullable = true)



In [71]:
## Keep only the typed columns needed for analysis (drops the raw Date and Close/Last strings)

cleaned_stocks = stocks.select(
    "Ticker", "ParsedDate", "Close", "Volume", "Open", "High", "Low"
)

In [72]:
# Check the schema of the trimmed frame: only the seven selected columns remain
cleaned_stocks.printSchema()

root
 |-- Ticker: string (nullable = true)
 |-- ParsedDate: date (nullable = true)
 |-- Close: float (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)



In [77]:
## Summary statistics (count, mean, stddev, min, max) for the numeric columns of the cleaned data

numeric_columns = ("Close", "Open", "Volume", "High", "Low")
cleaned_stocks.select(*numeric_columns).describe().show()

+-------+------------------+------------------+--------------------+------------------+------------------+
|summary|             Close|              Open|              Volume|              High|               Low|
+-------+------------------+------------------+--------------------+------------------+------------------+
|  count|             15108|             15108|               15108|             15108|             15108|
|   mean| 180.1256089860054|180.09656566181036|5.1868408793685466E7| 182.1253348687101| 177.9982781513109|
| stddev|101.14891782168517|101.16125813324396| 5.496484129953463E7|101.96625521621728|100.26590135955209|
|    min|             11.93|             12.07|              961133|             12.45|              11.8|
|    max|            477.71|            479.22|           914080943|            479.98|            476.06|
+-------+------------------+------------------+--------------------+------------------+------------------+

