In [1]:
// Load the sectors CSV from HDFS, letting Spark infer the column types.
// The first line of the file is treated as the header row.
val sectorDf = spark.read
                    .format("csv")
                    .option("header", true)
                    .option("inferSchema", true)
                    // The option key is "delimiter" (alias "sep"); a misspelled key
                    // is silently ignored and Spark falls back to the default ",".
                    .option("delimiter", ",")
                    .load("hdfs://localhost:9000/stocks/sectors")

// Inspect the inferred schema and preview the first two rows.
sectorDf.printSchema()
sectorDf.show(2)

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.80.128:4041
SparkContext available as 'sc' (version = 3.1.3, master = local[*], app id = local-1648402636996)
SparkSession available as 'spark'


root
 |-- Company Name: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Symbol: string (nullable = true)
 |-- Series: string (nullable = true)
 |-- ISIN Code: string (nullable = true)

+------------------+------------------+----------+------+------------+
|      Company Name|          Industry|    Symbol|Series|   ISIN Code|
+------------------+------------------+----------+------+------------+
|    Axis Bank Ltd.|FINANCIAL SERVICES|  AXISBANK|    EQ|INE238A01034|
|Bajaj Finance Ltd.|FINANCIAL SERVICES|BAJFINANCE|    EQ|INE296A01024|
+------------------+------------------+----------+------+------------+
only showing top 2 rows



sectorDf: org.apache.spark.sql.DataFrame = [Company Name: string, Industry: string ... 3 more fields]


In [2]:
import org.apache.spark.sql.types.{StringType, StructType, DoubleType,
                                   IntegerType, LongType, StructField }

// Explicit schema for the sectors CSV: five nullable string columns.
// The names here replace the file's own header names ("Company Name",
// "ISIN Code") with space-free identifiers.
val SectorSchema = StructType(
    List("CompanyName", "Industry", "Symbol", "Series", "ISIN")
        .map(fieldName => StructField(fieldName, StringType, nullable = true))
    )

import org.apache.spark.sql.types.{StringType, StructType, DoubleType, IntegerType, LongType, StructField}
SectorSchema: org.apache.spark.sql.types.StructType = StructType(StructField(CompanyName,StringType,true), StructField(Industry,StringType,true), StructField(Symbol,StringType,true), StructField(Series,StringType,true), StructField(ISIN,StringType,true))


In [3]:
// Use the Schema
val sectorDf = spark.read
                    .format("csv")
                    .option("header", true)
                    .option("delimitter", ",")
                    .schema(SectorSchema)
                    .load("hdfs://localhost:9000/stocks/sectors")
sectorDf.printSchema()
sectorDf.show(2)

root
 |-- CompanyName: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Symbol: string (nullable = true)
 |-- Series: string (nullable = true)
 |-- ISIN: string (nullable = true)

+------------------+------------------+----------+------+------------+
|       CompanyName|          Industry|    Symbol|Series|        ISIN|
+------------------+------------------+----------+------+------------+
|    Axis Bank Ltd.|FINANCIAL SERVICES|  AXISBANK|    EQ|INE238A01034|
|Bajaj Finance Ltd.|FINANCIAL SERVICES|BAJFINANCE|    EQ|INE296A01024|
+------------------+------------------+----------+------+------------+
only showing top 2 rows



sectorDf: org.apache.spark.sql.DataFrame = [CompanyName: string, Industry: string ... 3 more fields]


In [4]:
// Column names of the DataFrame as an Array[String].
sectorDf.columns

res2: Array[String] = Array(CompanyName, Industry, Symbol, Series, ISIN)


In [5]:
// Number of rows in the sectors DataFrame (returned as a Long).
sectorDf.count()

res3: Long = 200


In [6]:
// Project only the Industry and Symbol columns (referenced by name),
// then inspect the resulting schema and the first five rows.
val df = sectorDf.select("Industry", "Symbol")
df.printSchema()
df.show(5)

root
 |-- Industry: string (nullable = true)
 |-- Symbol: string (nullable = true)

+------------------+----------+
|          Industry|    Symbol|
+------------------+----------+
|FINANCIAL SERVICES|  AXISBANK|
|FINANCIAL SERVICES|BAJFINANCE|
|FINANCIAL SERVICES|BAJAJFINSV|
|FINANCIAL SERVICES|  CHOLAFIN|
|FINANCIAL SERVICES|   HDFCAMC|
+------------------+----------+
only showing top 5 rows



df: org.apache.spark.sql.DataFrame = [Industry: string, Symbol: string]


In [7]:
// Distinct Industry values in ascending order.
// show() with no arguments truncates long cell values (note the "..." in the output).
sectorDf.select("Industry").distinct().sort("Industry").show()

+--------------------+
|            Industry|
+--------------------+
|          AUTOMOBILE|
|        CONSTRUCTION|
|      CONSUMER GOODS|
|  FINANCIAL SERVICES|
| HEALTHCARE SERVICES|
|INDUSTRIAL MANUFA...|
|                  IT|
|MEDIA ENTERTAINME...|
|              METALS|
|           OIL & GAS|
|              PHARMA|
+--------------------+



In [8]:
// Same query as before, but with truncate = false so full cell values are printed.
sectorDf.select("Industry").distinct().sort("Industry").show(truncate = false)

+---------------------------------+
|Industry                         |
+---------------------------------+
|AUTOMOBILE                       |
|CONSTRUCTION                     |
|CONSUMER GOODS                   |
|FINANCIAL SERVICES               |
|HEALTHCARE SERVICES              |
|INDUSTRIAL MANUFACTURING         |
|IT                               |
|MEDIA ENTERTAINMENT & PUBLICATION|
|METALS                           |
|OIL & GAS                        |
|PHARMA                           |
+---------------------------------+



In [9]:
import org.apache.spark.sql.functions.{col, desc}

// Distinct industries in descending order; the column is referenced
// through the DataFrame itself: sectorDf("Industry").
sectorDf.select(sectorDf("Industry")).distinct().sort(desc("Industry")).show()

+--------------------+
|            Industry|
+--------------------+
|              PHARMA|
|           OIL & GAS|
|              METALS|
|MEDIA ENTERTAINME...|
|                  IT|
|INDUSTRIAL MANUFA...|
| HEALTHCARE SERVICES|
|  FINANCIAL SERVICES|
|      CONSUMER GOODS|
|        CONSTRUCTION|
|          AUTOMOBILE|
+--------------------+



import org.apache.spark.sql.functions.{col, desc}


In [10]:
import org.apache.spark.sql.functions.{col, desc}
// Distinct industries in descending order; the column is referenced
// with the col("...") function.
sectorDf.select(col("Industry")).distinct().sort(desc("Industry")).show()

+--------------------+
|            Industry|
+--------------------+
|              PHARMA|
|           OIL & GAS|
|              METALS|
|MEDIA ENTERTAINME...|
|                  IT|
|INDUSTRIAL MANUFA...|
| HEALTHCARE SERVICES|
|  FINANCIAL SERVICES|
|      CONSUMER GOODS|
|        CONSTRUCTION|
|          AUTOMOBILE|
+--------------------+



import org.apache.spark.sql.functions.{col, desc}


In [11]:
import org.apache.spark.sql.functions.{col, desc}

// Distinct industries in descending order; the column is referenced
// with the $"..." string interpolator.
sectorDf.select($"Industry").distinct().sort(desc("Industry")).show()

+--------------------+
|            Industry|
+--------------------+
|              PHARMA|
|           OIL & GAS|
|              METALS|
|MEDIA ENTERTAINME...|
|                  IT|
|INDUSTRIAL MANUFA...|
| HEALTHCARE SERVICES|
|  FINANCIAL SERVICES|
|      CONSUMER GOODS|
|        CONSTRUCTION|
|          AUTOMOBILE|
+--------------------+



import org.apache.spark.sql.functions.{col, desc}


In [12]:
// Load the daily stock CSV files from HDFS with inferred column types.
// Declared as `var` because later cells reassign stockDf with extra columns.
var stockDf = spark.read
                    .format("csv")
                    .option("header", true)
                    .option("inferSchema", true)
                    // The option key is "delimiter" (alias "sep"); a misspelled key
                    // is silently ignored and Spark falls back to the default ",".
                    .option("delimiter", ",")
                    // Dates in the file look like "02-MAR-2022"-style values; this
                    // pattern lets TIMESTAMP be inferred as a timestamp column.
                    // NOTE(review): exact textual form of the dates is not visible here — confirm.
                    .option("timestampFormat", "dd-MMM-yyyy")
                    .load("hdfs://localhost:9000/stocks/daily")
                    // Drop the auto-named column "_c13"; presumably an empty trailing
                    // column with no header in the source file — verify against the data.
                    .drop("_c13")


// Inspect the inferred schema and preview the first two rows.
stockDf.printSchema()
stockDf.show(2)


root
 |-- SYMBOL: string (nullable = true)
 |-- SERIES: string (nullable = true)
 |-- OPEN: double (nullable = true)
 |-- HIGH: double (nullable = true)
 |-- LOW: double (nullable = true)
 |-- CLOSE: double (nullable = true)
 |-- LAST: double (nullable = true)
 |-- PREVCLOSE: double (nullable = true)
 |-- TOTTRDQTY: integer (nullable = true)
 |-- TOTTRDVAL: double (nullable = true)
 |-- TIMESTAMP: timestamp (nullable = true)
 |-- TOTALTRADES: integer (nullable = true)
 |-- ISIN: string (nullable = true)

+----------+------+----+----+----+-----+-----+---------+---------+-------------+-------------------+-----------+------------+
|    SYMBOL|SERIES|OPEN|HIGH| LOW|CLOSE| LAST|PREVCLOSE|TOTTRDQTY|    TOTTRDVAL|          TIMESTAMP|TOTALTRADES|        ISIN|
+----------+------+----+----+----+-----+-----+---------+---------+-------------+-------------------+-----------+------------+
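| 20MICRONS|    EQ|70.1|73.6|70.1|71.85|72.05|     71.2|   219912|1.583125505E7|2022-03-02 00:00:00|       2642|INE144J01027|
|21STCENMGM|    EQ|29.6|29.6|29.6| 29.6| 29.6|     30.2|     1209|      35786.4|2022-03-02 00:00:00|         45|INE253B01015|
+----------+------+----+----+----+-----+-----+---------+---------+-------------+-------------------+-----------+------------+
only showing top 2 rows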

stockDf: org.apache.spark.sql.DataFrame = [SYMBOL: string, SERIES: string ... 11 more fields]


In [13]:
// Keep only rows whose TOTTRDQTY exceeds 5,000,000 and show
// the symbol alongside that quantity.
stockDf.filter ( $"TOTTRDQTY" > 5000000 )
        .select("SYMBOL", "TOTTRDQTY").show()

+----------+---------+
|    SYMBOL|TOTTRDQTY|
+----------+---------+
|ADANIPOWER| 37990829|
|  ALOKINDS| 13696536|
| AMBUJACEM|  6623505|
|  ASHOKLEY| 13299580|
|       AWL| 13334439|
|  AXISBANK| 11691602|
|BANDHANBNK|  7537231|
|BANKBARODA| 47475131|
|       BEL| 17202697|
|BHARTIARTL| 10220908|
|      BHEL| 33734292|
|    BIOCON| 13122596|
|      BPCL|  7734602|
|     CANBK| 10972135|
| COALINDIA| 72648396|
|   CPSEETF|  6838326|
|   DEVYANI|  5606901|
|     DHANI| 37519005|
|       DLF|  6854294|
| FCONSUMER| 11838948|
+----------+---------+
only showing top 20 rows



In [14]:
// Combine two conditions with && : TOTTRDQTY above 5,000,000 AND
// TOTTRDVAL above 100,000,000. Each comparison is parenthesised so that
// && applies to the two Column predicates.
stockDf.filter ( ($"TOTTRDQTY" > 5000000) && ($"TOTTRDVAL" > 100000000 ))
        .select("SYMBOL", "TOTTRDQTY", "TOTTRDVAL").show(truncate=false)

+----------+---------+-----------------+
|SYMBOL    |TOTTRDQTY|TOTTRDVAL        |
+----------+---------+-----------------+
|ADANIPOWER|37990829 |4.70183548235E9  |
|ALOKINDS  |13696536 |3.3159322015E8   |
|AMBUJACEM |6623505  |2.02519921995E9  |
|ASHOKLEY  |13299580 |1.56376581585E9  |
|AWL       |13334439 |5.1355806697E9   |
|AXISBANK  |11691602 |8.6334568352E9   |
|BANDHANBNK|7537231  |2.2247190949E9   |
|BANKBARODA|47475131 |4.91196164495E9  |
|BEL       |17202697 |3.7135653097E9   |
|BHARTIARTL|10220908 |6.87781912745E9  |
|BHEL      |33734292 |1.6971618874E9   |
|BIOCON    |13122596 |4.5562065457E9   |
|BPCL      |7734602  |2.678648484E9    |
|CANBK     |10972135 |2.36614299085E9  |
|COALINDIA |72648396 |1.313502573215E10|
|CPSEETF   |6838326  |2.2645909145E8   |
|DEVYANI   |5606901  |8.953902411E8    |
|DHANI     |37519005 |2.89349521655E9  |
|DLF       |6854294  |2.36572027835E9  |
|FEDERALBNK|15548463 |1.4921710391E9   |
+----------+---------+-----------------+
only showing top 20 rows

In [15]:
// Add a GAIN column computed as CLOSE minus OPEN for each row.
// Both operands use the $"..." interpolator; the symbol-literal form
// ('OPEN) is deprecated in Scala 2.13 and unavailable in Scala 3.
stockDf = stockDf.withColumn("GAIN", $"CLOSE" - $"OPEN")
stockDf.printSchema()
stockDf.show(5)

root
 |-- SYMBOL: string (nullable = true)
 |-- SERIES: string (nullable = true)
 |-- OPEN: double (nullable = true)
 |-- HIGH: double (nullable = true)
 |-- LOW: double (nullable = true)
 |-- CLOSE: double (nullable = true)
 |-- LAST: double (nullable = true)
 |-- PREVCLOSE: double (nullable = true)
 |-- TOTTRDQTY: integer (nullable = true)
 |-- TOTTRDVAL: double (nullable = true)
 |-- TIMESTAMP: timestamp (nullable = true)
 |-- TOTALTRADES: integer (nullable = true)
 |-- ISIN: string (nullable = true)
 |-- GAIN: double (nullable = true)

+----------+------+-------+-------+-------+-------+-------+---------+---------+-------------+-------------------+-----------+------------+--------------------+
|    SYMBOL|SERIES|   OPEN|   HIGH|    LOW|  CLOSE|   LAST|PREVCLOSE|TOTTRDQTY|    TOTTRDVAL|          TIMESTAMP|TOTALTRADES|        ISIN|                GAIN|
+----------+------+-------+-------+-------+-------+-------+---------+---------+-------------+-------------------+-----------+---------

stockDf: org.apache.spark.sql.DataFrame = [SYMBOL: string, SERIES: string ... 12 more fields]


In [16]:
import org.apache.spark.sql.functions.date_format

// Append Year, Month and Day columns (in that order), each formatted
// from TIMESTAMP with the given date pattern. date_format yields strings,
// so Month and Day keep their leading zero (e.g. "03", "02").
stockDf = Seq("Year" -> "yyyy", "Month" -> "MM", "Day" -> "dd").foldLeft(stockDf) {
    case (acc, (columnName, pattern)) =>
        acc.withColumn(columnName, date_format($"TIMESTAMP", pattern))
}

stockDf.show(2)

+----------+------+----+----+----+-----+-----+---------+---------+-------------+-------------------+-----------+------------+----+----+-----+---+
|    SYMBOL|SERIES|OPEN|HIGH| LOW|CLOSE| LAST|PREVCLOSE|TOTTRDQTY|    TOTTRDVAL|          TIMESTAMP|TOTALTRADES|        ISIN|GAIN|Year|Month|Day|
+----------+------+----+----+----+-----+-----+---------+---------+-------------+-------------------+-----------+------------+----+----+-----+---+
| 20MICRONS|    EQ|70.1|73.6|70.1|71.85|72.05|     71.2|   219912|1.583125505E7|2022-03-02 00:00:00|       2642|INE144J01027|1.75|2022|   03| 02|
|21STCENMGM|    EQ|29.6|29.6|29.6| 29.6| 29.6|     30.2|     1209|      35786.4|2022-03-02 00:00:00|         45|INE253B01015| 0.0|2022|   03| 02|
+----------+------+----+----+----+-----+-----+---------+---------+-------------+-------------------+-----------+------------+----+----+-----+---+
only showing top 2 rows



import org.apache.spark.sql.functions.date_format
stockDf: org.apache.spark.sql.DataFrame = [SYMBOL: string, SERIES: string ... 15 more fields]


In [17]:
// Number of partitions of the RDD backing stockDf.
stockDf.rdd.getNumPartitions

res15: Int = 2


In [18]:
// Persist stockDf as Parquet under /stock-data, replacing any existing
// output, with one directory level per Year, Month and Day value
// (e.g. .../Year=2022/Month=03/Day=02).
stockDf.write
        .mode("overwrite")
        .partitionBy("Year", "Month", "Day")
        .parquet("hdfs://localhost:9000/stock-data")

In [19]:
// Read the whole Parquet dataset back from its root directory and count the rows.
val allData = spark.read.parquet("hdfs://localhost:9000/stock-data")
allData.count()

allData: org.apache.spark.sql.DataFrame = [SYMBOL: string, SERIES: string ... 15 more fields]
res17: Long = 4370


In [20]:
// Read only the Year=2022 sub-directory of the dataset and count its rows.
// Loading from inside a partition directory yields one column fewer than
// reading from the dataset root.
val allData2022 = spark.read.parquet("hdfs://localhost:9000/stock-data/Year=2022")
allData2022.count()

allData2022: org.apache.spark.sql.DataFrame = [SYMBOL: string, SERIES: string ... 14 more fields]
res18: Long = 4370


In [21]:
// Read only the Year=2022/Month=03 sub-directory and count its rows.
val allData2022Month03 = spark.read.parquet("hdfs://localhost:9000/stock-data/Year=2022/Month=03")
allData2022Month03.count()

allData2022Month03: org.apache.spark.sql.DataFrame = [SYMBOL: string, SERIES: string ... 13 more fields]
res19: Long = 4370


In [22]:
// Read a single day's directory (Year=2022/Month=03/Day=02) and count its rows.
val allData2022Month03Day02 = spark.read.parquet("hdfs://localhost:9000/stock-data/Year=2022/Month=03/Day=02")
allData2022Month03Day02.count()

allData2022Month03Day02: org.apache.spark.sql.DataFrame = [SYMBOL: string, SERIES: string ... 12 more fields]
res20: Long = 2198
