In [0]:
# read the partitioned futures and options data
fo_data_location = '/mnt/fopart'
fo_df = spark.read.option("inferSchema", True)\
    .option("header", True)\
    .csv(fo_data_location)

In [0]:
# check the data
fo_df.show(2)

In [0]:
# function to convert month name to number
# copy it to the stocks_function_helper notebook
def mnameToNo(dt):
    mname = dt[3:6].upper()
    calendar = {"JAN": "01", "FEB": "02", "MAR": "03", "APR": "04",
                "MAY": "05", "JUN": "06", "JUL": "07", "AUG": "08", "SEP": "09", "OCT": "10",
                "NOV": "11", "DEC": "12"}
    return dt.upper().replace(mname, calendar[mname])

In [0]:
# create a udf from the mnameToNo function
# park this also in the stocks_function_helper notebook
udf_mname_to_no = udf(mnameToNo)

In [0]:
# we are going to load the partitoned dataset
# we are going to drop the extra column
# add on a proper time stamp column and add on year, month, day columns
# we will group by symbol, instrumnent, expiry date, option type
# and find out the aggregate contracts and total value

# So for every date for every symbil we will have the aggregates for futures, calls and puts
# and all of these we will capture in a function in the functions helper notebook
# and call the functionality from there directly

# the function is def grouped_df(input_df) and it returns back the grouped df

In [0]:
from pyspark.sql.functions import *
fodf = fo_df.drop("_c15").withColumn('rts', to_timestamp(udf_mname_to_no("TIMESTAMP"), "dd-MM-yyyy"))
# fodf.select(to_timestamp(udf_mname_to_no("TIMESTAMP"), "dd-MM-yyyy").alias('rts')).show(2)
fodf.show(2)

In [0]:
fodf_grouped_instrument = fodf.groupBy('rts', 'SYMBOL','INSTRUMENT','EXPIRY_DT','OPTION_TYP','YEAR', 'MONTH', 'DAY')\
.agg(sum('contracts').alias('Contracts'),
    sum('VAL_INLAKH').alias('VALUE'),
    sum('OPEN_INT').alias('OPEN_INT')).cache()

In [0]:
fodf_grouped_instrument.show(2)

In [0]:
# we will generate the put call ratio - puts / calls for each date from the aggregated group df
# we will pivot on option type to get option_typ three elements - XX, CE, PE as columns
# and add on total PE / total CE as the pcr - put call ratio - column
# this also we will store as a function and call it from the functions helper notebook
# the function is def add_pcr_to_df(input_df) and it returns the transformed df
# with columns for each option type and a column for the pcr

In [0]:
fodf_pcr = fodf_grouped_instrument\
.filter("INSTRUMENT != 'FUTSTK'")\
.groupBy('rts', 'SYMBOL', 'EXPIRY_DT', 'YEAR', 'MONTH', 'DAY')\
.pivot('OPTION_TYP', ['XX','PE','CE'])\
.agg(sum('Contracts').alias('contracts'))\
.withColumn('pcr', col('PE')/col('CE')).cache()

In [0]:
fodf_pcr.show(2)

In [0]:
# finally, we will combine the two dataframes and have the grouped data frame
# with the pcr available for each row
# and let us put this functionaliy in a function
# def combine_grouped_and_pcr_dfs(grouped_df, pcr_df):
# which will return back the combined dataframe as generated below

fodf_processed = fodf_grouped_instrument.alias('fgi').join(fodf_pcr.alias('pcr'),['rts', 'SYMBOL', 'EXPIRY_DT'])\
.select('rts','SYMBOL', 'EXPIRY_DT', 'INSTRUMENT', 'OPTION_TYP', 'CONTRACTS', 'VALUE', 'OPEN_INT', 'PCR','fgi.YEAR', 'fgi.MONTH', 'fgi.DAY').cache()

In [0]:
fodf_processed.show(2)

In [0]:
print(spark.conf.get('spark.sql.shuffle.partitions'))
spark.conf.set('spark.sql.shuffle.partitions',4)

In [0]:
# and a functtion to write the output 
# def write_processed_output(processed_df,output_path,output_mode,table_name):
# the parameter names should be explanatory and can be seen from the command below

# we will write the processed table as a partitioned table - partitioned by year, month, day
# so to carry out corrective processing, we will delete that partition
# and insert back the  processed partition

spark.conf.set('spark.sql.shuffle.partitions','4')
fodf_processed.write.mode('overwrite')\
.partitionBy('YEAR','MONTH', 'DAY')\
.option('path', '/mnt/foprocessed')\
.format('parquet')\
.saveAsTable('FOPR')

In [0]:
%sql
DROP TABLE IF EXISTS FOTABLEN;
CREATE TABLE IF NOT EXISTS FOTABLEN(
INSTRUMENT   string,
    SYMBOL   string,
 EXPIRY_DT   string,
 STRIKE_PR   double,
OPTION_TYP   string,
      OPEN   double,
      HIGH   double,
       LOW   double,
     CLOSE   double,
 SETTLE_PR   double,
 CONTRACTS   double,
VAL_INLAKH   double,
  OPEN_INT   double,
 CHG_IN_OI   double,
 TIMESTAMP   string,
 BLANKCOL   string)
 USING CSV
 PARTITIONED BY (
      year      int,
     month      int,
       day      int
 ) 
 OPTIONS (path "/mnt/fopart",
        delimiter ",",
        header "true");
SELECT * FROM FOTABLEN LIMIT 5;

INSTRUMENT,SYMBOL,EXPIRY_DT,STRIKE_PR,OPTION_TYP,OPEN,HIGH,LOW,CLOSE,SETTLE_PR,CONTRACTS,VAL_INLAKH,OPEN_INT,CHG_IN_OI,TIMESTAMP,BLANKCOL,year,month,day


In [0]:
%sql
-- we will create a table linked to the tree directory structure
-- to replace a day's data we just need to replace the file
DROP TABLE IF EXISTS FOTABLE;
CREATE TABLE IF NOT EXISTS FOTABLE(
INSTRUMENT   string,
    SYMBOL   string,
 EXPIRY_DT   string,
 STRIKE_PR   double,
OPTION_TYP   string,
      OPEN   double,
      HIGH   double,
       LOW   double,
     CLOSE   double,
 SETTLE_PR   double,
 CONTRACTS   double,
VAL_INLAKH   double,
  OPEN_INT   double,
 CHG_IN_OI   double,
 TIMESTAMP   string,
 BLANKCOL   string,
      year      int,
     month      int,
       day      int
 ) using csv
 OPTIONS (path "/mnt/fopart",
        delimiter ",",
        header "true");
SELECT COUNT(*) FROM FOTABLE WHERE YEAR = 2018 AND MONTH = 1 AND DAY = 1;

count(1)
43095


In [0]:
%sql
-- if we have to update some source data - we got revised data, data came late
-- then we can put it in its proper location and refresh the table
-- to see this in action drop a file, refresh, add back that file, refresh
REFRESH TABLE FOTABLE;
SELECT COUNT(*) FROM FOTABLE WHERE YEAR = 2018 AND MONTH = 1 AND DAY = 1;

count(1)
43095


In [0]:
%sql 
show tables

database,tableName,isTemporary
default,fopr,False
default,fotable,False
default,fotablen,False


In [0]:
%sql
show partitions fopr

YEAR,MONTH,DAY
2018,1,2
2018,1,23
2018,1,30
2018,1,8
2018,10,12
2018,10,24
2018,10,29
2018,10,30
2018,11,13
2018,11,16
