# Exploratory Data Analysis

The given data has a few missing points. It also contains IV values for non-market hours and holidays. I have cleaned the data and filled in approximate values for the missing points.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the raw dataset from the parquet file in the working directory.
df = pd.read_parquet('data.parquet')

In [2]:
# Make sure the index is a DatetimeIndex so the weekday/time accessors below work.
df.index = pd.to_datetime(df.index)

# Removing Weekends and Non-Market Hours
# The session bounds are parsed once; both ends of the window are inclusive.
market_open = pd.to_datetime("09:15").time()
market_close = pd.to_datetime("15:30").time()
bar_times = df.index.time

is_weekday = df.index.dayofweek < 5  # Monday=0 ... Friday=4
in_session = (bar_times >= market_open) & (bar_times <= market_close)

# NOTE(review): only weekends are dropped here; exchange holidays that fall on a
# weekday are not filtered by this cell.
# .copy() gives filtered_df its own data, so the in-place edits made by the
# missing-value fill later on do not raise SettingWithCopyWarning.
filtered_df = df[is_weekday & in_session].copy()

In [3]:
# Checking for missing data
# Count the NaNs per column once; the counts drive both the summary line and the table.
missing_data = filtered_df.isnull().sum()

has_missing = bool(missing_data.sum() > 0)
print(
    "There are missing values in the DataFrame."
    if has_missing
    else "No missing values found in the DataFrame."
)

# Printing number of missing values for all columns
print(missing_data)

There are missing values in the DataFrame.
banknifty    267
nifty        350
tte            0
dtype: int64


In [4]:
# Created a function which would fill empty cells with the average value of IV before and after the cell
# This assumption came from an observation of the dataset
def fill_missing_with_avg(df):
    """Fill each missing cell from the values directly above and below it.

    Columns are processed independently, top to bottom. For every missing cell:

    * both neighbours present -> their average,
    * only one neighbour present -> that neighbour's value,
    * neither neighbour present -> the cell stays NaN.

    Cells are filled in order, so a value written earlier in the pass counts as
    the "above" neighbour of the next row. A run of consecutive NaNs therefore
    carries the last known value forward, and only the final cell of the run is
    averaged with the next real observation.

    Rows and columns are addressed by position, so repeated index or column
    labels are handled, and the input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain missing values.

    Returns
    -------
    pandas.DataFrame
        A filled copy of ``df``.
    """
    filled = df.copy()
    last_row = len(filled) - 1

    for col_pos in range(filled.shape[1]):
        # Visit only the cells that are missing instead of scanning every row.
        # Filling a cell never changes which other cells are missing, so the
        # positions can be collected up front.
        missing_rows = np.flatnonzero(filled.iloc[:, col_pos].isna().to_numpy())
        for row in missing_rows.tolist():
            above = filled.iat[row - 1, col_pos] if row > 0 else None
            below = filled.iat[row + 1, col_pos] if row < last_row else None
            if pd.notna(above) and pd.notna(below):
                filled.iat[row, col_pos] = (above + below) / 2
            elif pd.notna(above):
                filled.iat[row, col_pos] = above
            elif pd.notna(below):
                filled.iat[row, col_pos] = below
            # If both above and below are NaN, the missing value remains NaN
    return filled

# Fill the gaps in the market-hours frame; the next cell verifies that no NaN is left.
filled_df = fill_missing_with_avg(filtered_df)

In [5]:
# Re-check for missing data: a gap whose neighbours above and below are both NaN
# is left unfilled by fill_missing_with_avg, so confirm that none remain.
missing_data = filled_df.isnull().sum()

if missing_data.sum() > 0:
    print("There are missing values in the DataFrame.")
else:
    print("No missing values found in the DataFrame.")

print(missing_data)

No missing values found in the DataFrame.
banknifty    0
nifty        0
tte          0
dtype: int64


In [7]:
# Build the working frame as an independent copy of the cleaned data and add the
# IV spread (Bank Nifty IV minus Nifty IV) as a new column in the same step.
df = filled_df.assign(Spread=filled_df['banknifty'] - filled_df['nifty'])

In [8]:
def z_score(data, period=1875):
    """Return the rolling z-score of ``data``.

    Each value is centred on the mean of the trailing ``period`` observations
    and scaled by the standard deviation of that same window. Positions that
    do not yet have a full window come out as NaN.

    Parameters
    ----------
    data : pandas.Series
        Series to standardise.
    period : int, default 1875
        Number of observations in the trailing window.

    Returns
    -------
    pandas.Series
        Rolling z-score, aligned with ``data``.
    """
    window = data.rolling(period)
    centred = data - window.mean()
    return centred / window.std()

# Rolling z-score of the spread using z_score's default 1875-row window.
# NOTE(review): presumably 5 sessions x 375 one-minute bars — confirm the bar frequency.
df['z-score'] = z_score(df['Spread']) # calculating z-score using a rolling window of 5 days

# Z - Score Model

The P/L formula given in the assignment is
$\text{P/L} = (\text{Bank Nifty IV} - \text{Nifty IV}) \times (\text{Time to Expiry})^{0.7}$.
However, Bank Nifty IV is greater than Nifty IV at every point in the dataframe, so this P/L is always positive.
In that case there is no scope for a strategy, so I have made some assumptions to turn this into a fairer trading strategy.

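We calculate the z-score of the spread and apply a mean-reversion strategy to it: go long the spread when the z-score falls below -1 and exit once it rises above 0; go short the spread when the z-score rises above +1 and exit once it falls below 0.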

In [None]:
# Mean-reversion backtest on the spread z-score.
#   long  : enter when z < -z_threshold, exit when z > 0
#   short : enter when z > +z_threshold, exit when z < 0
# Trade PnL = (favourable change in spread) * (time to expiry at exit) ** 0.7

z_scores = df['z-score']
spreads = df['Spread']
ttes = df['tte']
times = df.index.values

# Positional NumPy arrays for the loop. Indexing a datetime-indexed Series with
# a bare integer (spreads[index]) relies on a deprecated positional fallback.
z_values = z_scores.to_numpy()
spread_values = spreads.to_numpy()
tte_values = ttes.to_numpy()

# Trade log kept as parallel lists: the i-th entry belongs with the i-th exit.
signal_type = []            # 1 = long the spread, -1 = short the spread
entry_time_new = []
exit_time_new = []
spread_for_entry_new = []
spread_for_exit_new = []
pnl_new = []

# Per-side PnL, needed for the loss counts computed after the loop.
long_returns_list = []
short_returns_list = []

number_of_long_trades = 0
number_of_short_trades = 0

long = False
short = False
long_entry_spread = None
short_entry_spread = None
z_threshold = 1

# The loop variable is deliberately not called `z_score`, so the z_score()
# function defined earlier in the notebook is not overwritten.
for index, z_value in enumerate(z_values):
    spread_now = spread_values[index]

    # Exits are handled before entries, and each side remembers its own entry
    # spread. Otherwise a bar that both closes a short and opens a long would
    # price the short against the long's entry appended on that same bar.
    # NaN z-scores (warm-up of the rolling window) fail every comparison and
    # therefore trigger nothing.
    if long and z_value > 0:
        long = False
        pnl_long = (spread_now - long_entry_spread) * (tte_values[index] ** 0.7)
        spread_for_exit_new.append(spread_now)
        exit_time_new.append(times[index])
        pnl_new.append(pnl_long)
        long_returns_list.append(pnl_long)

    if short and z_value < 0:
        short = False
        pnl_short = (short_entry_spread - spread_now) * (tte_values[index] ** 0.7)
        spread_for_exit_new.append(spread_now)
        exit_time_new.append(times[index])
        pnl_new.append(pnl_short)
        short_returns_list.append(pnl_short)

    if z_value < -z_threshold and not long:
        number_of_long_trades += 1
        long = True
        long_entry_spread = spread_now
        signal_type.append(1)
        entry_time_new.append(times[index])
        spread_for_entry_new.append(spread_now)

    if z_value > z_threshold and not short:
        number_of_short_trades += 1
        short = True
        short_entry_spread = spread_now
        signal_type.append(-1)
        entry_time_new.append(times[index])
        spread_for_entry_new.append(spread_now)

losses_in_long_count = sum(1 for return_long in long_returns_list if return_long < 0)
losses_in_short_count = sum(1 for return_short in short_returns_list if return_short < 0)


signal_type_modified = ['Long' if signal == 1 else 'Short' for signal in signal_type]

# At most one position can still be open on the last bar; it has an entry but
# no exit. Only closed trades go into the summary so all columns stay the same
# length.
closed_trades = len(exit_time_new)
final_df = pd.DataFrame({
    'Time_entry': entry_time_new[:closed_trades],
    'Time_exit': exit_time_new,
    'Entry_spread': spread_for_entry_new[:closed_trades],
    'Exit_spread': spread_for_exit_new,
    'PnL': pnl_new,
    'Signal': signal_type_modified[:closed_trades]
})

        