In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt
from datetime import timedelta
from polygon import RESTClient
import requests
from pandas.tseries.offsets import BDay

### 1. Collect data for the last 14 years - from 2010 to 2023 in daily granularity.
### 2. Identify top 10 outliers in each year.

In [15]:
def calculate_daily_returns(df, prev_close=None):
    if prev_close is not None:
        df.loc[df.index[0], 'prev_close'] = prev_close
    else:
        df['prev_close'] = df['c'].shift(1)
    df['daily_return'] = (df['c'] - df['prev_close']) / df['prev_close']
    df['average_daily_return'] = df['daily_return'].rolling(window=2, min_periods=1).mean()
    df['abs_daily_return'] = df['average_daily_return'].abs()
    return df

def get_top_outliers(df, n=10):
    return df.nlargest(n, 'abs_daily_return')

# Define the API key and base URL
api_key = 'beBybSi8daPgsTp5yx5cHtHpYcrjp5Jq'

# Define the currency pairs and years
pair = "C:USDEUR"
years = range(2010, 2024)

# Initialize DataFrames
stats_data = pd.DataFrame(columns=['Year', 'Mean', 'Variance', 'Skewness', 'Kurtosis'])
full_data = pd.DataFrame()
outliers_data = pd.DataFrame()

# Loop over each year
for year in years:
    # Format the API endpoint
    start_date = f'{year}-01-01'
    end_date = f'{year}-12-31'
    url = f"https://api.polygon.io/v2/aggs/ticker/{pair}/range/1/day/{start_date}/{end_date}?adjusted=true&sort=asc&limit=50000&apiKey={api_key}"
    
    # Make the API request
    response = requests.get(url)
    data = response.json()
    
    # Check if the request was successful
    if response.status_code == 200 and 'results' in data:
        # Load data into a DataFrame
        df = pd.DataFrame(data['results'])
        # Convert timestamps
        df['date'] = pd.to_datetime(df['t'], unit='ms')
        df.drop(columns=['t'], inplace=True)

        df = df[df['date'].dt.weekday < 5]
        # Calculating returns
        df = calculate_daily_returns(df)

        # Append the data to the full_data DataFrame for the current currency pair
        df['year'] = year
        df['day'] = df['date'].dt.day_name()
        # Find the top 10 outliers based on absolute values of the daily return value
        # df['abs_daily_return'] = df['c'].abs()
        top_10_outliers = df.nlargest(10, 'abs_daily_return')
        outlier_dates = top_10_outliers['date']

        # Create a new column 'is_outlier' in the full_data DataFrame
        df['is_outlier'] = df['date'].isin(outlier_dates).astype(int)
        full_data = pd.concat([full_data, df], ignore_index=True)
        
        # Append outliers to the outliers_data DataFrame for the current currency pair
        top_10_outliers['year'] = year
        outliers_data = pd.concat([outliers_data, top_10_outliers], ignore_index=True)
        

sorted_full_data = full_data.sort_values(by="date")
sorted_outliers_data = outliers_data.sort_values(by="date")

In [32]:
sorted_full_data

Unnamed: 0,v,vw,o,c,h,l,n,date,prev_close,daily_return,average_daily_return,abs_daily_return,year,day,is_outlier
0,764,0.7609,0.75720,0.76110,0.76430,0.757100,764,2012-01-30,,,,,2012,Monday,0
1,750,0.7608,0.76100,0.76460,0.76620,0.756800,750,2012-01-31,0.76110,0.004599,0.004599,0.004599,2012,Tuesday,0
2,781,0.7618,0.76470,0.75920,0.76760,0.756800,781,2012-02-01,0.76460,-0.007063,-0.001232,0.001232,2012,Wednesday,0
3,767,0.7602,0.75920,0.76120,0.76370,0.757800,767,2012-02-02,0.75920,0.002634,-0.002214,0.002214,2012,Thursday,0
4,692,0.7610,0.76130,0.75970,0.76460,0.758300,692,2012-02-03,0.76120,-0.001971,0.000332,0.000332,2012,Friday,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3029,6522,0.9078,0.90750,0.90747,0.90940,0.901600,6522,2023-12-25,0.90690,0.000629,-0.000610,0.000610,2023,Monday,0
3030,51232,0.9068,0.90747,0.90550,0.90833,0.905100,51232,2023-12-26,0.90747,-0.002171,-0.000771,0.000771,2023,Tuesday,0
3031,77389,0.9029,0.90557,0.90018,0.90670,0.898796,77389,2023-12-27,0.90550,-0.005875,-0.004023,0.004023,2023,Wednesday,0
3032,83788,0.9008,0.90018,0.90348,0.90453,0.897500,83788,2023-12-28,0.90018,0.003666,-0.001105,0.001105,2023,Thursday,0


In [53]:
sorted_full_data.to_csv("/Users/nachiketkhare/Downloads/full_data_filtered.csv")

#### New Analysis


In [92]:
# Tolerance level for "very close" comparison (e.g., 1% difference)
tolerance = 0.005
max_days = 10

# DataFrame to store results
recovery_data = pd.DataFrame(columns=[
    'Outlier Date', 'Outlier Close Price', 'Reference Point', 'Outlier Daily Return', 'Recovery Date', 'Recovery Close Price', 
    'Days to Recovery', 'Year', 'Reversal Date', 'Reversal Daily Return'
])

# Step 1: Identify outliers
outliers = sorted_full_data[sorted_full_data['is_outlier'] == 1]
outliers = outliers[~((outliers['c'] < 0.8) & (outliers['daily_return'] > 0))]
outliers = outliers[~((outliers['c'] > 0.95) & (outliers['daily_return'] < 0))]

remaining_outlier_dates = outliers['date']

sorted_full_data['is_outlier'] = sorted_full_data['date'].isin(remaining_outlier_dates).astype(int)

filtered_outliers = sorted_full_data[sorted_full_data['is_outlier'] == 1]

# Step 2: Iterate over each outlier to find the recovery date
for _, outlier in filtered_outliers.iterrows():
    outlier_date = outlier['date']
 
    outlier_index = sorted_full_data.index[sorted_full_data['date'] == outlier_date].tolist()[0]

    #Volatility Cluster
    is_prev_day_outlier = sorted_full_data.iloc[outlier_index - 1]['is_outlier']
    is_next_day_outlier = sorted_full_data.iloc[outlier_index + 1]['is_outlier']

    
    if outlier_index > 0:
        if is_next_day_outlier == 1:
            reference_point_price = sorted_full_data.iloc[outlier_index]['c']
        else:
            reference_point_price = sorted_full_data.iloc[outlier_index - 1]['c']
    else:
        # Handle the case where the outlier is the first row
        continue
    

    outlier_daily_return = outlier['daily_return']
    outlier_return_sign = np.sign(outlier_daily_return)
    
    outlier_close = outlier['c']
    year = outlier['year']
    
    # Step 3: Search for the first date after the outlier date where the close price is similar
    future_data = sorted_full_data[(sorted_full_data['date'] > outlier_date)]

    # Limit the search to the next 'max_days' business days
    future_data = future_data.head(max_days)
    
    similar_price_data = future_data[np.abs(future_data['c'] - reference_point_price) <= tolerance * outlier_close]
    
    # Initialize variables for recovery and reversal information
    recovery_date = None
    recovery_daily_return = None
    reversal_date = None
    reversal_daily_return = None
    days_to_recovery = None
    
    #setting the recovery date for volitily cluster outlier
    if is_next_day_outlier == 1:

        for i in range(len(future_data)):
            current_day = future_data.iloc[i]
            current_daily_return = current_day['daily_return']
            current_return_sign = np.sign(current_daily_return)

            if current_return_sign != outlier_return_sign:
                reversal_date = current_day['date']
                reversal_daily_return = current_day['daily_return']
                recovery_date = reversal_date # Set recovery date as the reversal date
                days_to_recovery = len(pd.bdate_range(start=outlier_date, end=current_day['date']))
                recovery_close_price = current_day['c']
                break
                
    elif is_prev_day_outlier == 1:
        # print(f"outlier_date {outlier_date}")
        for i in range(len(future_data)):
            current_day = future_data.iloc[i]
            current_daily_return = current_day['daily_return']
            # print(f"current daily return {current_daily_return}")
            current_return_sign = np.sign(current_daily_return)

            if current_return_sign != outlier_return_sign:
                reversal_date = current_day['date']
                # print(f"reversal_date {reversal_date}")
                reversal_daily_return = current_day['daily_return']
                recovery_date = reversal_date # Set recovery date as the reversal date
                # print(recovery_date)
                days_to_recovery = len(pd.bdate_range(start=outlier_date, end=current_day['date']))
                recovery_close_price = current_day['c']
                break
        
    if is_next_day_outlier != 1 and is_prev_day_outlier != 1:
        # Step 4: Find a valid recovery date (must be more than 2 days)
        for i in range(len(similar_price_data)):
            current_day = similar_price_data.iloc[i]
            
            days_to_recovery = len(pd.bdate_range(start=outlier_date, end=current_day['date']))
            
            recovery_days = len(future_data[future_data['date'] <= current_day['date']])
            recovery_date = current_day['date']
            recovery_close_price = current_day['c']
        
        # If a valid recovery date is found, search for the reversal point within the recovery period
        if recovery_date is not None:

            # print(outlier_return_sign)
            for i in range(len(future_data)):
                current_day = future_data.iloc[i]
                
                current_daily_return = current_day['daily_return']
                # print(current_daily_return)
                
                current_return_sign = np.sign(current_daily_return)
                
                # Check if the direction has reversed
                if current_return_sign != outlier_return_sign:
                    reversal_date = current_day['date']
                    reversal_daily_return = current_daily_return
                    break
                    
                # Ensure the current day is within the recovery period
                if current_day['date'] > recovery_date:
                    break
            
            
    
    # If no valid recovery found, set to the max_days_date
    if recovery_date is None:
            
        # Ensure there are at least 10 rows after the outlier date
        if outlier_index + 10 < len(sorted_full_data):
            max_days_date = sorted_full_data.iloc[outlier_index + 10]['date']
            max_days_close_price = sorted_full_data.iloc[outlier_index + 10]['c']
        else:
            # Handle the case where there are fewer than 10 rows after the outlier date
            max_days_date = sorted_full_data.iloc[-1]['date']  # Last available date
            max_days_close_price = sorted_full_data.iloc[-1]['c']  # Last available close price

        reversal_date = None
        reversal_daily_return = None

        # Iterate through the period up to `max_days_date`
        for i in range(outlier_index + 1, sorted_full_data.index[sorted_full_data['date'] == max_days_date][0] + 1):
            current_daily_return = sorted_full_data.iloc[i]['daily_return']
            current_return_sign = np.sign(current_daily_return)

            # Check if the sign is opposite
            if current_return_sign != outlier_return_sign:
                # print("here")
                reversal_date = sorted_full_data.iloc[i]['date']
                reversal_daily_return = sorted_full_data.iloc[i]['daily_return']
                break
                
        recovery_close_price = max_days_close_price
        recovery_date = max_days_date
        days_to_recovery = max_days
    
    # Store the recovery and reversal information
    recovery_info = {
        'Outlier Date': outlier_date,
        'Outlier Close Price': outlier_close,
        'Outlier Daily Return': outlier_daily_return,
        'Reference Point': reference_point_price,
        'Recovery Date': recovery_date,
        'Recovery Close Price': recovery_close_price,
        'Days to Recovery': days_to_recovery,
        'Year': year,
        'Reversal Date': reversal_date,
        'Reversal Daily Return': reversal_daily_return
    }
    
    # Append the result to the recovery_data DataFrame
    recovery_data = pd.concat([recovery_data, pd.DataFrame([recovery_info])], ignore_index=True)

# Display the recovery data
# print(recovery_data)

  recovery_data = pd.concat([recovery_data, pd.DataFrame([recovery_info])], ignore_index=True)
  recovery_data = pd.concat([recovery_data, pd.DataFrame([recovery_info])], ignore_index=True)


In [31]:
outliers.to_csv("/Users/nachiketkhare/Downloads/outliers.csv")

In [94]:
recovery_data.to_csv("/Users/nachiketkhare/Downloads/outlier_recovery_data.csv")

In [93]:
# recovery_data_test.to_csv("/Users/nachiketkhare/Downloads/outlier_recovery_data_test.csv")
recovery_data

Unnamed: 0,Outlier Date,Outlier Close Price,Reference Point,Outlier Daily Return,Recovery Date,Recovery Close Price,Days to Recovery,Year,Reversal Date,Reversal Daily Return
0,2012-02-24,0.74320,0.74770,-0.006018,2012-03-01,0.75060,5,2012,2012-02-27,0.003633
1,2012-06-29,0.79000,0.80380,-0.017168,2012-07-05,0.80700,5,2012,2012-07-02,0.006076
2,2012-07-05,0.80700,0.80700,0.010139,2012-07-09,0.81180,3,2012,2012-07-09,-0.002335
3,2012-07-06,0.81370,0.80700,0.008302,2012-07-09,0.81180,2,2012,2012-07-09,-0.002335
4,2012-07-26,0.81430,0.81430,-0.011412,2012-07-30,0.81560,3,2012,2012-07-30,0.004929
...,...,...,...,...,...,...,...,...,...,...
98,2023-03-17,0.92990,0.94215,-0.013002,2023-03-31,0.91980,10,2023,2023-03-20,0.003237
99,2023-05-09,0.91198,0.90920,0.003058,2023-05-10,0.91046,2,2023,2023-05-10,-0.001667
100,2023-07-13,0.89096,0.89759,-0.007386,2023-07-21,0.89810,7,2023,2023-07-17,0.000956
101,2023-11-14,0.91925,0.93442,-0.016235,2023-11-28,0.90871,10,2023,2023-11-15,0.002013


In [44]:
# # Tolerance level for "very close" comparison (e.g., 1% difference)
# tolerance = 0.005
# max_days = 20

# # DataFrame to store results
# recovery_data_1 = pd.DataFrame(columns=[
#     'Outlier Date', 'Reference Point', 'Outlier Daily Return', 'Recovery Date', 'Recovery Close Price', 
#     'Days to Recovery', 'Year', 'Reversal Day', 'Reversal Date', 'Reversal Daily Return'
# ])

# # Step 1: Identify outliers
# outliers = sorted_full_data[sorted_full_data['is_outlier'] == 1]

# # Step 2: Iterate over each outlier to find the recovery date
# for _, outlier in outliers.iterrows():
#     outlier_date = outlier['date']
    
#     outlier_index = sorted_full_data.index[sorted_full_data['date'] == outlier_date].tolist()[0]

#     if outlier_index > 0:
#         reference_point_price = sorted_full_data.iloc[outlier_index - 1]['c']
#     else:
#         # Handle the case where the outlier is the first row
#         continue
    
#     # print(reference_point_price)

#     outlier_daily_return = outlier['daily_return']
#     outlier_daily_avg_return = outlier['average_daily_return']
#     outlier_return_sign = np.sign(outlier_daily_return)
    
#     outlier_close = outlier['c']
#     year = outlier['year']
    
#     # Step 3: Search for the first date after the outlier date where the close price is similar
#     future_data = sorted_full_data[(sorted_full_data['date'] > outlier_date)]

#     # Limit the search to the next 'max_days' business days
#     future_data = future_data.head(max_days)
    
#     similar_price_data = future_data[np.abs(future_data['c'] - reference_point_price) <= tolerance * outlier_close]
    
#     # Initialize variables for recovery and reversal information
#     recovery_date = None
#     recovery_daily_return = None
#     days_to_recovery = None
    
    
#     # Step 4: Find a valid recovery date (must be more than 2 days)
#     for i in range(len(similar_price_data)):
#         current_day = similar_price_data.iloc[i]
        
#         days_to_recovery = len(pd.bdate_range(start=outlier_date, end=current_day['date']))
#         # print(len(pd.bdate_range(start=outlier_date, end=current_day['date'])))
#         # Check if the recovery period is more than 2 days
#         if days_to_recovery > 2:
#             recovery_days = len(future_data[future_data['date'] <= current_day['date']])
#             recovery_date = current_day['date']
#             recovery_close_price = current_day['c']
#             break
    
#     # If a valid recovery date is found, search for the reversal point within the recovery period
#     if recovery_date is not None:

#         reversal_date = None
#         reversal_daily_return = None
        
#         for i in range(1, len(future_data)):
#             current_day = future_data.iloc[i]
#             current_daily_avg_return = current_day['daily_return']
#             current_return_sign = np.sign(current_daily_avg_return)

#             # print(outlier_date)
#             # print(f"outlier sign {outlier_return_sign}")
#             # print(f"current sign {current_return_sign}")
            
#             # Check if the direction has reversed
#             if current_return_sign != outlier_return_sign:
#                 reversal_date = current_day['date']
#                 reversal_day = len(pd.bdate_range(start=outlier_date, end=current_day['date']))
#                 reversal_daily_return = current_daily_avg_return
#                 break
                
#             # Ensure the current day is within the recovery period
#             if current_day['date'] > recovery_date:
#                 break
            
            
    
#     # If no valid recovery found, set to the max_days_date
#     if recovery_date is None:
        
#         # Ensure there are at least 10 rows after the outlier date
#         if outlier_index + 20 < len(sorted_full_data):
#             max_days_date = sorted_full_data.iloc[outlier_index + 20]['date']
#             max_days_close_price = sorted_full_data.iloc[outlier_index + 20]['c']
#         else:
#             # Handle the case where there are fewer than 10 rows after the outlier date
#             max_days_date = sorted_full_data.iloc[-1]['date']  # Last available date
#             max_days_close_price = sorted_full_data.iloc[-1]['c']  # Last available close price

#         reversal_date = None
#         reversal_daily_return = None
        

#         # Iterate through the period up to `max_days_date`
#         for i in range(outlier_index + 1, sorted_full_data.index[sorted_full_data['date'] == max_days_date][0] + 1):
#             current_avg_return = sorted_full_data.iloc[i]['daily_return']
#             current_return_sign = np.sign(current_avg_return)

#             # print(outlier_date)
#             # print(f"outlier sign {outlier_return_sign}")
#             # print(f"current sign {current_return_sign}")
#             # Check if the sign is opposite
#             if current_return_sign != outlier_return_sign:
#                 # print("here")
#                 reversal_date = sorted_full_data.iloc[i]['date']
#                 reversal_day = None
#                 reversal_daily_return = sorted_full_data.iloc[i]['daily_return']
#                 break
                
#         recovery_close_price = max_days_close_price
#         recovery_date = max_days_date
#         days_to_recovery = max_days
    
#     # Store the recovery and reversal information
#     recovery_info = {
#         'Outlier Date': outlier_date,
#         'Reference Point': reference_point_price,
#         'Outlier Daily Return': outlier_daily_return,
#         'Recovery Date': recovery_date,
#         'Recovery Close Price': recovery_close_price,
#         'Days to Recovery': days_to_recovery,
#         'Year': year,
#         'Reversal Day': reversal_day,
#         'Reversal Date': reversal_date,
#         'Reversal Daily Return': reversal_daily_return
#     }
    
#     # Append the result to the recovery_data DataFrame
#     recovery_data_1 = pd.concat([recovery_data_1, pd.DataFrame([recovery_info])], ignore_index=True)

# # Display the recovery data
# print(recovery_data_1)

In [200]:


# # Tolerance level for "very close" comparison (e.g., 1% difference)
# tolerance = 0.0048
# max_days = 10

# # DataFrame to store results
# recovery_data_test = pd.DataFrame(columns=[
#     'Outlier Date', 'Outlier Close Price', 'Recovery Date', 'Recovery Close Price', 
#     'Days to Recovery', 'Year', 'Reversal Date', 'Reversal Price'
# ])

# # Step 1: Identify outliers
# outliers = sorted_full_data[sorted_full_data['is_outlier'] == 1]

# # Step 2: Iterate over each outlier to find the recovery date
# for _, outlier in outliers.iterrows():
#     outlier_date = outlier['date']
#     outlier_close = outlier['c']
#     year = outlier['year']
    
#     # Step 3: Search for the first date after the outlier date where the close price is similar
#     future_data = sorted_full_data[(sorted_full_data['date'] > outlier_date)]

#     # Limit the search to the next 'max_days' business days
#     future_data = future_data.head(max_days)
    
#     similar_price_data = future_data[np.abs(future_data['c'] - outlier_close) <= tolerance * outlier_close]

#     # Step 4: Identify the initial direction
#     initial_day = future_data.iloc[0]
#     initial_direction = initial_day['c'] - outlier_close
    
#     # Step 5: Find the reversal point
#     reversal_date = None
#     reversal_price = None

#     # Modified part to avoid 1-day recoveries
#     recovery_date = None
#     recovery_close_price = None
#     days_to_recovery = None    
    
#     for i in range(len(similar_price_data)):
#         current_day = similar_price_data.iloc[i]
#         days_to_recovery = (current_day['date'] - outlier_date).days
        
#         # Check if the recovery period is more than 1 day
#         if days_to_recovery > 3:
#             recovery_date = current_day['date']
#             recovery_close_price = current_day['c']
#             break
        
#     for i in range(1, len(future_data)):
#         current_day = future_data.iloc[i]
#         price_change = current_day['c'] - future_data.iloc[i-1]['c']

#         # # Ensure the current day is within the recovery period
#         # if current_day['date'] > recovery_date:
#         #     break
        
#         # Check if the direction has reversed
#         if (initial_direction > 0 and price_change < 0) or (initial_direction < 0 and price_change > 0):
#             reversal_date = current_day['date']
#             reversal_price = current_day['c']
#             break
            
#     # If no valid recovery found, set to the max_days_date
#     if recovery_date is None:
#         max_days_date = outlier_date + BDay(max_days)
#         max_days_close_price = sorted_full_data.loc[sorted_full_data['date'] == max_days_date, 'c'].values[0]
#         recovery_info = {
#             'Outlier Date': outlier_date,
#             'Outlier Close Price': outlier_close,
#             'Recovery Date': max_days_date,
#             'Recovery Close Price': max_days_close_price,
#             'Days to Recovery': max_days,
#             'Year': year,
#             'Reversal Date': reversal_date,
#             'Reversal Price': reversal_price
#         }
#     else:
#         recovery_info = {
#             'Outlier Date': outlier_date,
#             'Outlier Close Price': outlier_close,
#             'Recovery Date': recovery_date,
#             'Recovery Close Price': recovery_close_price,
#             'Days to Recovery': days_to_recovery,
#             'Year': year,
#             'Reversal Date': reversal_date,
#             'Reversal Price': reversal_price
#         }
    
#     # Append the result to the recovery_data DataFrame
#     recovery_data_test = pd.concat([recovery_data_test, pd.DataFrame([recovery_info])], ignore_index=True)

# # Display the recovery data
# print(recovery_data_test)
