In [16]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt
from datetime import timedelta
from polygon import RESTClient
import requests
from pandas.tseries.offsets import BDay

### 1. Collect data for the last 14 years - from 2010 to 2023 in daily granularity.
### 2. Identify top 10 outliers in each year.

In [113]:
def calculate_daily_returns(df, prev_close=None):
    if prev_close is not None:
        df.loc[df.index[0], 'prev_close'] = prev_close
    else:
        df['prev_close'] = df['c'].shift(1)
    df['daily_return'] = (df['c'] - df['prev_close']) / df['prev_close']
    df['average_daily_return'] = df['daily_return'].rolling(window=4, min_periods=1).mean()
    df['abs_daily_return'] = df['average_daily_return'].abs()
    return df

def get_top_outliers(df, n=10):
    return df.nlargest(n, 'abs_daily_return')

# Define the API key and base URL
api_key = 'beBybSi8daPgsTp5yx5cHtHpYcrjp5Jq'

# Define the currency pairs and years
pair = "C:USDCHF"
years = range(2010, 2024)

# Initialize DataFrames
stats_data = pd.DataFrame(columns=['Year', 'Mean', 'Variance', 'Skewness', 'Kurtosis'])
full_data = pd.DataFrame()
outliers_data = pd.DataFrame()

# Loop over each year
for year in years:
    # Format the API endpoint
    start_date = f'{year}-01-01'
    end_date = f'{year}-12-31'
    url = f"https://api.polygon.io/v2/aggs/ticker/{pair}/range/1/day/{start_date}/{end_date}?adjusted=true&sort=asc&limit=50000&apiKey={api_key}"
    
    # Make the API request
    response = requests.get(url)
    data = response.json()
    
    # Check if the request was successful
    if response.status_code == 200 and 'results' in data:
        # Load data into a DataFrame
        df = pd.DataFrame(data['results'])
        # Convert timestamps
        df['date'] = pd.to_datetime(df['t'], unit='ms')
        df.drop(columns=['t'], inplace=True)

        df = df[df['date'].dt.weekday < 5]
        # Calculating returns
        df = calculate_daily_returns(df)

        # Append the data to the full_data DataFrame for the current currency pair
        df['year'] = year
        df['day'] = df['date'].dt.day_name()
        # Find the top 10 outliers based on absolute values of the daily return value
        # df['abs_daily_return'] = df['c'].abs()
        top_10_outliers = df.nlargest(10, 'abs_daily_return')
        outlier_dates = top_10_outliers['date']

        # Create a new column 'is_outlier' in the full_data DataFrame
        df['is_outlier'] = df['date'].isin(outlier_dates).astype(int)
        full_data = pd.concat([full_data, df], ignore_index=True)
        
        # Append outliers to the outliers_data DataFrame for the current currency pair
        top_10_outliers['year'] = year
        outliers_data = pd.concat([outliers_data, top_10_outliers], ignore_index=True)
        

sorted_full_data = full_data.sort_values(by="date")
sorted_outliers_data = outliers_data.sort_values(by="date")

In [114]:
sorted_full_data

Unnamed: 0,v,vw,o,c,h,l,n,date,prev_close,daily_return,average_daily_return,abs_daily_return,year,day,is_outlier
0,69913,1.0329,1.03606,1.02891,1.041960,1.02621,69913,2010-01-04,,,,,2010,Monday,0
1,76822,1.0304,1.02900,1.03341,1.035960,1.02515,76822,2010-01-05,1.02891,0.004374,0.004374,0.004374,2010,Tuesday,0
2,81057,1.0321,1.03341,1.02746,1.037830,1.02480,81057,2010-01-06,1.03341,-0.005758,-0.000692,0.000692,2010,Wednesday,0
3,72488,1.0321,1.02744,1.03370,1.037050,1.02414,72488,2010-01-07,1.02746,0.006073,0.001563,0.001563,2010,Thursday,0
4,73991,1.0311,1.03375,1.02325,1.038370,1.02151,73991,2010-01-08,1.03370,-0.010109,-0.001355,0.001355,2010,Friday,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3568,12916,0.8563,0.85534,0.85581,0.858300,0.85380,12916,2023-12-25,0.85570,0.000129,-0.001385,0.001385,2023,Monday,0
3569,112192,0.8555,0.85600,0.85372,0.857960,0.85280,112192,2023-12-26,0.85581,-0.002442,-0.002545,0.002545,2023,Tuesday,0
3570,174820,0.8495,0.85359,0.84250,0.854870,0.84060,174820,2023-12-27,0.85372,-0.013142,-0.004033,0.004033,2023,Wednesday,0
3571,196178,0.8392,0.84240,0.84337,0.845145,0.83300,196178,2023-12-28,0.84250,0.001033,-0.003606,0.003606,2023,Thursday,0


In [67]:
sorted_full_data.to_csv("/Users/nachiketkhare/Downloads/full_data.csv")

In [144]:
def find_market_entry(future_data, recovery_date, outlier_close):
    reversal_date = None
    reversal_price = None  

    # Step 4: Identify the initial direction
    initial_day = future_data.iloc[0]
    initial_direction = initial_day['c'] - outlier_close
    
    for i in range(1, len(future_data)):
        current_day = future_data.iloc[i]
        price_change = current_day['c'] - future_data.iloc[i-1]['c']
    
        # Ensure the current day is within the recovery period
        if current_day['date'] > recovery_date:
            break
        
        # Check if the direction has reversed
        if (initial_direction > 0 and price_change < 0) or (initial_direction < 0 and price_change > 0):
            reversal_date = current_day['date']
            reversal_price = current_day['c']
            break

In [157]:
# Tolerance level for "very close" comparison (e.g., 1% difference)
tolerance = 0.005
max_days = 10

# DataFrame to store results
recovery_data = pd.DataFrame(columns=[
    'Outlier Date', 'Outlier Close Price', 'Recovery Date', 'Recovery Close Price', 
    'Days to Recovery', 'Year', 'Reversal Date', 'Reversal Price'
])

# Step 1: Identify outliers
outliers = sorted_full_data[sorted_full_data['is_outlier'] == 1]

# Step 2: Iterate over each outlier to find the recovery date
for _, outlier in outliers.iterrows():
    outlier_date = outlier['date']
    outlier_close = outlier['c']
    year = outlier['year']
    
    # Step 3: Search for the first date after the outlier date where the close price is similar
    future_data = sorted_full_data[(sorted_full_data['date'] > outlier_date)]

    # Limit the search to the next 'max_days' business days
    future_data = future_data.head(max_days)
    
    similar_price_data = future_data[np.abs(future_data['c'] - outlier_close) <= tolerance * outlier_close]

    # Initialize variables for recovery and reversal information
    recovery_date = None
    recovery_close_price = None
    days_to_recovery = None
    reversal_date = None
    reversal_price = None
    
    # Step 4: Find a valid recovery date (must be more than 2 days)
    for i in range(len(similar_price_data)):
        current_day = similar_price_data.iloc[i]
        days_to_recovery = (current_day['date'] - outlier_date).days
        
        # Check if the recovery period is more than 2 days
        if days_to_recovery > 2:
            recovery_date = current_day['date']
            recovery_close_price = current_day['c']
            break
    
    # If a valid recovery date is found, search for the reversal point within the recovery period
    if recovery_date is not None:
        for i in range(1, len(future_data)):
            current_day = future_data.iloc[i]
            price_change = current_day['c'] - future_data.iloc[i-1]['c']
            
            # Ensure the current day is within the recovery period
            if current_day['date'] > recovery_date:
                break
            
            # Check if the direction has reversed
            if (initial_direction > 0 and price_change < 0) or (initial_direction < 0 and price_change > 0):
                reversal_date = current_day['date']
                reversal_price = current_day['c']
                break
    
    # If no valid recovery found, set to the max_days_date
    if recovery_date is None:
        max_days_date = outlier_date + BDay(max_days)
        max_days_close_price = sorted_full_data.loc[sorted_full_data['date'] == max_days_date, 'c'].values[0]
        recovery_date = max_days_date
        recovery_close_price = max_days_close_price
        days_to_recovery = max_days
        reversal_date = 'NaN'
        reversal_price = 'NaN'
    
    # Store the recovery and reversal information
    recovery_info = {
        'Outlier Date': outlier_date,
        'Outlier Close Price': outlier_close,
        'Recovery Date': recovery_date,
        'Recovery Close Price': recovery_close_price,
        'Days to Recovery': days_to_recovery,
        'Year': year,
        'Reversal Date': reversal_date,
        'Reversal Price': reversal_price
    }
    
    # Append the result to the recovery_data DataFrame
    recovery_data = pd.concat([recovery_data, pd.DataFrame([recovery_info])], ignore_index=True)

# Display the recovery data
print(recovery_data)

    Outlier Date  Outlier Close Price Recovery Date  Recovery Close Price  \
0     2010-05-05             1.117760    2010-05-13               1.11767   
1     2010-05-06             1.113180    2010-05-10               1.11110   
2     2010-05-18             1.153030    2010-05-21               1.14911   
3     2010-05-19             1.152320    2010-05-25               1.15739   
4     2010-06-17             1.111330    2010-06-21               1.10980   
..           ...                  ...           ...                   ...   
135   2023-07-11             0.878750    2023-07-25               0.86455   
136   2023-07-12             0.866590    2023-07-20               0.86623   
137   2023-07-13             0.859100    2023-07-17               0.86031   
138   2023-07-14             0.863036    2023-07-17               0.86031   
139   2023-07-17             0.860310    2023-07-25               0.86455   

    Days to Recovery  Year        Reversal Date Reversal Price  
0         

  recovery_data = pd.concat([recovery_data, pd.DataFrame([recovery_info])], ignore_index=True)


In [155]:


# Tolerance level for "very close" comparison (e.g., 1% difference)
tolerance = 0.005
max_days = 10

# DataFrame to store results
recovery_data_test = pd.DataFrame(columns=[
    'Outlier Date', 'Outlier Close Price', 'Recovery Date', 'Recovery Close Price', 
    'Days to Recovery', 'Year', 'Reversal Date', 'Reversal Price'
])

# Step 1: Identify outliers
outliers = sorted_full_data[sorted_full_data['is_outlier'] == 1]

# Step 2: Iterate over each outlier to find the recovery date
for _, outlier in outliers.iterrows():
    outlier_date = outlier['date']
    outlier_close = outlier['c']
    year = outlier['year']
    
    # Step 3: Search for the first date after the outlier date where the close price is similar
    future_data = sorted_full_data[(sorted_full_data['date'] > outlier_date)]

    # Limit the search to the next 'max_days' business days
    future_data = future_data.head(max_days)
    
    similar_price_data = future_data[np.abs(future_data['c'] - outlier_close) <= tolerance * outlier_close]

    # Step 4: Identify the initial direction
    initial_day = future_data.iloc[0]
    initial_direction = initial_day['c'] - outlier_close
    
    # Step 5: Find the reversal point
    reversal_date = None
    reversal_price = None

    # Modified part to avoid 1-day recoveries
    recovery_date = None
    recovery_close_price = None
    days_to_recovery = None    
    
    for i in range(len(similar_price_data)):
        current_day = similar_price_data.iloc[i]
        days_to_recovery = (current_day['date'] - outlier_date).days
        
        # Check if the recovery period is more than 1 day
        if days_to_recovery > 2:
            recovery_date = current_day['date']
            recovery_close_price = current_day['c']
            break
        
    for i in range(1, len(future_data)):
        current_day = future_data.iloc[i]
        price_change = current_day['c'] - future_data.iloc[i-1]['c']

        # # Ensure the current day is within the recovery period
        # if current_day['date'] > recovery_date:
        #     break
        
        # Check if the direction has reversed
        if (initial_direction > 0 and price_change < 0) or (initial_direction < 0 and price_change > 0):
            reversal_date = current_day['date']
            reversal_price = current_day['c']
            break
            
    # If no valid recovery found, set to the max_days_date
    if recovery_date is None:
        max_days_date = outlier_date + BDay(max_days)
        max_days_close_price = sorted_full_data.loc[sorted_full_data['date'] == max_days_date, 'c'].values[0]
        recovery_info = {
            'Outlier Date': outlier_date,
            'Outlier Close Price': outlier_close,
            'Recovery Date': max_days_date,
            'Recovery Close Price': max_days_close_price,
            'Days to Recovery': max_days,
            'Year': year,
            'Reversal Date': reversal_date,
            'Reversal Price': reversal_price
        }
    else:
        recovery_info = {
            'Outlier Date': outlier_date,
            'Outlier Close Price': outlier_close,
            'Recovery Date': recovery_date,
            'Recovery Close Price': recovery_close_price,
            'Days to Recovery': days_to_recovery,
            'Year': year,
            'Reversal Date': reversal_date,
            'Reversal Price': reversal_price
        }
    
    # Append the result to the recovery_data DataFrame
    recovery_data_test = pd.concat([recovery_data_test, pd.DataFrame([recovery_info])], ignore_index=True)

# Display the recovery data
print(recovery_data_test)


    Outlier Date  Outlier Close Price Recovery Date  Recovery Close Price  \
0     2010-05-05             1.117760    2010-05-13               1.11767   
1     2010-05-06             1.113180    2010-05-10               1.11110   
2     2010-05-18             1.153030    2010-05-21               1.14911   
3     2010-05-19             1.152320    2010-05-25               1.15739   
4     2010-06-17             1.111330    2010-06-21               1.10980   
..           ...                  ...           ...                   ...   
135   2023-07-11             0.878750    2023-07-25               0.86455   
136   2023-07-12             0.866590    2023-07-20               0.86623   
137   2023-07-13             0.859100    2023-07-17               0.86031   
138   2023-07-14             0.863036    2023-07-17               0.86031   
139   2023-07-17             0.860310    2023-07-25               0.86455   

    Days to Recovery  Year Reversal Date  Reversal Price  
0               

  recovery_data_test = pd.concat([recovery_data_test, pd.DataFrame([recovery_info])], ignore_index=True)


In [158]:
recovery_data.to_csv("/Users/nachiketkhare/Downloads/outlier_recovery_data.csv")

In [156]:
recovery_data_test.to_csv("/Users/nachiketkhare/Downloads/outlier_recovery_data_test.csv")