In [1]:
from pandas.tseries.offsets import BDay
import requests
from datetime import timedelta
from scipy.stats import pearsonr
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Reshape, Input, Masking, TimeDistributed
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score

# Data Preparatation

In [2]:
def fetch_data(url):
    response = requests.get(url)
    if response.status_code != 200:
        print("Error fetching data:", response.status_code, response.text)
        return None
    data = response.json()
    if 'results' not in data:
        print("No 'results' key in response:", data)
        return None
    return data

def calculate_daily_returns(df, prev_close=None):
    if prev_close is not None:
        df.loc[df.index[0], 'prev_close'] = prev_close
    else:
        df['prev_close'] = df['c'].shift(1)
    df['daily_return'] = (df['c'] - df['prev_close']) / df['prev_close']
    df['abs_daily_return'] = df['daily_return'].abs()
    return df

def get_top_outliers(df, n=10):
    return df.nlargest(n, 'abs_daily_return')

def update_outliers_list(current_df, historical_outliers_df, real_time_outliers_df, n=10):
    if 'source' not in current_df.columns:
        current_df['source'] = 'real-time'
    combined_df = pd.concat([historical_outliers_df, current_df])
    updated_outliers_df = combined_df.nlargest(n, 'abs_daily_return')
    updated_historical_outliers_df = updated_outliers_df[updated_outliers_df['source'] == 'historical']
    updated_real_time_outliers_df = updated_outliers_df[updated_outliers_df['source'] == 'real-time']
    return updated_historical_outliers_df, updated_real_time_outliers_df

def convert_timestamps(df):
    df['date'] = pd.to_datetime(df['t'], unit='ms')
    df.drop(columns=['t'], inplace=True)
    return df

# API key and endpoints
api_key = 'beBybSi8daPgsTp5yx5cHtHpYcrjp5Jq'
today = pd.Timestamp.now().date()
start_date = today - pd.DateOffset(years=1)
start_date_formatted = start_date.strftime('%Y-%m-%d')
end_date = today - pd.DateOffset(days=1)
end_date_formatted = end_date.strftime('%Y-%m-%d')
pair = 'C:USDCHF'
historical_url = f'https://api.polygon.io/v2/aggs/ticker/{pair}/range/1/day/{start_date_formatted}/{end_date_formatted}?adjusted=true&sort=asc&apiKey={api_key}'
real_time_url = f'https://api.polygon.io/v2/aggs/ticker/{pair}/range/1/day/{today}/{today}?adjusted=true&sort=asc&apiKey={api_key}'

# Fetch and process historical data
historical_data = fetch_data(historical_url)
if historical_data:
    historical_df = pd.DataFrame(historical_data['results'])
    historical_df = convert_timestamps(historical_df)
    historical_df = calculate_daily_returns(historical_df)
    historical_df['source'] = 'historical'
    historical_outliers_df = get_top_outliers(historical_df)
else:
    print("Failed to fetch or process historical data.")

# Fetch and process real-time data
real_time_data = fetch_data(real_time_url)
if real_time_data and 'results' in real_time_data:
    real_time_df = pd.DataFrame(real_time_data['results'])
    real_time_df = convert_timestamps(real_time_df)
    # Use the last close from historical data
    last_close = historical_df['c'].iloc[-1] if not historical_df.empty else None
    real_time_df = calculate_daily_returns(real_time_df, prev_close=last_close)
    real_time_df['source'] = 'real-time'
    updated_historical_outliers_df, updated_real_time_outliers_df = update_outliers_list(real_time_df, historical_outliers_df, pd.DataFrame())
    # Update historical data
    historical_df = pd.concat([historical_df.iloc[1:], real_time_df])  # Keep historical data rolling
else:
    print("No new data available or failed to fetch real-time data.")
    
# Combine data for Top 10 Outliers
full_outlier_df = pd.concat([updated_historical_outliers_df, updated_real_time_outliers_df])

# Print the Outliers
full_outlier_df

Unnamed: 0,v,vw,o,c,h,l,n,date,prev_close,daily_return,abs_daily_return,source
222,187960,0.8932,0.88558,0.8975,0.89937,0.8836,187960,2024-03-21,0.88555,0.013494,0.013494,historical
106,168024,0.8965,0.90127,0.8891,0.90271,0.8875,168024,2023-11-14,0.9012,-0.013427,0.013427,historical
190,191642,0.8825,0.8758,0.88742,0.8881,0.8752,191642,2024-02-13,0.87581,0.013256,0.013256,historical
149,174820,0.8495,0.85359,0.8425,0.85487,0.8406,174820,2023-12-27,0.85372,-0.013142,0.013142,historical
323,160058,0.8871,0.89375,0.88195,0.89411,0.88166,160058,2024-07-17,0.89338,-0.012794,0.012794,historical
239,159390,0.9089,0.90297,0.91314,0.9147,0.9026,159390,2024-04-10,0.90303,0.011196,0.011196,historical
282,172177,0.9074,0.91319,0.90332,0.91408,0.90053,172177,2024-05-30,0.91311,-0.010722,0.010722,historical
181,184263,0.861,0.8576,0.866698,0.8682,0.8551,184263,2024-02-02,0.8576,0.010609,0.010609,historical
256,180821,0.9142,0.91075,0.91959,0.91965,0.9102,180821,2024-04-30,0.91072,0.00974,0.00974,historical
154,148722,0.8473,0.84185,0.85004,0.85067,0.8415,148722,2024-01-02,0.84192,0.009645,0.009645,historical


In [3]:
sorted_outliers_data = full_outlier_df.sort_values(by="date")
sorted_outliers_data

Unnamed: 0,v,vw,o,c,h,l,n,date,prev_close,daily_return,abs_daily_return,source
106,168024,0.8965,0.90127,0.8891,0.90271,0.8875,168024,2023-11-14,0.9012,-0.013427,0.013427,historical
149,174820,0.8495,0.85359,0.8425,0.85487,0.8406,174820,2023-12-27,0.85372,-0.013142,0.013142,historical
154,148722,0.8473,0.84185,0.85004,0.85067,0.8415,148722,2024-01-02,0.84192,0.009645,0.009645,historical
181,184263,0.861,0.8576,0.866698,0.8682,0.8551,184263,2024-02-02,0.8576,0.010609,0.010609,historical
190,191642,0.8825,0.8758,0.88742,0.8881,0.8752,191642,2024-02-13,0.87581,0.013256,0.013256,historical
222,187960,0.8932,0.88558,0.8975,0.89937,0.8836,187960,2024-03-21,0.88555,0.013494,0.013494,historical
239,159390,0.9089,0.90297,0.91314,0.9147,0.9026,159390,2024-04-10,0.90303,0.011196,0.011196,historical
256,180821,0.9142,0.91075,0.91959,0.91965,0.9102,180821,2024-04-30,0.91072,0.00974,0.00974,historical
282,172177,0.9074,0.91319,0.90332,0.91408,0.90053,172177,2024-05-30,0.91311,-0.010722,0.010722,historical
323,160058,0.8871,0.89375,0.88195,0.89411,0.88166,160058,2024-07-17,0.89338,-0.012794,0.012794,historical


In [4]:
def fetch_data(url):
    response = requests.get(url)
    if response.status_code != 200:
        print("Error fetching data:", response.status_code, response.text)
        return None
    data = response.json()
    if 'results' not in data:
        print("No 'results' key in response:", data)
        return None
    return data

def calculate_daily_returns_threshold(df, prev_close=None):
    if prev_close is not None:
        df.loc[df.index[0], 'prev_close'] = prev_close
    else:
        df['prev_close'] = df['c'].shift(1)
    df['daily_return'] = (df['c'] - df['prev_close']) / df['prev_close']
    # print(df['daily_return'])
    df['abs_daily_return'] = df['daily_return'].abs()
    return df

def convert_timestamps(df):
    df['date'] = pd.to_datetime(df['t'], unit='ms')
    df.drop(columns=['t'], inplace=True)
    return df


def fetch_daily_return_prior(symbol, current_date, previous_date, api_key):
    url = f'https://api.polygon.io/v2/aggs/ticker/{symbol}/range/1/day/{previous_date}/{current_date}?adjusted=true&sort=asc&apiKey={api_key}'
    data = fetch_data(url)
    if data and 'results' in data and len(data['results']) > 0:
        df = pd.DataFrame(data['results'])
        # print(df)
        df = convert_timestamps(df)
        # print(f"converted: {df}")
        df = df.loc[::-1].reset_index(drop=True)
        df = calculate_daily_returns_threshold(df)
        print(f"daily return: {df}")
        # print(f"daily_return {df['abs_daily_return']}")
        if len(df['abs_daily_return']) == 2:
            return df['abs_daily_return'].iloc[1]
    return None
    

def find_prior_outlier_threshold(df, symbol, api_key):
    df = df.sort_values(by='date').reset_index(drop=True)
    df['date'] = pd.to_datetime(df['date'])
    
    results = []
    
    for index, row in df.iterrows():
        current_date = row['date']
        current_return = row['abs_daily_return']
        print(f"outlier date: {current_date}")
        while True:
            # print(f"current_return: {current_return}")
            previous_date = current_date - pd.DateOffset(days=1)
            # print(f"previous date: {previous_date}")
            current_date_str = current_date.strftime('%Y-%m-%d')
            previous_date_str = previous_date.strftime('%Y-%m-%d')
            previous_return = fetch_daily_return_prior(symbol, current_date_str, previous_date_str, api_key)
            # print(f"previous return: {previous_return}")
            if previous_return is not None:
                if (current_return - previous_return) < 0:
                    
                    days_difference = (row['date'] - previous_date).days
                    # print(current_return - previous_return)
                    results.append({
                        'outlier_date': row['date'],
                        'prior_outlier_threshold_date': previous_date,
                        'days_difference': days_difference
                    })
                    # print()
                    break
                else:
                    current_date = previous_date
                    current_return = previous_return
            else:
                # print(f"Failed to fetch data for {previous_date_str}")
                # break
                current_date = previous_date

    return pd.DataFrame(results)


# Applying the corrected function to the data
prior_outlier_thresholds = find_prior_outlier_threshold(sorted_outliers_data, pair, api_key)

outlier date: 2023-11-14 00:00:00
daily return:         v      vw        o       c        h       l       n       date  \
0  168024  0.8965  0.90127  0.8891  0.90271  0.8875  168024 2023-11-14   
1  149889  0.9024  0.90192  0.9012  0.90530  0.9004  149889 2023-11-13   

   prev_close  daily_return  abs_daily_return  
0         NaN           NaN               NaN  
1      0.8891      0.013609          0.013609  
outlier date: 2023-12-27 00:00:00
daily return:         v      vw        o        c        h       l       n       date  \
0  174820  0.8495  0.85359  0.84250  0.85487  0.8406  174820 2023-12-27   
1  112192  0.8555  0.85600  0.85372  0.85796  0.8528  112192 2023-12-26   

   prev_close  daily_return  abs_daily_return  
0         NaN           NaN               NaN  
1      0.8425      0.013318          0.013318  
outlier date: 2024-01-02 00:00:00
daily return:         v      vw        o        c        h         l       n       date  \
0  148722  0.8473  0.84185  0.85004  0.850

In [5]:
def fetch_daily_return_post(symbol, current_date, post_date, api_key):
    url = f'https://api.polygon.io/v2/aggs/ticker/{symbol}/range/1/day/{current_date}/{post_date}?adjusted=true&sort=asc&apiKey={api_key}'
    data = fetch_data(url)
    if data and 'results' in data and len(data['results']) > 0:
        df = pd.DataFrame(data['results'])
        # print(df)
        df = convert_timestamps(df)
        # print(f"converted: {df}")
        df = calculate_daily_returns_threshold(df)
        # print(f"daily return: {df}")
        # print(f"daily_return {df['abs_daily_return']}")
        if len(df['abs_daily_return']) == 2:
            return df['abs_daily_return'].iloc[1]
    return None


def find_post_outlier_threshold(df, symbol, api_key):
    df = df.sort_values(by='date').reset_index(drop=True)
    df['date'] = pd.to_datetime(df['date'])
    
    results = []
    
    for index, row in df.iterrows():
        current_date = row['date']
        current_return = row['abs_daily_return']
        # print(f"outlier date: {current_date}")
        while True:
            # print(f"current_return: {current_return}")
            post_date = current_date + pd.DateOffset(days=1)
            # print(f"previous date: {previous_date}")
            current_date_str = current_date.strftime('%Y-%m-%d')
            post_date_str = post_date.strftime('%Y-%m-%d')
            post_return = fetch_daily_return_post(symbol, current_date_str, post_date_str, api_key)
            # print(f"post return: {post_return}")
            if post_return is not None:
                if (current_return - post_return) < 0:
                    
                    days_difference = (post_date - row['date']).days
                    # print(current_return - post_return)
                    results.append({
                        'outlier_date': row['date'],
                        'post_outlier_threshold_date': post_date,
                        'days_difference': days_difference
                    })
                    # print()
                    break
                else:
                    current_date = post_date
                    current_return = post_return
            else:
                # print(f"Failed to fetch data for {post_date_str}")
                # break
                current_date = post_date

    return pd.DataFrame(results)

post_outlier_thresholds = find_post_outlier_threshold(sorted_outliers_data, pair, api_key)

In [6]:
prior_outlier_thresholds.rename(columns={'outlier_date': 'date'}, inplace=True)
sorted_outliers_data = pd.merge(
    sorted_outliers_data,
    prior_outlier_thresholds[['date', 'days_difference']],
    on = 'date',
    how='left'
)
sorted_outliers_data.rename(columns={'days_difference':'pre_threshold'}, inplace=True)

post_outlier_thresholds.rename(columns={'outlier_date': 'date'}, inplace=True)
sorted_outliers_data = pd.merge(
    sorted_outliers_data,
    post_outlier_thresholds[['date', 'days_difference']],
    on = 'date',
)
sorted_outliers_data.rename(columns={'days_difference':'post_threshold'}, inplace=True)

In [7]:
sorted_outliers_data

Unnamed: 0,v,vw,o,c,h,l,n,date,prev_close,daily_return,abs_daily_return,source,pre_threshold,post_threshold
0,168024,0.8965,0.90127,0.8891,0.90271,0.8875,168024,2023-11-14,0.9012,-0.013427,0.013427,historical,1,3
1,174820,0.8495,0.85359,0.8425,0.85487,0.8406,174820,2023-12-27,0.85372,-0.013142,0.013142,historical,1,2
2,148722,0.8473,0.84185,0.85004,0.85067,0.8415,148722,2024-01-02,0.84192,0.009645,0.009645,historical,5,2
3,184263,0.861,0.8576,0.866698,0.8682,0.8551,184263,2024-02-02,0.8576,0.010609,0.010609,historical,4,5
4,191642,0.8825,0.8758,0.88742,0.8881,0.8752,191642,2024-02-13,0.87581,0.013256,0.013256,historical,5,2
5,187960,0.8932,0.88558,0.8975,0.89937,0.8836,187960,2024-03-21,0.88555,0.013494,0.013494,historical,4,4
6,159390,0.9089,0.90297,0.91314,0.9147,0.9026,159390,2024-04-10,0.90303,0.011196,0.011196,historical,7,2
7,180821,0.9142,0.91075,0.91959,0.91965,0.9102,180821,2024-04-30,0.91072,0.00974,0.00974,historical,7,2
8,172177,0.9074,0.91319,0.90332,0.91408,0.90053,172177,2024-05-30,0.91311,-0.010722,0.010722,historical,1,4
9,160058,0.8871,0.89375,0.88195,0.89411,0.88166,160058,2024-07-17,0.89338,-0.012794,0.012794,historical,1,5


In [8]:
# Convert dates in dataset to datetime objects
sorted_outliers_data['date'] = pd.to_datetime(sorted_outliers_data['date'])

date_ranges = pd.DataFrame({
    "start_date": sorted_outliers_data.apply(lambda row: row['date'] - BDay(row['pre_threshold']), axis=1),
    "end_date": sorted_outliers_data.apply(lambda row: row['date'] + BDay(row['post_threshold']), axis=1),
    "outlier_date": sorted_outliers_data['date'],
    # "year": sorted_outliers_data['year']
})

date_ranges

Unnamed: 0,start_date,end_date,outlier_date
0,2023-11-13,2023-11-17,2023-11-14
1,2023-12-26,2023-12-29,2023-12-27
2,2023-12-26,2024-01-04,2024-01-02
3,2024-01-29,2024-02-09,2024-02-02
4,2024-02-06,2024-02-15,2024-02-13
5,2024-03-15,2024-03-27,2024-03-21
6,2024-04-01,2024-04-12,2024-04-10
7,2024-04-19,2024-05-02,2024-04-30
8,2024-05-29,2024-06-05,2024-05-30
9,2024-07-16,2024-07-24,2024-07-17


In [9]:
def classify_recovery_range(df, start_date, end_date, fast_threshold=4):
    # Filter the data for the specified range
    range_data = df[(df['date'] >= start_date) & (df['date'] <= end_date)]
    if range_data.empty:
        return "no data"
    
    # Calculate cumulative percentage change
    cumulative_change = range_data['pct_change'].sum()
    
    # Classify the recovery based on the cumulative change
    if cumulative_change >= fast_threshold:
        return 'fast recovery'
    else:
        return 'slow recovery'

def calculate_daily_returns(df, prev_close=None):
    if prev_close is not None:
        df.loc[df.index[0], 'prev_close'] = prev_close
    else:
        df['prev_close'] = df['c'].shift(1)
    df['daily_return'] = (df['c'] - df['prev_close']) / df['prev_close']
    df['abs_daily_return'] = df['daily_return'].abs()
    df['pct_change'] = df['c'].pct_change() * 100
    
    df['direction'] = df['pct_change'].apply(lambda x: 'Up' if x > 0 else 'Down')
    df['pct_change'] = df['pct_change'].abs()
    
    return df

In [10]:
def fetch_hourly_data_chunk(symbol, start_date, end_date, api_key):
    formatted_start_date = start_date.strftime('%Y-%m-%d')
    formatted_end_date = end_date.strftime('%Y-%m-%d')

    url = f"https://api.polygon.io/v2/aggs/ticker/{symbol}/range/1/hour/{formatted_start_date}/{formatted_end_date}?apiKey={api_key}"
    response = requests.get(url)
    
    if response.status_code != 200:
        print(f"Failed to fetch data: {response.status_code} - {response.text}")
        return None
    
    response_data = response.json()
    
    if 'results' not in response_data:
        print(f"No 'results' in response: {response_data}")
        return None

    df = pd.DataFrame(response_data['results'])
    df['date'] = pd.to_datetime(df['t'], unit='ms')
    df.drop(columns=['t'], inplace=True)
    
    return df

def fetch_and_process_hourly_data(symbol, start_date, end_date, api_key):
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', None)
    
    # Split the date range into smaller chunks
    chunk_size = 3  # Fetch data in 7-day chunks
    date_ranges = [(start_date + timedelta(days=i*chunk_size), 
                    min(end_date, start_date + timedelta(days=(i+1)*chunk_size - 1)))
                   for i in range((end_date - start_date).days // chunk_size + 1)]

    # print((end_date - start_date).days // chunk_size + 1)
    all_data = []

    for start, end in date_ranges:
        chunk_data = fetch_hourly_data_chunk(symbol, start, end, api_key)
        if chunk_data is not None:
            all_data.append(chunk_data)
    
    if not all_data:
        print("No data fetched")
        return None
    
    df = pd.concat(all_data)
    hourly_data = calculate_daily_returns(df)
    hourly_data['Recovery'] = classify_recovery_range(hourly_data, start_date, end_date)
    hourly_data.set_index('date', inplace=True)
    
    full_index = pd.date_range(start=start_date, end=end_date + timedelta(days=1), freq='H')
    hourly_data = hourly_data.reindex(full_index)
    
    hourly_data.reset_index(inplace=True)
    hourly_data.rename(columns={'index': 'date'}, inplace=True)
    
    return hourly_data

In [11]:
# Convert start_date, end_date, and outlier_date to Timestamp for consistent comparison
start_date_co = pd.Timestamp(date_ranges['start_date'].iloc[-1])
end_date_co = pd.Timestamp(date_ranges['end_date'].iloc[-1]) + pd.Timedelta(days=1)  # Extend the end date by one additional day
outlier_date_co = pd.Timestamp(date_ranges['outlier_date'].iloc[-1])

# Get daily data for the range including days before and after the outlier
daily_data = fetch_and_process_hourly_data(pair, start_date_co, end_date_co, api_key)

# Filter out weekdends
daily_data = daily_data[~daily_data['date'].dt.weekday.isin([5,6])]

# prior_data from start_date to outlier_date inclusive
prior_data = daily_data[(daily_data['date'] >= start_date_co) & (daily_data['date'] < outlier_date_co)]
prior_data["day type"] = "prior day"

# outlier_data is for the hourly data on the day of the outlier
outlier_data = daily_data[(daily_data['date'].dt.date == outlier_date_co.date())]
outlier_data["day type"] = "outlier day"

# post_data from the day after outlier_date to end_date
post_outlier_ts = outlier_date_co + pd.Timedelta(days=1)  # Starting the day after the outlier_date
post_data = daily_data[(daily_data['date'] >= post_outlier_ts) & (daily_data['date'] <= end_date_co)]
post_data["day type"] = "post day"

  full_index = pd.date_range(start=start_date, end=end_date + timedelta(days=1), freq='H')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prior_data["day type"] = "prior day"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outlier_data["day type"] = "outlier day"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  post_data["day type"] = "post day"


In [12]:
prior_data

Unnamed: 0,date,v,vw,o,c,h,l,n,prev_close,daily_return,abs_daily_return,pct_change,direction,Recovery,day type
0,2024-07-16 00:00:00,3521.0,0.8956,0.89548,0.89568,0.89582,0.8952,3521.0,,,,,Down,fast recovery,prior day
1,2024-07-16 01:00:00,3632.0,0.8957,0.89565,0.89578,0.89596,0.8952,3632.0,0.89568,0.000112,0.000112,0.011165,Up,fast recovery,prior day
2,2024-07-16 02:00:00,2833.0,0.8958,0.8956,0.89581,0.89601,0.8955,2833.0,0.89578,3.3e-05,3.3e-05,0.003349,Up,fast recovery,prior day
3,2024-07-16 03:00:00,2702.0,0.8957,0.89583,0.8955,0.89584,0.8953,2702.0,0.89581,-0.000346,0.000346,0.034606,Down,fast recovery,prior day
4,2024-07-16 04:00:00,5261.0,0.8954,0.89569,0.895,0.8958,0.8949,5261.0,0.8955,-0.000558,0.000558,0.055835,Down,fast recovery,prior day
5,2024-07-16 05:00:00,5948.0,0.8953,0.89527,0.89515,0.8955,0.8949,5948.0,0.895,0.000168,0.000168,0.01676,Up,fast recovery,prior day
6,2024-07-16 06:00:00,7917.0,0.895,0.89513,0.8945,0.89531,0.8943,7917.0,0.89515,-0.000726,0.000726,0.072614,Down,fast recovery,prior day
7,2024-07-16 07:00:00,8885.0,0.8948,0.89475,0.89499,0.89519,0.8942,8885.0,0.8945,0.000548,0.000548,0.054779,Up,fast recovery,prior day
8,2024-07-16 08:00:00,7752.0,0.8951,0.89504,0.89516,0.89554,0.8946,7752.0,0.89499,0.00019,0.00019,0.018995,Up,fast recovery,prior day
9,2024-07-16 09:00:00,8434.0,0.8954,0.89515,0.8955,0.8959,0.8947,8434.0,0.89516,0.00038,0.00038,0.037982,Up,fast recovery,prior day


In [13]:
post_data

Unnamed: 0,date,v,vw,o,c,h,l,n,prev_close,daily_return,abs_daily_return,pct_change,direction,Recovery,day type
48,2024-07-18 00:00:00,5674.0,0.883,0.88203,0.88362,0.88372,0.882,5674.0,0.88195,0.001894,0.001894,0.189353,Up,fast recovery,post day
49,2024-07-18 01:00:00,5576.0,0.8841,0.88363,0.88433,0.88455,0.8834,5576.0,0.88362,0.000804,0.000804,0.080351,Up,fast recovery,post day
50,2024-07-18 02:00:00,4355.0,0.8844,0.8843,0.88439,0.88457,0.8839,4355.0,0.88433,6.8e-05,6.8e-05,0.006785,Up,fast recovery,post day
51,2024-07-18 03:00:00,4009.0,0.8844,0.8844,0.88465,0.88471,0.8838,4009.0,0.88439,0.000294,0.000294,0.029399,Up,fast recovery,post day
52,2024-07-18 04:00:00,6461.0,0.8845,0.8846,0.88462,0.8848,0.884,6461.0,0.88465,-3.4e-05,3.4e-05,0.003391,Down,fast recovery,post day
53,2024-07-18 05:00:00,7241.0,0.8842,0.88461,0.8835,0.88471,0.8834,7241.0,0.88462,-0.001266,0.001266,0.126608,Down,fast recovery,post day
54,2024-07-18 06:00:00,8892.0,0.884,0.8837,0.88391,0.88432,0.8834,8892.0,0.8835,0.000464,0.000464,0.046406,Up,fast recovery,post day
55,2024-07-18 07:00:00,11086.0,0.8836,0.88391,0.88376,0.88446,0.8824,11086.0,0.88391,-0.00017,0.00017,0.01697,Down,fast recovery,post day
56,2024-07-18 08:00:00,9256.0,0.8839,0.88377,0.8841,0.88454,0.883,9256.0,0.88376,0.000385,0.000385,0.038472,Up,fast recovery,post day
57,2024-07-18 09:00:00,7024.0,0.8841,0.88426,0.88395,0.8845,0.8835,7024.0,0.8841,-0.00017,0.00017,0.016966,Down,fast recovery,post day


# LSTM Model

In [None]:
# # Load and preprocess data
# data = pd.read_csv('/Users/priyakundu/Downloads/USDCHF_hourly.csv')

# # Preprocess data
# def preprocess_data(data):
#     # Encode categorical columns
#     le_direction = LabelEncoder()
#     le_recovery = LabelEncoder()
#     data['direction'] = le_direction.fit_transform(data['direction'])
#     data['Recovery'] = le_recovery.fit_transform(data['Recovery'])

#     # Scale numeric columns
#     scaler = MinMaxScaler(feature_range=(0, 1))
#     numeric_columns = ['vw', 'o', 'c', 'h', 'l', 'n', 'prev_close', 'daily_return', 'abs_daily_return', 'pct_change']
#     data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

#     return data, le_direction, le_recovery, scaler

# # Assuming data preprocessing function from previous discussion
# data, le_direction, le_recovery, scaler = preprocess_data(data)

# # Function to prepare sequence
# def prepare_sequences(data, max_seq_length):
#     X, y_dir, y_rec = [], [], []
#     grouped = data.groupby('outlier_id')
    
#     for _, group in grouped:
#         features = group[['vw', 'o', 'c', 'h', 'l', 'n', 'prev_close', 'daily_return', 'abs_daily_return', 'pct_change']].values
#         label_dir = group['direction'].iloc[:24]  # Assume we predict the last label of each group
#         label_rec = group['Recovery'].iloc[:24]   # Assume we predict the last label of each group
        
#         # Pad features to ensure uniform length
#         padded_features = pad_sequences([features], maxlen=max_seq_length, dtype='float32', padding='post', truncating='post')[0]
        
#         X.append(padded_features)
#         y_dir.append(label_dir)
#         y_rec.append(label_rec)
    
#     return np.array(X), np.array(y_dir), np.array(y_rec)

# # Determine a suitable max_seq_length
# max_seq_length = max(data.groupby('outlier_id').apply(lambda x: len(x)))
# X, y_dir, y_rec = prepare_sequences(data, max_seq_length)


# # Model architecture
# input_layer = Input(shape=(max_seq_length, X.shape[2]))  # Fixed sequence length
# lstm_layer = LSTM(50)(input_layer)
# output_dir = Dense(np.max(y_dir) + 1, activation='softmax', name='direction_output')(lstm_layer)
# output_rec = Dense(np.max(y_rec) + 1, activation='softmax', name='recovery_output')(lstm_layer)

# model = Model(inputs=input_layer, outputs=[output_dir, output_rec])

# # Compile the model
# model.compile(optimizer=Adam(learning_rate=0.005),
#               loss={'direction_output': 'sparse_categorical_crossentropy',
#                     'recovery_output': 'sparse_categorical_crossentropy'},
#               metrics={'direction_output': ['accuracy'],
#                        'recovery_output': ['accuracy']})

# # Train the model
# model.fit(X, {'direction_output': y_dir, 'recovery_output': y_rec}, epochs=50, batch_size=32)

In [None]:
# # Preprocess prior_data
# prior_data_processed, _, _, _ = preprocess_data(prior_data)

# # Prepare sequences for testing
# def prepare_test_sequences(data, max_seq_length=20, required_sequences=72):
#     X_test = []
#     inputs = data[['vw', 'o', 'c', 'h', 'l', 'n', 'prev_close', 'daily_return', 'abs_daily_return', 'pct_change']].values
#     # Loop to generate sequences, ensuring you have enough data to form required_sequences
#     for i in range(min(len(inputs) - max_seq_length + 1, required_sequences)):
#         X_test.append(inputs[i:i+max_seq_length])
#     X_test_padded = pad_sequences(X_test, maxlen=max_seq_length, dtype='float32', padding='post', truncating='post')
#     return np.array(X_test_padded)

# # Generate test sequences ensuring to produce enough sequences up to 72 if possible
# X_test = prepare_test_sequences(prior_data, required_sequences=72)

# # Make predictions
# predictions = model.predict(X_test)
# predicted_direction = np.argmax(predictions[0], axis=1)
# predicted_recovery = np.argmax(predictions[1], axis=1)

# # Adjust post_day DataFrame to match the predictions length
# if len(post_data) > 72:
#     post_data = post_data.iloc[:72]
# elif len(post_data) < 72:
#     post_data = post_data.iloc[:36]
# elif len(post_data) < 36:
#     post_data = post_data.iloc[:24]

# # Extract actual labels
# actual_direction = le_direction.transform(post_data['direction'])
# actual_recovery = le_recovery.transform(post_data['Recovery'])

# # Calculate accuracy
# direction_accuracy = accuracy_score(actual_direction, predicted_direction)
# recovery_accuracy = accuracy_score(actual_recovery, predicted_recovery)

# print(f"Direction Accuracy: {direction_accuracy * 100:.2f}%")
# print(f"Recovery Accuracy: {recovery_accuracy * 100:.2f}%")

In [None]:
# def prepare_test_sequences(data, max_seq_length=20, total_sequences=72):
#     inputs = data[data['day type'] == 'prior day'][['vw', 'o', 'c', 'h', 'l', 'n', 'prev_close', 'daily_return', 'abs_daily_return', 'pct_change']].values
#     # Ensure that you have enough data to create the required number of sequences
#     sequence_count = min(len(inputs) - max_seq_length + 1, total_sequences)
#     X_test = [inputs[i:i+max_seq_length] for i in range(sequence_count)]
#     X_test_padded = pad_sequences(X_test, maxlen=max_seq_length, dtype='float32', padding='post', truncating='post')
#     return np.array(X_test_padded)

# # Apply this function to your test data
# X_test = prepare_test_sequences(prior_data)

# predictions = model.predict(X_test)
# predicted_direction = np.argmax(predictions[0], axis=1)
# predicted_recovery = np.argmax(predictions[1], axis=1)
# print(f"Predicted {len(predicted_direction)} directions and {len(predicted_recovery)} recoveries.")

# if len(post_data) != len(predicted_direction):
#     print(f"Adjusting post_data from {len(post_data)} to {len(predicted_direction)} records to match predictions.")
#     post_data = post_data.iloc[:len(predicted_direction)]

# actual_direction = le_direction.transform(post_data['direction'])
# actual_recovery = le_recovery.transform(post_data['Recovery'])

# direction_accuracy = accuracy_score(actual_direction, predicted_direction)
# recovery_accuracy = accuracy_score(actual_recovery, predicted_recovery)

# print(f"Direction Accuracy: {direction_accuracy * 100:.2f}%")
# print(f"Recovery Accuracy: {recovery_accuracy * 100:.2f}%")


# LSTM NEWS

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
file_path = '/Users/priyakundu/Downloads/USDCHF_hourly.csv'
data = pd.read_csv(file_path)

# Preprocess the data
def preprocess_data(data):
    # Encode categorical variables
    le_direction = LabelEncoder()
    le_recovery = LabelEncoder()
    le_day_type = LabelEncoder()
    
    data['direction'] = le_direction.fit_transform(data['direction'])
    data['Recovery'] = le_recovery.fit_transform(data['Recovery'])
    data['day type'] = le_day_type.fit_transform(data['day type'])
    
    # Scale numerical features
    scaler = MinMaxScaler()
    data[['vw', 'o', 'c', 'h', 'l', 'n', 'prev_close', 'daily_return', 'abs_daily_return', 'pct_change']] = scaler.fit_transform(
        data[['vw', 'o', 'c', 'h', 'l', 'n', 'prev_close', 'daily_return', 'abs_daily_return', 'pct_change']]
    )
    
    return data, le_direction, le_recovery, le_day_type, scaler

data, le_direction, le_recovery, le_day_type, scaler = preprocess_data(data)

# Prepare sequences for LSTM
def create_sequences(data, sequence_length):
    sequences = []
    for i in range(len(data) - sequence_length):
        sequence = data[i:i+sequence_length]
        sequences.append(sequence)
    return np.array(sequences)

sequence_length = 24  # 24 hours prior data
data_sequences = create_sequences(data, sequence_length)

# Separate features and labels
X = data_sequences[:, :-1, :-2]  # Features excluding 'direction' and 'Recovery'
y_direction = data_sequences[:, -1, -2]  # 'direction' label
y_recovery = data_sequences[:, -1, -1]  # 'Recovery' label

# Reshape for LSTM
X = X.reshape((X.shape[0], X.shape[1], X.shape[2]))

# Convert labels to the appropriate format
y_direction = np.array(y_direction).astype(int)
y_recovery = np.array(y_recovery).astype(int)

# Build the LSTM model
model = Sequential()
model.add(LSTM(100, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(50, return_sequences=False))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='softmax'))  # Output layer for two classes (direction and recovery)

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, np.stack((y_direction, y_recovery), axis=-1), epochs=10, batch_size=64, validation_split=0.2)

In [None]:
# Function to predict direction and recovery
def predict_direction_recovery(prior_data, model, sequence_length):
    # Preprocess prior data
    prior_data, _, _, _ = preprocess_data(prior_data)
    prior_sequences = create_sequences(prior_data.values, sequence_length)
    X_prior = prior_sequences[:, :-1, :-2]
    X_prior = X_prior.reshape((X_prior.shape[0], X_prior.shape[1], X_prior.shape[2]))
    
    predictions = model.predict(X_prior)
    direction_pred = np.argmax(predictions[:, 0])
    recovery_pred = np.argmax(predictions[:, 1])
    
    direction_pred = le_direction.inverse_transform([direction_pred])
    recovery_pred = le_recovery.inverse_transform([recovery_pred])
    
    return direction_pred, recovery_pred

direction_pred, recovery_pred = predict_direction_recovery(prior_data, model, sequence_length)

print(f'Predicted Direction: {direction_pred}')
print(f'Predicted Recovery: {recovery_pred}')

In [None]:
# Evaluation
post_data, _, _, _ = preprocess_data(post_data)
y_true_direction = post_data['direction']
y_true_recovery = post_data['Recovery']

# Visualize the predictions
plt.figure(figsize=(14, 6))

# Plot direction
plt.subplot(2, 1, 1)
plt.plot(y_true_direction, label='Actual Direction', color='blue')
plt.plot([direction_pred]*len(y_true_direction), label='Predicted Direction', color='red', linestyle='dashed')
plt.title('Direction Prediction')
plt.xlabel('Hour')
plt.ylabel('Direction')
plt.legend()

# Plot recovery
plt.subplot(2, 1, 2)
plt.plot(y_true_recovery, label='Actual Recovery', color='blue')
plt.plot([recovery_pred]*len(y_true_recovery), label='Predicted Recovery', color='red', linestyle='dashed')
plt.title('Recovery Prediction')
plt.xlabel('Hour')
plt.ylabel('Recovery')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Evaluation Metrics
accuracy_direction = accuracy_score(y_true_direction, [direction_pred]*len(y_true_direction))
precision_direction = precision_score(y_true_direction, [direction_pred]*len(y_true_direction), average='weighted')
recall_direction = recall_score(y_true_direction, [direction_pred]*len(y_true_direction), average='weighted')
f1_direction = f1_score(y_true_direction, [direction_pred]*len(y_true_direction), average='weighted')

accuracy_recovery = accuracy_score(y_true_recovery, [recovery_pred]*len(y_true_recovery))
precision_recovery = precision_score(y_true_recovery, [recovery_pred]*len(y_true_recovery), average='weighted')
recall_recovery = recall_score(y_true_recovery, [recovery_pred]*len(y_true_recovery), average='weighted')
f1_recovery = f1_score(y_true_recovery, [recovery_pred]*len(y_true_recovery), average='weighted')

print(f'Accuracy (Direction): {accuracy_direction:.4f}')
print(f'Precision (Direction): {precision_direction:.4f}')
print(f'Recall (Direction): {recall_direction:.4f}')
print(f'F1-Score (Direction): {f1_direction:.4f}')

print(f'Accuracy (Recovery): {accuracy_recovery:.4f}')
print(f'Precision (Recovery): {precision_recovery:.4f}')
print(f'Recall (Recovery): {recall_recovery:.4f}')
print(f'F1-Score (Recovery): {f1_recovery:.4f}')

In [None]:
# Print classification reports
print("Classification Report (Direction):")
print(classification_report(y_true_direction, [direction_pred]*len(y_true_direction)))
print("Classification Report (Recovery):")
print(classification_report(y_true_recovery, [recovery_pred]*len(y_true_recovery)))

# BRAND NEW

In [18]:
from keras.layers import LSTM, Dense, TimeDistributed, RepeatVector

# Load dataset
data = pd.read_csv('USDCHF_hourly.csv')

# Encode categorical columns
le_direction = LabelEncoder()
data['direction'] = le_direction.fit_transform(data['direction'])

le_recovery = LabelEncoder()
data['Recovery'] = le_recovery.fit_transform(data['Recovery'])

# Normalize the feature columns
scaler = MinMaxScaler()
feature_columns = ['vw', 'o', 'c', 'h', 'l', 'n', 'prev_close', 'daily_return', 'abs_daily_return', 'pct_change']
data[feature_columns] = scaler.fit_transform(data[feature_columns])

def create_sequences(data, feature_columns, target_columns):
    sequences = []
    targets = []

    # Get unique outlier IDs
    outlier_ids = data['outlier_id'].unique()

    for outlier_id in outlier_ids:
        # Get prior day data for the current outlier ID
        prior_data = data[(data['outlier_id'] == outlier_id) & (data['day type'] == 'prior day')]
        # Get post day data for the current outlier ID
        post_data = data[(data['outlier_id'] == outlier_id) & (data['day type'] == 'post day')]

        # Ensure there is enough data for both prior and post days
        if not prior_data.empty and not post_data.empty:
            # Extract feature sequences from prior day data
            seq = prior_data[feature_columns].values
            # Extract target sequences from post day data
            target = post_data[target_columns].values

            sequences.append(seq)
            targets.append(target)

    # Pad sequences to ensure they have the same length
    max_seq_length = max(len(seq) for seq in sequences)
    max_target_length = max(len(target) for target in targets)

    sequences_padded = pad_sequences(sequences, maxlen=max_seq_length, dtype='float32', padding='post', truncating='post')
    targets_padded = pad_sequences(targets, maxlen=max_target_length, dtype='float32', padding='post', truncating='post')

    return np.array(sequences_padded), np.array(targets_padded), max_seq_length, max_target_length

# Example usage with your data
target_columns = ['direction', 'Recovery']

X_train, y_train, max_seq_length, max_target_length = create_sequences(data, feature_columns, target_columns)

# Define the LSTM model
model = Sequential()
model.add(LSTM(128, activation='relu', input_shape=(max_seq_length, len(feature_columns))))
model.add(RepeatVector(max_target_length))
model.add(LSTM(128, activation='relu', return_sequences=True))
model.add(TimeDistributed(Dense(len(target_columns))))

model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

  super().__init__(**kwargs)


Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 971ms/step - loss: nan - val_loss: nan
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 570ms/step - loss: nan - val_loss: nan
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 500ms/step - loss: nan - val_loss: nan
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 521ms/step - loss: nan - val_loss: nan
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 490ms/step - loss: nan - val_loss: nan
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 550ms/step - loss: nan - val_loss: nan
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 495ms/step - loss: nan - val_loss: nan
Epoch 8/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 556ms/step - loss: nan - val_loss: nan
Epoch 9/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 708ms/step - loss: nan - val_l

<keras.src.callbacks.history.History at 0x2a206e510>

In [19]:
# Predict function for new data
def predict_outlier_direction_recovery(prior_data, model, scaler, le_direction, le_recovery):
    prior_data_scaled = scaler.transform(prior_data[feature_columns])
    prior_data_sequence = np.expand_dims(prior_data_scaled, axis=0)
    prediction = model.predict(prior_data_sequence)
    direction_pred = le_direction.inverse_transform(prediction[0, :, 0].astype(int))
    recovery_pred = le_recovery.inverse_transform(prediction[0, :, 1].astype(int))
    return direction_pred, recovery_pred

# Example prediction using the prior_data for testing
test_prior_data = prior_data[feature_columns].iloc[-max_seq_length:]
direction_pred, recovery_pred = predict_outlier_direction_recovery(test_prior_data, model, scaler, le_direction, le_recovery)

print("Predicted Directions:", direction_pred)
print("Predicted Recoveries:", recovery_pred)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Predicted Directions: ['Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'

  direction_pred = le_direction.inverse_transform(prediction[0, :, 0].astype(int))
  recovery_pred = le_recovery.inverse_transform(prediction[0, :, 1].astype(int))
