In [1]:
import numpy as np
import pandas as pd
import requests
from pandas.tseries.offsets import BDay

In [2]:
def calculate_daily_returns(df, prev_close=None):
    if prev_close is not None:
        df.loc[df.index[0], 'prev_close'] = prev_close
    else:
        df['prev_close'] = df['c'].shift(1)
    df['daily_return'] = (df['c'] - df['prev_close']) / df['prev_close']
    df['abs_daily_return'] = df['daily_return'].abs()
    return df

In [3]:
#get the buffer oct and novd 2009 data
# Define the start and end dates for the period
start_date = '2009-10-01'
end_date = '2009-12-31'
pair = 'C:USDCHF'
api_key = 'beBybSi8daPgsTp5yx5cHtHpYcrjp5Jq'

# Format the API endpoint
url = f"https://api.polygon.io/v2/aggs/ticker/{pair}/range/1/day/{start_date}/{end_date}?adjusted=true&sort=asc&limit=50000&apiKey={api_key}"

# Make the API request
response = requests.get(url)
data = response.json()

# Check if the request was successful
if response.status_code == 200 and 'results' in data:
    # Load data into a DataFrame
    df_2009 = pd.DataFrame(data['results'])
    # Convert timestamps
    df_2009['date'] = pd.to_datetime(df_2009['t'], unit='ms')
    df_2009.drop(columns=['t'], inplace=True)

    df_2009 = df_2009[df_2009['date'].dt.weekday < 5]
    
    # Calculating returns (using your calculate_daily_returns function)
    df_2009 = calculate_daily_returns(df_2009)

    
    df_2009['year'] = df_2009['date'].dt.year
    df_2009['day'] = df_2009['date'].dt.day_name()
    
    df_2009['is_outlier'] = 0

df_2009    

Unnamed: 0,v,vw,o,c,h,l,n,date,prev_close,daily_return,abs_daily_return,year,day,is_outlier
0,109403,1.0409,1.03458,1.04255,1.04509,1.03366,109403,2009-10-01,,,,2009,Thursday,0
1,110132,1.0375,1.04255,1.03531,1.04359,1.03087,110132,2009-10-02,1.04255,-0.006945,0.006945,2009,Friday,0
3,71818,1.0329,1.03425,1.03150,1.03568,1.03055,71818,2009-10-05,1.03531,-0.003680,0.003680,2009,Monday,0
4,86846,1.0264,1.03154,1.02664,1.03178,1.02357,86846,2009-10-06,1.03150,-0.004712,0.004712,2009,Tuesday,0
5,97209,1.0307,1.02662,1.03109,1.03583,1.02630,97209,2009-10-07,1.02664,0.004335,0.004335,2009,Wednesday,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,25929,1.0364,1.03905,1.03679,1.03947,1.03196,25929,2009-12-24,1.03905,-0.002175,0.002175,2009,Thursday,0
74,32105,1.0350,1.03659,1.03524,1.03876,1.03273,32105,2009-12-28,1.03679,-0.001495,0.001495,2009,Monday,0
75,40126,1.0332,1.03530,1.03758,1.03845,1.02780,40126,2009-12-29,1.03524,0.002260,0.002260,2009,Tuesday,0
76,41117,1.0381,1.03758,1.03613,1.04211,1.03505,41117,2009-12-30,1.03758,-0.001397,0.001397,2009,Wednesday,0


In [4]:
# Define the API key and base URL
api_key = 'beBybSi8daPgsTp5yx5cHtHpYcrjp5Jq'
full_data = pd.DataFrame()
outliers_data = pd.DataFrame()

# Define the currency pairs and years
pair = "C:USDCHF"
years = range(2010, 2024)

# Loop over each year
for year in years:
    # Format the API endpoint
    start_date = f'{year}-01-01'
    end_date = f'{year}-12-31'
    url = f"https://api.polygon.io/v2/aggs/ticker/{pair}/range/1/day/{start_date}/{end_date}?adjusted=true&sort=asc&limit=50000&apiKey={api_key}"
    
    # Make the API request
    response = requests.get(url)
    data = response.json()
    
    # Check if the request was successful
    if response.status_code == 200 and 'results' in data:
        # Load data into a DataFrame
        df = pd.DataFrame(data['results'])
        # Convert timestamps
        df['date'] = pd.to_datetime(df['t'], unit='ms')
        df.drop(columns=['t'], inplace=True)

        df = df[df['date'].dt.weekday < 5]
        # Calculating returns
        df = calculate_daily_returns(df)

        # Append the data to the full_data DataFrame for the current currency pair
        df['year'] = year
        df['day'] = df['date'].dt.day_name()
        # Find the top 10 outliers based on absolute values of the daily return value
        df['abs_daily_return'] = df['c'].abs()
        top_10_outliers = df.nlargest(10, 'abs_daily_return')
        outlier_dates = top_10_outliers['date']

        # Create a new column 'is_outlier' in the full_data DataFrame
        df['is_outlier'] = df['date'].isin(outlier_dates).astype(int)
        full_data = pd.concat([full_data, df], ignore_index=True)
        
        # Append outliers to the outliers_data DataFrame for the current currency pair
        top_10_outliers['year'] = year
        outliers_data = pd.concat([outliers_data, top_10_outliers], ignore_index=True)
        

sorted_full_data = full_data.sort_values(by="date")
sorted_outliers_data = outliers_data.sort_values(by="date")

In [5]:
sorted_full_data

Unnamed: 0,v,vw,o,c,h,l,n,date,prev_close,daily_return,abs_daily_return,year,day,is_outlier
0,69913,1.0329,1.03606,1.02891,1.041960,1.02621,69913,2010-01-04,,,1.02891,2010,Monday,0
1,76822,1.0304,1.02900,1.03341,1.035960,1.02515,76822,2010-01-05,1.02891,0.004374,1.03341,2010,Tuesday,0
2,81057,1.0321,1.03341,1.02746,1.037830,1.02480,81057,2010-01-06,1.03341,-0.005758,1.02746,2010,Wednesday,0
3,72488,1.0321,1.02744,1.03370,1.037050,1.02414,72488,2010-01-07,1.02746,0.006073,1.03370,2010,Thursday,0
4,73991,1.0311,1.03375,1.02325,1.038370,1.02151,73991,2010-01-08,1.03370,-0.010109,1.02325,2010,Friday,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3568,12916,0.8563,0.85534,0.85581,0.858300,0.85380,12916,2023-12-25,0.85570,0.000129,0.85581,2023,Monday,0
3569,112192,0.8555,0.85600,0.85372,0.857960,0.85280,112192,2023-12-26,0.85581,-0.002442,0.85372,2023,Tuesday,0
3570,174820,0.8495,0.85359,0.84250,0.854870,0.84060,174820,2023-12-27,0.85372,-0.013142,0.84250,2023,Wednesday,0
3571,196178,0.8392,0.84240,0.84337,0.845145,0.83300,196178,2023-12-28,0.84250,0.001033,0.84337,2023,Thursday,0


In [6]:
#joining both the buffer and main dataframe to create the final dataset

# Combine the historical data (df_2009) with the original dataset (sorted_full_data)
main_sorted_full_data = pd.concat([df_2009, sorted_full_data], ignore_index=True)

# Sort by date to ensure chronological order
main_sorted_full_data = main_sorted_full_data.sort_values(by='date').reset_index(drop=True)

In [7]:
main_sorted_full_data

Unnamed: 0,v,vw,o,c,h,l,n,date,prev_close,daily_return,abs_daily_return,year,day,is_outlier
0,109403,1.0409,1.03458,1.04255,1.045090,1.03366,109403,2009-10-01,,,,2009,Thursday,0
1,110132,1.0375,1.04255,1.03531,1.043590,1.03087,110132,2009-10-02,1.04255,-0.006945,0.006945,2009,Friday,0
2,71818,1.0329,1.03425,1.03150,1.035680,1.03055,71818,2009-10-05,1.03531,-0.003680,0.003680,2009,Monday,0
3,86846,1.0264,1.03154,1.02664,1.031780,1.02357,86846,2009-10-06,1.03150,-0.004712,0.004712,2009,Tuesday,0
4,97209,1.0307,1.02662,1.03109,1.035830,1.02630,97209,2009-10-07,1.02664,0.004335,0.004335,2009,Wednesday,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3633,12916,0.8563,0.85534,0.85581,0.858300,0.85380,12916,2023-12-25,0.85570,0.000129,0.855810,2023,Monday,0
3634,112192,0.8555,0.85600,0.85372,0.857960,0.85280,112192,2023-12-26,0.85581,-0.002442,0.853720,2023,Tuesday,0
3635,174820,0.8495,0.85359,0.84250,0.854870,0.84060,174820,2023-12-27,0.85372,-0.013142,0.842500,2023,Wednesday,0
3636,196178,0.8392,0.84240,0.84337,0.845145,0.83300,196178,2023-12-28,0.84250,0.001033,0.843370,2023,Thursday,0


In [8]:
sorted_outliers_data

Unnamed: 0,v,vw,o,c,h,l,n,date,prev_close,daily_return,abs_daily_return,year,day
2,116306,1.1565,1.15104,1.16024,1.16191,1.14913,116306,2010-05-24,1.149110,0.009686,1.16024,2010,Monday
5,176660,1.1625,1.16023,1.15739,1.16950,1.15178,176660,2010-05-25,1.160240,-0.002456,1.15739,2010,Tuesday
4,135028,1.1579,1.15739,1.15873,1.16230,1.15139,135028,2010-05-26,1.157390,0.001158,1.15873,2010,Wednesday
3,103597,1.1541,1.15223,1.15927,1.15929,1.14801,103597,2010-05-28,1.152140,0.006188,1.15927,2010,Friday
6,62078,1.1559,1.15743,1.15577,1.15825,1.15302,62078,2010-05-31,1.159270,-0.003019,1.15577,2010,Monday
...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,168297,0.9389,0.94201,0.93949,0.94290,0.93480,168297,2023-03-01,0.941868,-0.002524,0.93949,2023,Wednesday
132,164385,0.9419,0.93950,0.94180,0.94400,0.93890,164385,2023-03-02,0.939490,0.002459,0.94180,2023,Thursday
138,156110,0.9387,0.94197,0.93590,0.94210,0.93520,156110,2023-03-03,0.941800,-0.006265,0.93590,2023,Friday
130,172706,0.9356,0.93069,0.94223,0.94260,0.92830,172706,2023-03-07,0.930550,0.012552,0.94223,2023,Tuesday


# Technical Indicators

### 1. **10-day Moving Average (MA)**
A moving average smooths out price data by creating a constantly updated average price over a specific time period (10 days in this case). This average is typically calculated daily for the last 10 days of closing prices.

**Logic**: The 10-day MA provides a smoothed indicator of short-term price trends, reducing the noise of daily price fluctuations. It's often used to identify the direction of the trend; prices above the MA suggest a bullish trend, while prices below indicate a bearish trend.

### Calculation:
$\text{10-day MA} = \frac{\text{Sum of close prices over the last 10 days}}{10}$

In [9]:
# window = 5 # or 10 or 20

# df['10-day MA'] = df['Close'].rolling(window=window).mean()

### 2. **30-day Volatility**
Volatility measures the degree of variation in trading prices over time, typically calculated using the standard deviation of returns.

**Logic**: Higher volatility indicates higher risk but also the potential for higher returns. It is crucial for adjusting the size of trades and setting stop-loss and take-profit orders.

### Calculation:
First, compute daily returns as:
$\text{Return}_t = \frac{\text{Close Price}_t - \text{Close Price}_{t-1}}{\text{Close Price}_{t-1}}$

Then, calculate the standard deviation of these returns over the last 30 days:
$\text{30-day Volatility} = \text{Standard Deviation of Returns over the last 30 days}$

In [10]:
# # Assuming 'df' is your DataFrame and 'c' is your close price column
# df['Daily Return'] = df['c'].pct_change()  # For simple returns
# # or
# df['Log Daily Return'] = np.log(df['c'] / df['c'].shift(1))  # For logarithmic returns

# # Number of Forex trading days per year
# forex_trading_days = 260  # or a more precise count specific to your data and holidays observed

# # Calculate annualized volatility
# df['30-day Volatility'] = df['Daily Returns'].rolling(window=30).std() * np.sqrt(forex_trading_days)

### 3. **Relative Strength Index (RSI)**
RSI is a momentum oscillator that measures the speed and change of price movements on a scale of 0 to 100.

**Logic**: The RSI is used to identify overbought or oversold conditions in the trading of an asset.
- **Overbought** (typically RSI > 70) — might indicate a price drop soon.
- **Oversold** (typically RSI < 30) — might indicate a price rise soon.

### Calculation:
First, calculate the average gains and losses over the last 14 days:
$\text{RSI} = 100 - \left( \frac{100}{1 + \frac{\text{Average Gain}}{\text{Average Loss}}} \right)$

In [11]:
# def calculate_rsi(data, window=14):
#     delta = data.diff()
#     gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
#     loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()

#     rs = gain / loss
#     rsi = 100 - (100 / (1 + rs))
#     return rsi

# df['RSI'] = calculate_rsi(df['Close'])

<!-- ### 4. **Moving Average Convergence Divergence (MACD)**
MACD is a trend-following momentum indicator that shows the relationship between two moving averages of a security’s price.

**Logic**: The MACD is calculated by subtracting the 26-day Exponential Moving Average (EMA) from the 12-day EMA. The result of that subtraction is the MACD line. A nine-day EMA of the MACD called the "signal line," is then plotted on top of the MACD line, which can function as a trigger for buy and sell signals.

### Calculation:
$\text{MACD Line} = \text{12-day EMA} - \text{26-day EMA}$
$\text{Signal Line} = \text{9-day EMA of MACD Line}$ -->

In [12]:
# def calculate_macd(data, slow=26, fast=12, signal=9):
#     exp1 = data.ewm(span=fast, adjust=False).mean()
#     exp2 = data.ewm(span=slow, adjust=False).mean()
#     macd = exp1 - exp2
#     signal_line = macd.ewm(span=signal, adjust=False).mean()
#     return macd, signal_line

# df['MACD'], df['Signal Line'] = calculate_macd(df['Close'])

### 5. **Label (Next Day Price Up/Down)**
This label is used as the target variable for a predictive model, indicating whether the price will go up or down the next day relative to the current day’s closing price.

**Logic**: This is a straightforward classification problem where you predict a binary outcome; the price will either be higher or lower the following day compared to the closing price of the current day.

### Calculation:
$\text{Label} = 
  \begin{cases} 
   1 & \text{if } \text{Next Day Close} > \text{Current Day Close} \\
   0 & \text{otherwise}
  \end{cases}$

## Techinal Indicators Calculations

In [13]:
# Calculate the 10-day Moving Average (MA) starting from the first available data point in 2010
main_sorted_full_data['10-day MA'] = main_sorted_full_data['c'].rolling(window=10).mean()

forex_trading_days = 260
# Calculate the 30-day volatility (standard deviation of returns) starting from the first available data point in 2010
main_sorted_full_data['30-day Volatility'] = main_sorted_full_data['daily_return'].rolling(window=30).std() * np.sqrt(forex_trading_days)

def calculate_rsi(data, window=14):
    # Calculate the difference between the current and previous closing prices
    delta = data.diff()
    
    # Separate the gains (positive changes) and losses (negative changes)
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()

    # Calculate the Relative Strength (RS)
    rs = gain / loss
    
    # Calculate the RSI using the formula
    rsi = 100 - (100 / (1 + rs))
    return rsi

# Calculate the RSI with a 14-day window
main_sorted_full_data['RSI'] = calculate_rsi(main_sorted_full_data['c'], window=14)


# Create the 'Price_Up_Down' label based on whether the next day's price is higher (1) or lower (0)
main_sorted_full_data['Price_Up_Down'] = (main_sorted_full_data['c'].shift(-1) > main_sorted_full_data['c']).astype(int)

# Filter out data from 2009 if needed (optional)
sorted_full_data_with_indicators = main_sorted_full_data[main_sorted_full_data['date'] >= '2010-01-01'].copy()

## Final Dataset with all techinal indicators

In [14]:
sorted_full_data_with_indicators

Unnamed: 0,v,vw,o,c,h,l,n,date,prev_close,daily_return,abs_daily_return,year,day,is_outlier,10-day MA,30-day Volatility,RSI,Price_Up_Down
65,69913,1.0329,1.03606,1.02891,1.041960,1.02621,69913,2010-01-04,,,1.02891,2010,Monday,0,1.038591,,45.330339,1
66,76822,1.0304,1.02900,1.03341,1.035960,1.02515,76822,2010-01-05,1.02891,0.004374,1.03341,2010,Tuesday,0,1.037668,,51.991311,0
67,81057,1.0321,1.03341,1.02746,1.037830,1.02480,81057,2010-01-06,1.03341,-0.005758,1.02746,2010,Wednesday,0,1.035867,,38.022523,1
68,72488,1.0321,1.02744,1.03370,1.037050,1.02414,72488,2010-01-07,1.02746,0.006073,1.03370,2010,Thursday,0,1.034403,,45.471928,0
69,73991,1.0311,1.03375,1.02325,1.038370,1.02151,73991,2010-01-08,1.03370,-0.010109,1.02325,2010,Friday,0,1.032823,,30.990099,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3633,12916,0.8563,0.85534,0.85581,0.858300,0.85380,12916,2023-12-25,0.85570,0.000129,0.85581,2023,Monday,0,0.863986,0.060472,23.933565,0
3634,112192,0.8555,0.85600,0.85372,0.857960,0.85280,112192,2023-12-26,0.85581,-0.002442,0.85372,2023,Tuesday,0,0.861848,0.048946,22.794404,0
3635,174820,0.8495,0.85359,0.84250,0.854870,0.84060,174820,2023-12-27,0.85372,-0.013142,0.84250,2023,Wednesday,0,0.859001,0.060001,16.314069,1
3636,196178,0.8392,0.84240,0.84337,0.845145,0.83300,196178,2023-12-28,0.84250,0.001033,0.84337,2023,Thursday,0,0.856630,0.059782,10.057586,0


In [15]:
sorted_full_data_with_indicators.to_csv('dataset.csv')