While examining the data returned by the yfinance (Yahoo Finance) package, I found that the closing prices for some days are not correct, so this extra script fixes the data automatically. There is no need to download and fix the data manually every time. I have already emailed Yahoo Finance asking them to fix this data. If they do, this first script will no longer be needed.


In [10]:
import yfinance as yf
import pandas as pd

def download_and_fix_eurusd_close():
    """Download daily EUR/USD closes, overwrite known-bad values, save as CSV.

    Fetches the ``EURUSD=X`` ticker from 2000-01-01 onward via ``yfinance``,
    keeps only the ``Close`` column, replaces the close on a fixed set of
    dates with reference values, and writes the result to
    ``EURUSD_Close_Fixed.csv`` in the current working directory.

    Progress and every applied (or skipped) correction are printed.
    Returns nothing.
    """
    symbol = "EURUSD=X"
    csv_path = "EURUSD_Close_Fixed.csv"

    # Replacement closing prices keyed by date string. Values taken from
    # https://www.kaggle.com/datasets/lehenzehra/eurusd-daily-data-ohlc?select=EURUSD_D1_Sorted.csv
    reference_closes = {
        "2008-01-08": 1.4705,
        "2008-02-08": 1.4503,
        "2008-08-08": 1.5074,
        "2008-09-08": 1.4250,
        "2008-10-08": 1.3650,
        "2008-12-08": 1.2930,
    }

    print("Downloading EUR/USD data...")
    downloaded = yf.download(symbol, start="2000-01-01", progress=False)
    closes = downloaded[['Close']].copy()

    # yfinance may return two-level column labels; keep only the first level
    # so the column can be addressed simply as 'Close'.
    has_multi_level_columns = isinstance(closes.columns, pd.MultiIndex)
    if has_multi_level_columns:
        print("Flattening columns...")
        closes.columns = closes.columns.get_level_values(0)

    print("\nApplying corrections...")
    for day, corrected_close in reference_closes.items():
        stamp = pd.Timestamp(day)
        if stamp not in closes.index:
            # Nothing to patch for this date; report it and move on.
            print(f"Warning: {day} not found in data.")
            continue

        previous_close = closes.at[stamp, 'Close']
        print(f"Fixing {day}: Old={previous_close:.4f} -> New={corrected_close:.4f}")
        closes.at[stamp, 'Close'] = corrected_close

    closes.to_csv(csv_path)
    print(f"\nDone. Saved to {csv_path}")


# Run the download-and-correct step; it writes EURUSD_Close_Fixed.csv to the
# current working directory, which the next cell reads back.
download_and_fix_eurusd_close()

Downloading EUR/USD data...


  df = yf.download(ticker, start="2000-01-01", progress=False)


Flattening columns...

Applying corrections...
Fixing 2008-01-08: Old=1.5571 -> New=1.4705
Fixing 2008-02-08: Old=1.5571 -> New=1.4503
Fixing 2008-08-08: Old=1.5049 -> New=1.5074
Fixing 2008-09-08: Old=1.5050 -> New=1.4250
Fixing 2008-10-08: Old=1.4957 -> New=1.3650
Fixing 2008-12-08: Old=1.4918 -> New=1.2930

Done. Saved to EURUSD_Close_Fixed.csv


Load the corrected data and create a 'difference' column holding the day-over-day percentage change of the closing price.

In [11]:
# Reload the corrected closes, using the first CSV column as a parsed date index.
df = pd.read_csv("EURUSD_Close_Fixed.csv", index_col=0, parse_dates=True)

# 'difference' is the change of the close relative to the previous row,
# expressed in percent (the first row has no predecessor and is NaN).
df = df.assign(difference=df['Close'].pct_change().mul(100))

# Persist the enriched table next to the corrected closes.
df.to_csv('EURUSD_Close_Fixed_with_difference.csv')


The main script starts here. It creates the main data table used for calculating the Euclidean distance.

In [12]:
# Percentage changes as a plain list, with missing values removed.
data = df['difference'].dropna().tolist()

# Each row of the main table is one window of consecutive values:
# the first five are the feature vector (p1..p5), the sixth is the
# value observed on the following day.
window_size = 6
column_names = ['p1', 'p2', 'p3', 'p4', 'p5', 'true_value_next_day']

# Slide the window one step at a time over the series; only full windows
# are produced, so every row has exactly `window_size` entries.
main_data_rows = [
    data[start:start + window_size]
    for start in range(len(data) - window_size + 1)
]

# Assemble the 'main_data' DataFrame and number the vectors from 1.
main_data = pd.DataFrame(main_data_rows, columns=column_names)
main_data.index = pd.RangeIndex(start=1, stop=len(main_data) + 1)

# Save the "Main Data Table"; the index is written under the 'Vectors' header.
main_data.to_csv('main_data_table.csv', index_label='Vectors')
print("Saved 'main_data_table.csv'.")

Saved 'main_data_table.csv'.


After arranging the "difference" column into vectors, calculate the Euclidean distance of each vector to the base vector, and compute statistics of the positive and negative outcomes.

In [19]:
import numpy as np

# Draw 1000 random candidate vectors with 5 components each, uniformly from
# the range -2 to 2.
# NOTE: these are only printed for inspection; the analysis below uses the
# fixed `base_vector`, not the generated ones.
generated_base_vectors = np.random.uniform(low=-2.0, high=2.0, size=(1000, 5))

print("Generated Base Vectors:")
print(generated_base_vectors[:5])
base_vector = [0.45, 0.98, 0.29, 0.59, 0.38]
base_np_vector = np.array(base_vector)

# The distance of every row to the base vector does not depend on the
# threshold, so compute it once here instead of once per threshold.
feature_matrix = main_data[['p1', 'p2', 'p3', 'p4', 'p5']].to_numpy(dtype=float)
euclidean_distances = np.array(
    [np.linalg.norm(feature_vector - base_np_vector) for feature_vector in feature_matrix],
    dtype=float,
)
next_day_values = main_data['true_value_next_day'].to_numpy(dtype=float)

results_summary = []

# Sweep thresholds 0.5, 0.6, ..., 1.5 (the 1.51 stop keeps 1.5 in the range).
for current_threshold in np.arange(0.5, 1.51, 0.1):
    print(f"\nCalculating for THRESHOLD = {current_threshold:.1f}...")

    # A row is accepted when its feature vector lies strictly closer to the
    # base vector than the current threshold.
    accepted_values = next_day_values[euclidean_distances < current_threshold]
    # Stored as plain Python floats so the summary table shows bare numbers.
    accepted_forecasts = accepted_values.tolist()

    total_outcomes = len(accepted_forecasts)
    # Zero counts as a negative outcome.
    positive_outcomes = int((accepted_values > 0).sum())
    negative_outcomes = int((accepted_values <= 0).sum())

    # Guard against division by zero when nothing was accepted.
    percentage_positive = (positive_outcomes / total_outcomes) * 100 if total_outcomes > 0 else 0
    percentage_negative = (negative_outcomes / total_outcomes) * 100 if total_outcomes > 0 else 0

    results_summary.append({
        'THRESHOLD': f"{current_threshold:.1f}",
        'Total Accepted Forecasts': total_outcomes,
        'Accepted Forecasts': accepted_forecasts,
        'Positive Outcomes': positive_outcomes,
        'Negative Outcomes': negative_outcomes,
        'Percentage Positive': f"{percentage_positive:.2f}%",
        'Percentage Negative': f"{percentage_negative:.2f}%"
    })

summary_df = pd.DataFrame(results_summary)
print("\nSummary of Forecasts for Different THRESHOLDS:")
display(summary_df)


Generated Base Vectors:
[[ 0.97534702  0.57978037  0.00345735  0.06613451 -0.40081706]
 [ 0.06237488  1.93835844  1.92910193 -1.9290227  -0.0395619 ]
 [-0.08018825 -1.18866299 -1.78185288 -0.09412063 -1.35491482]
 [-1.71816049  0.75643618  0.79012925  0.15388835  1.18935306]
 [-0.75016933  0.55257421 -0.04727512  0.94986208  0.49025023]]

Calculating for THRESHOLD = 0.5...

Calculating for THRESHOLD = 0.6...

Calculating for THRESHOLD = 0.7...

Calculating for THRESHOLD = 0.8...

Calculating for THRESHOLD = 0.9...

Calculating for THRESHOLD = 1.0...

Calculating for THRESHOLD = 1.1...

Calculating for THRESHOLD = 1.2...

Calculating for THRESHOLD = 1.3...

Calculating for THRESHOLD = 1.4...

Calculating for THRESHOLD = 1.5...

Summary of Forecasts for Different THRESHOLDS:


Unnamed: 0,THRESHOLD,Total Accepted Forecasts,Accepted Forecasts,Positive Outcomes,Negative Outcomes,Percentage Positive,Percentage Negative
0,0.5,3,"[3.458648074491144, -0.8584777658985021, -0.00...",1,2,33.33%,66.67%
1,0.6,13,"[0.32685827315346305, 0.5400312922963169, 3.45...",7,6,53.85%,46.15%
2,0.7,33,"[-0.033107021021305805, 0.32685827315346305, 0...",18,15,54.55%,45.45%
3,0.8,70,"[-0.30591636296535274, -0.033107021021305805, ...",33,37,47.14%,52.86%
4,0.9,145,"[0.16895158701262236, -0.30591636296535274, 0....",64,81,44.14%,55.86%
5,1.0,291,"[0.16895158701262236, -0.30591636296535274, 0....",143,148,49.14%,50.86%
6,1.1,493,"[0.16895158701262236, -0.30591636296535274, 0....",229,264,46.45%,53.55%
7,1.2,788,"[0.24499343325119494, 0.33388176113964274, 0.1...",369,419,46.83%,53.17%
8,1.3,1166,"[0.24499343325119494, 0.27137291500392013, 0.3...",551,615,47.26%,52.74%
9,1.4,1610,"[0.24499343325119494, 0.27137291500392013, 0.4...",778,832,48.32%,51.68%


This script is optional.

In [14]:
# Acceptance cutoff for this cell. Reuse THRESHOLD if an earlier cell already
# defined it; otherwise fall back to 1.3 so the cell does not fail with a
# NameError on a fresh run.
# NOTE(review): 1.3 is an assumption. It is consistent with the recorded
# output of this cell (distance 1.2276 accepted, 1.3590 rejected) — confirm
# the intended value.
THRESHOLD = globals().get('THRESHOLD', 1.3)

# Pull the five feature columns out once instead of re-slicing every row.
feature_matrix = main_data[['p1', 'p2', 'p3', 'p4', 'p5']].to_numpy(dtype=float)

results = []

for index, feature_vector, forecast in zip(
    main_data.index, feature_matrix, main_data['true_value_next_day']
):
    # Distance of this row's feature vector to the fixed base vector.
    euclidean_distance = np.linalg.norm(feature_vector - base_np_vector)

    # Rows strictly closer than the threshold are accepted.
    status = 'ACCEPTED' if euclidean_distance < THRESHOLD else 'Rejected'

    results.append({
        'Vectors': index,
        'Euclidean Dist': euclidean_distance,
        'Forecast': forecast,
        'Status': status
    })

# One row per vector, indexed by the vector number, written out as CSV.
output_df = pd.DataFrame(results).set_index('Vectors')
output_df.to_csv('output_table.csv', index=True)
print("Generated 'output_table.csv'.")
display(output_df.head())

Generated 'output_table.csv'.


Unnamed: 0_level_0,Euclidean Dist,Forecast,Status
Vectors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.153922,0.244993,ACCEPTED
2,1.501189,-0.481539,Rejected
3,1.227564,0.271373,ACCEPTED
4,1.359043,0.449403,Rejected
5,1.12862,0.333882,ACCEPTED


In [15]:
# Count the rows of the output table whose status is 'Rejected'.
rejected_forecasts_count = int(output_df['Status'].eq('Rejected').sum())
print(f"Number of Rejected Forecasts: {rejected_forecasts_count}")

Number of Rejected Forecasts: 4551
