While inspecting the data returned by the Yahoo Finance package (yfinance), I found that the closing prices for some days are incorrect. This extra script fixes those values, so there is no need to download and correct the data manually every time. I have already emailed Yahoo Finance asking them to fix the data; if they do, this first script will no longer be needed.


In [2]:
import yfinance as yf
import pandas as pd

def download_and_fix_eurusd_close(
    start="2000-01-01",
    output_file="EURUSD_Close_Fixed.csv",
    corrections=None,
):
    """Download daily EUR/USD closes, patch known bad values, and save to CSV.

    Progress is reported with ``print``. Nothing is returned; the result is
    the CSV file written to ``output_file``.

    Args:
        start: First date to request (YYYY-MM-DD), passed to ``yf.download``.
        output_file: Path of the CSV file to write.
        corrections: Optional mapping of date string (YYYY-MM-DD) to the
            corrected closing price. When omitted, the built-in table of
            2008 corrections below is used.

    Raises:
        RuntimeError: If the download returns no rows, so that an empty CSV
            is never written for later steps to pick up silently.
    """
    if corrections is None:
        # Key: Date (YYYY-MM-DD), Value: Corrected Close Price
        # Taken from https://www.kaggle.com/datasets/lehenzehra/eurusd-daily-data-ohlc?select=EURUSD_D1_Sorted.csv
        # NOTE(review): the run output recorded in this notebook shows 1.4513
        # for 2008-02-08, not 1.4503 -- confirm which value matches the source.
        corrections = {
            "2008-01-08": 1.4705,
            "2008-02-08": 1.4503,
            "2008-08-08": 1.5074,
            "2008-09-08": 1.4250,
            "2008-10-08": 1.3650,
            "2008-12-08": 1.2930,
        }

    # 1. Download EUR/USD data
    print("Downloading EUR/USD data...")
    ticker = "EURUSD=X"
    df = yf.download(ticker, start=start, progress=False)

    # Fail loudly on an empty result instead of "fixing" nothing and saving
    # an empty file.
    if df.empty:
        raise RuntimeError(
            f"No data returned for {ticker} from {start}; nothing to fix or save."
        )

    # Only the closing price is kept.
    df = df[['Close']].copy()

    # Flatten columns if multi-index (common in yfinance)
    if isinstance(df.columns, pd.MultiIndex):
        print("Flattening columns...")
        df.columns = df.columns.get_level_values(0)

    print("\nApplying corrections...")

    # 2. Overwrite the close on every listed date that exists in the index;
    # dates missing from the download are reported, not treated as errors.
    for date_str, price in corrections.items():
        dt = pd.Timestamp(date_str)
        if dt in df.index:
            print(f"Fixing {date_str}: Old={df.at[dt, 'Close']:.4f} -> New={price:.4f}")
            df.at[dt, 'Close'] = price
        else:
            print(f"Warning: {date_str} not found in data.")

    # 3. Save to CSV
    df.to_csv(output_file)
    print(f"\nDone. Saved to {output_file}")


# Run the download-and-fix step (prints its progress and writes the corrected CSV).
download_and_fix_eurusd_close()

Downloading EUR/USD data...


  df = yf.download(ticker, start="2000-01-01", progress=False)


Flattening columns...

Applying corrections...
Fixing 2008-01-08: Old=1.5571 -> New=1.4705
Fixing 2008-02-08: Old=1.5571 -> New=1.4513
Fixing 2008-08-08: Old=1.5049 -> New=1.5074
Fixing 2008-09-08: Old=1.5050 -> New=1.4250
Fixing 2008-10-08: Old=1.4957 -> New=1.3650
Fixing 2008-12-08: Old=1.4918 -> New=1.2930

Done. Saved to EURUSD_Close_Fixed.csv


Load the corrected data and add a 'difference' column: the day-over-day percentage change of the closing price.

In [21]:
# Reload the corrected closes; the first CSV column becomes the index and is
# parsed as dates.
df = pd.read_csv("EURUSD_Close_Fixed.csv", index_col=0, parse_dates=True)

# Day-over-day percentage change of the close. The first row has no previous
# close, so its value is NaN.
df['difference'] = df['Close'].pct_change() * 100

df.to_csv('EURUSD_Close_Fixed_with_difference.csv')

# Preview last: a bare expression in the middle of a cell is evaluated and
# discarded, so the notebook only renders it as the final statement.
df.head()


The main script starts here. It creates the main data table used for calculating the Euclidean distance.

In [12]:
# Percentage changes as a plain list; dropna() removes undefined entries
# (e.g. the first row, which has no previous close).
data = df['difference'].dropna().tolist()

# Each window holds `window_size` consecutive changes: the leading
# `window_size - 1` values are the features, the last one is the value to
# predict. The feature count is derived so the two can never disagree.
window_size = 6
n_features = window_size - 1

# One row per window start, sliding forward one observation at a time.
# If there are fewer than `window_size` observations the range is empty and
# the table simply has no rows.
main_data_rows = [
    data[start : start + window_size]
    for start in range(len(data) - window_size + 1)
]

# Column names: p1..pN for the features, then the target column.
column_names = [f'p{j + 1}' for j in range(n_features)] + ['true_value_next_day']

# Create the 'main_data' DataFrame
main_data = pd.DataFrame(main_data_rows, columns=column_names)

# Number the vectors from 1 instead of pandas' default 0.
main_data.index = range(1, len(main_data) + 1)

# Saving "Main Data Table"
main_data.to_csv('main_data_table.csv', index_label='Vectors')
print("Saved 'main_data_table.csv'.")

Saved 'main_data_table.csv'.


After arranging the "difference" column into vectors, calculate the Euclidean distance of each vector from the base vector, then compute statistics on the positive and negative outcomes.

In [33]:
import numpy as np
import pandas as pd

# Reference vector that every five-value feature vector is compared against,
# and the distance below which a vector is accepted.
base_vector = [0.45, 0.98, 0.29, 0.59, 0.38]
THRESHOLD = 1.3

base_np_vector = np.array(base_vector)

print("Calculating Euclidean distances and filtering forecasts...")

# Distance of every row to the base vector in one vectorized call, instead of
# looping over DataFrame.iterrows() and building a fresh array per row.
feature_matrix = main_data[['p1', 'p2', 'p3', 'p4', 'p5']].to_numpy(dtype=float)
distances = np.linalg.norm(feature_matrix - base_np_vector, axis=1)

# Keep the next-day value of every vector strictly closer than THRESHOLD.
accepted_forecasts = main_data.loc[distances < THRESHOLD, 'true_value_next_day'].tolist()

# Accepted values as a table numbered from 1, saved alongside the other CSVs.
accepted_forecasts_df = pd.DataFrame(accepted_forecasts, columns=['Accepted'])
accepted_forecasts_df.index = range(1, len(accepted_forecasts_df) + 1)

accepted_forecasts_df.to_csv('accepted_forecasts.csv', index_label='Vectors')

print(f"\nTotal accepted forecasts: {len(accepted_forecasts_df)}")
print("Accepted Forecasts Table:")
display(accepted_forecasts_df.head())

# Explicit dtype keeps the comparisons well-defined when nothing was accepted.
accepted_series = pd.Series(accepted_forecasts, dtype=float)

# Zero-change outcomes are counted on the negative side (`<= 0`), so the two
# counts always add up to the total.
positive_outcomes = (accepted_series > 0).sum()
negative_outcomes = (accepted_series <= 0).sum()

total_outcomes = len(accepted_series)

if total_outcomes > 0:
    percentage_positive = (positive_outcomes / total_outcomes) * 100
    percentage_negative = (negative_outcomes / total_outcomes) * 100
    print(f"\nPercentage of Positive Outcomes: {percentage_positive:.2f}%")
    print(f"Percentage of Negative Outcomes: {percentage_negative:.2f}%")
    print(f"Number of Positive Accepted Forecasts: {positive_outcomes}")
    print(f"Number of Negative Accepted Forecasts: {negative_outcomes}")
else:
    # Avoid dividing by zero when no vector passed the threshold.
    print("\nNo accepted forecasts to calculate percentages.")

Calculating Euclidean distances and filtering forecasts...

Total accepted forecasts: 1166
Accepted Forecasts Table:


Unnamed: 0,Accepted
1,0.244993
2,0.271373
3,0.333882
4,0.649868
5,0.168952



Percentage of Positive Outcomes: 47.26%
Percentage of Negative Outcomes: 52.74%
Number of Positive Accepted Forecasts: 551
Number of Negative Accepted Forecasts: 615


This script is optional.

In [28]:
# Distance of every vector to the base vector, computed in one vectorized
# call rather than row by row through DataFrame.iterrows().
distances = np.linalg.norm(
    main_data[['p1', 'p2', 'p3', 'p4', 'p5']].to_numpy(dtype=float) - base_np_vector,
    axis=1,
)

# One record per vector: its label, distance, next-day value, and whether the
# distance is strictly below THRESHOLD.
results = [
    {
        'Vectors': vector_id,
        'Euclidean Dist': distance,
        'Forecast': forecast,
        'Status': 'ACCEPTED' if distance < THRESHOLD else 'Rejected',
    }
    for vector_id, distance, forecast in zip(
        main_data.index, distances, main_data['true_value_next_day']
    )
]

# Naming the columns explicitly keeps 'Vectors' present even when there are
# no records, so set_index() cannot fail on an empty table.
output_df = pd.DataFrame(
    results,
    columns=['Vectors', 'Euclidean Dist', 'Forecast', 'Status'],
).set_index('Vectors')
output_df.to_csv('output_table.csv', index=True)
print("Generated 'output_table.csv'.")
display(output_df.head())

Generated 'output_table.csv'.


Unnamed: 0_level_0,Euclidean Dist,Forecast,Status
Vectors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.153922,0.244993,ACCEPTED
2,1.501189,-0.481539,Rejected
3,1.227564,0.271373,ACCEPTED
4,1.359043,0.449403,Rejected
5,1.12862,0.333882,ACCEPTED


In [32]:
# Re-print the accepted-forecast counts computed earlier, one per line.
print(
    f"Number of Positive Accepted Forecasts: {positive_outcomes}",
    f"Number of Negative Accepted Forecasts: {negative_outcomes}",
    sep="\n",
)

Number of Positive Accepted Forecasts: 551
Number of Negative Accepted Forecasts: 615


In [29]:
# Count the rows whose Status is exactly 'Rejected' by summing the boolean
# match mask; int() keeps the result a plain Python integer.
rejected_forecasts_count = int((output_df['Status'] == 'Rejected').sum())
print(f"Number of Rejected Forecasts: {rejected_forecasts_count}")

Number of Rejected Forecasts: 4551
