After observing the yahoo finance package, found some days' closing price are not correct. So, the extra script help to fix the data. There is no need to download and fix manully everytime. I already email to the yahoo finance to fix this data. If they fix, then there will be no use of this first script in future.


In [1]:
import yfinance as yf
import pandas as pd

def download_and_fix_eurusd_close():
    # Download EUR/USD data
    print("Downloading EUR/USD data...")
    ticker = "EURUSD=X"
    df = yf.download(ticker, start="2000-01-01", progress=False)

    df = df[['Close']].copy()

    # Flatten columns if multi-index (common in yfinance)
    if isinstance(df.columns, pd.MultiIndex):
        print("Flattening columns...")
        df.columns = df.columns.get_level_values(0)

    # Taken from https://www.kaggle.com/datasets/lehenzehra/eurusd-daily-data-ohlc?select=EURUSD_D1_Sorted.csv
    corrections = {
        "2008-01-08": 1.4705,
        "2008-02-08": 1.4503,
        "2008-08-08": 1.5074,
        "2008-09-08": 1.4250,
        "2008-10-08": 1.3650,
        "2008-12-08": 1.2930,
    }

    print("\nApplying corrections...")

    # Apply corrections
    for date_str, price in corrections.items():
        dt = pd.Timestamp(date_str)
        if dt in df.index:
            print(f"Fixing {date_str}: Old={df.at[dt, 'Close']:.4f} -> New={price:.4f}")
            df.at[dt, 'Close'] = price
        else:
            print(f"Warning: {date_str} not found in data.")


    # Save to CSV
    output_file = "EURUSD_Close_Fixed.csv"
    df.to_csv(output_file)
    print(f"\nDone. Saved to {output_file}")


download_and_fix_eurusd_close()

Downloading EUR/USD data...


  df = yf.download(ticker, start="2000-01-01", progress=False)


Flattening columns...

Applying corrections...
Fixing 2008-01-08: Old=1.5571 -> New=1.4705
Fixing 2008-02-08: Old=1.5571 -> New=1.4503
Fixing 2008-08-08: Old=1.5049 -> New=1.5074
Fixing 2008-09-08: Old=1.5050 -> New=1.4250
Fixing 2008-10-08: Old=1.4957 -> New=1.3650
Fixing 2008-12-08: Old=1.4918 -> New=1.2930

Done. Saved to EURUSD_Close_Fixed.csv


Load the corrected data and make a 'difference' percentafge column based on the closing price.

In [2]:
df = pd.read_csv("EURUSD_Close_Fixed.csv", index_col=0, parse_dates=True)
df['difference'] = df['Close'].pct_change() * 100
df.head()
df.to_csv('EURUSD_Close_Fixed_with_difference.csv')


The main script started from here. Creating main data table for calculating Euclidean Distane.

In [3]:
data = df['difference'].dropna().tolist()

main_data_rows = []
window_size = 6

# Iterate through the data to create feature vectors and target values
for i in range(len(data) - window_size + 1):
    row = data[i : i + window_size]
    feature_vector = row[:5]
    true_value = row[5]
    main_data_rows.append(feature_vector + [true_value])

# Create column names for the new DataFrame
column_names = [f'p{j+1}' for j in range(5)] + ['true_value_next_day']

# Create the 'main_data' DataFrame
main_data = pd.DataFrame(main_data_rows, columns=column_names)

# Set the DataFrame index to start from 1
main_data.index = range(1, len(main_data) + 1)

#Saving "Main Data Table"
main_data.to_csv('main_data_table.csv', index_label='Vectors')
print("Saved 'main_data_table.csv'.")

Saved 'main_data_table.csv'.


After forming the "difference" column as vectors, calcultaing the Euclidean Distance. And do statics of the positive and negative outcomes.

In [4]:
import numpy as np
from datetime import datetime

#Create base_vector range -2 to 2, each time take 5 datapoints for a vectors.
generated_base_vectors = np.random.uniform(low=-2.0, high=2.0, size=(100, 5))


all_results = []
start_time = datetime.now()
print(f"Started at => {start_time}")
print(f"\n-----Calculating Euclidean Distance-----\n")
print("T: ", end="")
for current_threshold in np.arange(0.5, 1.51, 0.1):
    print(f"--> {current_threshold:.1f}", end="")

    # Iterate through each generated base vector
    for gen_base_vec_idx in range(generated_base_vectors.shape[0]):
        current_base_np_vector = generated_base_vectors[gen_base_vec_idx]

        for index, row in main_data.iterrows():
            feature_vector = np.array(row[['p1', 'p2', 'p3', 'p4', 'p5']].tolist())
            euclidean_distance = np.linalg.norm(feature_vector - current_base_np_vector)
            accepted_forecasts_for_this_base_vector = 0;

            if euclidean_distance < current_threshold:
                accepted_forecasts_for_this_base_vector = row['true_value_next_day']

                all_results.append({
                  'THRESHOLD': f"{current_threshold:.1f}",
                  'base_vector': current_base_np_vector,
                  'ED': euclidean_distance,
                  'accepted_forecast': accepted_forecasts_for_this_base_vector,
              })

result_df = pd.DataFrame(all_results)
print("\nResults Different THRESHOLDS:")
display(result_df)
result_df.to_csv('overall_reuslt.csv', index= True)
end_time = datetime.now()
print(f"Finished at => {end_time}")
print(f"Total time take => {end_time - start_time}")


Started at => 2025-12-22 19:27:04.792285

-----Calculating Euclidean Distance-----

T: --> 0.5--> 0.6--> 0.7--> 0.8--> 0.9--> 1.0--> 1.1--> 1.2--> 1.3--> 1.4--> 1.5
Results Different THRESHOLDS:


Unnamed: 0,THRESHOLD,base_vector,ED,accepted_forecast
0,0.5,"[-1.1618211092797597, 0.5759101042389605, 0.33...",0.464779,0.456565
1,0.5,"[-1.1618211092797597, 0.5759101042389605, 0.33...",0.386108,-0.013013
2,0.5,"[-1.1618211092797597, 0.5759101042389605, 0.33...",0.441916,-0.200164
3,0.5,"[0.3978989842404057, 0.468680234458994, 1.4019...",0.440527,0.401716
4,0.5,"[0.40954505081404813, 0.20723631841991796, -0....",0.489341,-0.220254
...,...,...,...,...
53201,1.5,"[-1.1997770987183145, 1.4877499677687864, -0.9...",1.349040,2.377235
53202,1.5,"[-1.1997770987183145, 1.4877499677687864, -0.9...",1.099267,0.387426
53203,1.5,"[-1.1997770987183145, 1.4877499677687864, -0.9...",1.491012,-0.464867
53204,1.5,"[-1.1997770987183145, 1.4877499677687864, -0.9...",1.391808,0.053689


Finished at => 2025-12-22 20:21:21.490760
Total time take => 0:54:16.698475


Findind the all accepeted forecasts based on the threshold. Also find the best base vector (B) which occurs most time.

In [5]:
aggregated_results = result_df.groupby('THRESHOLD').agg(
    total_accepted_forecasts=('accepted_forecast', 'count'),
    positive_forecasts=('accepted_forecast', lambda x: (x > 0).sum()),
    negative_forecasts=('accepted_forecast', lambda x: (x < 0).sum()),
    p_outcomes = ('accepted_forecast', lambda x: (x > 0).sum() / x.count() * 100 if x.count() > 0 else 0),
    n_outcomes = ('accepted_forecast', lambda x: (x < 0).sum() / x.count() * 100 if x.count() > 0 else 0)
).reset_index()

# Round P Outcomes and N Outcomes to two decimal places
aggregated_results['p_outcomes'] = aggregated_results['p_outcomes'].round(2)
aggregated_results['n_outcomes'] = aggregated_results['n_outcomes'].round(2)

# Convert numpy arrays in 'base_vector' to tuples to make them hashable for groupby
result_df['base_vector_tuple'] = result_df['base_vector'].apply(tuple)

# To find the 'best' base_vector for each threshold based on accepted forecasts,
base_vector_counts = result_df.groupby(['THRESHOLD', 'base_vector_tuple']).size().reset_index(name='count')

# Then, for each THRESHOLD, we find the base_vector that has the maximum 'count'
best_base_vectors_pt = base_vector_counts.loc[base_vector_counts.groupby('THRESHOLD')['count'].idxmax()]


# Rename the 'base_vector_tuple' column back to 'best_base_vector' and include the 'count' for merging
best_base_vectors_pt = best_base_vectors_pt[['THRESHOLD', 'base_vector_tuple', 'count']].rename(columns={'base_vector_tuple': 'best_base_vector', 'count': 'Occurrence Count'})

agg_results_df = pd.merge(aggregated_results, best_base_vectors_pt, on='THRESHOLD', how='left')

# Convert the 'best_base_vector' column to a cleaner string representation for CSV saving
agg_results_df['best_base_vector'] = agg_results_df['best_base_vector'].apply(
    lambda x: '[' + ', '.join([f'{val}' for val in x]) + ']'
)

column_summary = ["THRESHOLD", "Total Accepted Forecasts", "Positive", "Negative","P Outcomes", "N Outcomes", "Best Base Vector", "Occurrence Count"]
print("Aggregated Results by THRESHOLD (with Best Base Vector):")
agg_results_df.columns = column_summary
display(agg_results_df)
agg_results_df.to_csv('aggregated_results_with_best_vector.csv', index=False)

# Find the overall best THRESHOLD based on the most total accepted forecasts
best_threshold_row = agg_results_df.loc[agg_results_df['Total Accepted Forecasts'].idxmax()]

print("\nOverall Best THRESHOLD based on most accepted forecasts:")
display(best_threshold_row)

Aggregated Results by THRESHOLD (with Best Base Vector):


Unnamed: 0,THRESHOLD,Total Accepted Forecasts,Positive,Negative,P Outcomes,N Outcomes,Best Base Vector,Occurrence Count
0,0.5,36,17,19,47.22,52.78,"[-0.48553848415727874, -0.6257227693397263, 0....",9
1,0.6,102,47,55,46.08,53.92,"[-0.48553848415727874, -0.6257227693397263, 0....",25
2,0.7,268,127,141,47.39,52.61,"[-0.48553848415727874, -0.6257227693397263, 0....",68
3,0.8,595,289,304,48.57,51.09,"[-0.48553848415727874, -0.6257227693397263, 0....",154
4,0.9,1187,583,600,49.12,50.55,"[-0.48553848415727874, -0.6257227693397263, 0....",296
5,1.0,2057,1013,1035,49.25,50.32,"[-0.48553848415727874, -0.6257227693397263, 0....",481
6,1.1,3415,1686,1716,49.37,50.25,"[-0.48553848415727874, -0.6257227693397263, 0....",739
7,1.2,5562,2749,2795,49.42,50.25,"[-0.48553848415727874, -0.6257227693397263, 0....",1105
8,1.3,8631,4209,4393,48.77,50.9,"[-0.48553848415727874, -0.6257227693397263, 0....",1556
9,1.4,12863,6304,6513,49.01,50.63,"[-0.48553848415727874, -0.6257227693397263, 0....",2028



Overall Best THRESHOLD based on most accepted forecasts:


THRESHOLD                                                                 1.5
Total Accepted Forecasts                                                18490
Positive                                                                 9063
Negative                                                                 9355
P Outcomes                                                              49.02
N Outcomes                                                              50.59
Best Base Vector            [-0.48553848415727874, -0.6257227693397263, 0....
Occurrence Count                                                         2530
Name: 10, dtype: object