After observing the yahoo finance package, found some days' closing price are not correct. So, the extra script help to fix the data. There is no need to download and fix manully everytime. I already email to the yahoo finance to fix this data. If they fix, then there will be no use of this first script in future.


In [1]:
import yfinance as yf
import pandas as pd

def download_and_fix_eurusd_close():
    # Download EUR/USD data
    print("Downloading EUR/USD data...")
    ticker = "EURUSD=X"
    df = yf.download(ticker, start="2000-01-01", progress=False)

    df = df[['Close']].copy()

    # Flatten columns if multi-index (common in yfinance)
    if isinstance(df.columns, pd.MultiIndex):
        print("Flattening columns...")
        df.columns = df.columns.get_level_values(0)

    # Taken from https://www.kaggle.com/datasets/lehenzehra/eurusd-daily-data-ohlc?select=EURUSD_D1_Sorted.csv
    corrections = {
        "2008-01-08": 1.4705,
        "2008-02-08": 1.4503,
        "2008-08-08": 1.5074,
        "2008-09-08": 1.4250,
        "2008-10-08": 1.3650,
        "2008-12-08": 1.2930,
    }

    print("\nApplying corrections...")

    # Apply corrections
    for date_str, price in corrections.items():
        dt = pd.Timestamp(date_str)
        if dt in df.index:
            print(f"Fixing {date_str}: Old={df.at[dt, 'Close']:.4f} -> New={price:.4f}")
            df.at[dt, 'Close'] = price
        else:
            print(f"Warning: {date_str} not found in data.")


    # Save to CSV
    output_file = "EURUSD_Close_Fixed.csv"
    df.to_csv(output_file)
    print(f"\nDone. Saved to {output_file}")


download_and_fix_eurusd_close()

Downloading EUR/USD data...


  df = yf.download(ticker, start="2000-01-01", progress=False)


Flattening columns...

Applying corrections...
Fixing 2008-01-08: Old=1.5571 -> New=1.4705
Fixing 2008-02-08: Old=1.5571 -> New=1.4503
Fixing 2008-08-08: Old=1.5049 -> New=1.5074
Fixing 2008-09-08: Old=1.5050 -> New=1.4250
Fixing 2008-10-08: Old=1.4957 -> New=1.3650
Fixing 2008-12-08: Old=1.4918 -> New=1.2930

Done. Saved to EURUSD_Close_Fixed.csv


Load the corrected data and make a 'difference' percentafge column based on the closing price.

In [2]:
df = pd.read_csv("EURUSD_Close_Fixed.csv", index_col=0, parse_dates=True)
df['difference'] = df['Close'].pct_change() * 100
df.head()
df.to_csv('EURUSD_Close_Fixed_with_difference.csv')


The main script started from here. Creating main data table for calculating Euclidean Distane.

In [3]:
data = df['difference'].dropna().tolist()

main_data_rows = []
window_size = 6

# Iterate through the data to create feature vectors and target values
for i in range(len(data) - window_size + 1):
    row = data[i : i + window_size]
    feature_vector = row[:5]
    true_value = row[5]
    main_data_rows.append(feature_vector + [true_value])

# Create column names for the new DataFrame
column_names = [f'p{j+1}' for j in range(5)] + ['true_value_next_day']

# Create the 'main_data' DataFrame
main_data = pd.DataFrame(main_data_rows, columns=column_names)

# Set the DataFrame index to start from 1
main_data.index = range(1, len(main_data) + 1)

#Saving "Main Data Table"
main_data.to_csv('main_data_table.csv', index_label='Vectors')
print("Saved 'main_data_table.csv'.")

Saved 'main_data_table.csv'.


After forming the "difference" column as vectors, calcultaing the Euclidean Distance. And do statics of the positive and negative outcomes.

In [19]:
import numpy as np
import pandas as pd
from datetime import datetime

#Create base_vector range -2 to 2, each time take 5 datapoints for a vectors.
generated_base_vectors = np.random.uniform(low=-2.0, high=2.0, size=(5, 5))


final_results = []
start_time = datetime.now()
print(f"Started at => {start_time}")
print(f"\n-----Calculating Euclidean Distance-----\n")
print("T: ", end="")
for current_threshold in np.arange(0.5, 1.51, 0.1):
    print(f"--> {current_threshold:.1f}", end="")
    accepted_results_list = []

    # Iterate through each generated base vector
    for gen_base_vec_idx in range(generated_base_vectors.shape[0]):
        current_base_vector = generated_base_vectors[gen_base_vec_idx]
        base_vectors_for_each_threshold = []

        for index, row in main_data.iterrows():
            feature_vector = np.array(row[['p1', 'p2', 'p3', 'p4', 'p5']].tolist())
            euclidean_distance = np.linalg.norm(feature_vector - current_base_vector)
            v_target = 0;

            if euclidean_distance < current_threshold:
                v_target = row['true_value_next_day']
                if current_base_vector.tolist() not in base_vectors_for_each_threshold:
                    base_vectors_for_each_threshold.append(current_base_vector.tolist())

                accepted_results_list.append({
                    'r1': feature_vector.tolist(),
                    'v_target': v_target,
              })

        if accepted_results_list and base_vectors_for_each_threshold: # Check if the list is not empty
            accepted_results_df = pd.DataFrame(accepted_results_list)
            positive_count = (accepted_results_df['v_target'] > 0).sum()
            negative_count = (accepted_results_df['v_target'] <= 0).sum()
            positive_percentage=(positive_count/len(accepted_results_df))*100
            negative_percentage=(negative_count/len(accepted_results_df))*100
            r1_values = accepted_results_df['r1']
            v_target_values = accepted_results_df['v_target']
            
            final_results.append({
                'r1': r1_values.tolist(),
                'v_target': v_target_values.tolist(),
                'B': base_vectors_for_each_threshold,
                'threshold': f"{current_threshold:.1f}",
                '% positive_Vtarget': f"{positive_percentage:.2f}",
                '% negative_Vtarget': f"{negative_percentage:.2f}"
            })

final_result_df = pd.DataFrame(final_results)
print("\nResults Different THRESHOLDS:")
final_result_df.to_csv('final_result_df.csv', index= True)
end_time = datetime.now()
print(f"Finished at => {end_time}")
print(f"Total time take => {end_time - start_time}")

Started at => 2025-12-27 00:19:41.350156

-----Calculating Euclidean Distance-----

T: --> 0.5--> 0.6--> 0.7--> 0.8--> 0.9--> 1.0--> 1.1--> 1.2--> 1.3--> 1.4--> 1.5
Results Different THRESHOLDS:
Finished at => 2025-12-27 00:22:24.305883
Total time take => 0:02:42.955727


In [17]:
import ast
final_result_df['count'] = final_result_df['v_target'].apply(lambda x: len(ast.literal_eval(x)))
final_result_df[['v_target', 'count']].head()

ValueError: malformed node or string: [0.3564305352885766, 0.3739182328805768]