After observing the yahoo finance package, found some days' closing price are not correct. So, the extra script help to fix the data. There is no need to download and fix manully everytime. I already email to the yahoo finance to fix this data. If they fix, then there will be no use of this first script in future.


In [1]:
import yfinance as yf
import pandas as pd

def download_and_fix_eurusd_close():
    # Download EUR/USD data
    print("Downloading EUR/USD data...")
    ticker = "EURUSD=X"
    df = yf.download(ticker, start="2000-01-01", end="2023-12-31", progress=False)

    df = df[['Close']].copy()

    # Flatten columns if multi-index (common in yfinance)
    if isinstance(df.columns, pd.MultiIndex):
        print("Flattening columns...")
        df.columns = df.columns.get_level_values(0)

    # Taken from https://www.kaggle.com/datasets/lehenzehra/eurusd-daily-data-ohlc?select=EURUSD_D1_Sorted.csv
    corrections = {
        "2008-01-08": 1.4705,
        "2008-02-08": 1.4503,
        "2008-08-08": 1.5074,
        "2008-09-08": 1.4250,
        "2008-10-08": 1.3650,
        "2008-12-08": 1.2930,
    }

    print("\nApplying corrections...")

    # Apply corrections
    for date_str, price in corrections.items():
        dt = pd.Timestamp(date_str)
        if dt in df.index:
            print(f"Fixing {date_str}: Old={df.at[dt, 'Close']:.4f} -> New={price:.4f}")
            df.at[dt, 'Close'] = price
        else:
            print(f"Warning: {date_str} not found in data.")


    # Save to CSV
    output_file = "../results/EURUSD_Close_Fixed_2023.csv"
    df.to_csv(output_file)
    print(f"\nDone. Saved to {output_file}")


download_and_fix_eurusd_close()

Downloading EUR/USD data...
Flattening columns...

Applying corrections...
Fixing 2008-01-08: Old=1.5571 -> New=1.4705
Fixing 2008-02-08: Old=1.5571 -> New=1.4503
Fixing 2008-08-08: Old=1.5049 -> New=1.5074
Fixing 2008-09-08: Old=1.5050 -> New=1.4250
Fixing 2008-10-08: Old=1.4957 -> New=1.3650
Fixing 2008-12-08: Old=1.4918 -> New=1.2930

Done. Saved to ../results/EURUSD_Close_Fixed_2023.csv


Load the corrected data and make a 'difference' percentafge column based on the closing price.

In [2]:
df = pd.read_csv("../results/EURUSD_Close_Fixed_2023.csv", index_col=0, parse_dates=True)
df['difference'] = df['Close'].pct_change() * 100
df.head()
df.to_csv('../results/EURUSD_Close_Fixed_with_difference_2023.csv')


The main script started from here. Creating main data table for calculating Euclidean Distane.

In [3]:
data = df['difference'].dropna().tolist()

main_data_rows = []
window_size = 6

# Iterate through the data to create feature vectors and target values
for i in range(len(data) - window_size + 1):
    row = data[i : i + window_size]
    feature_vector = row[:5]
    true_value = row[5]
    main_data_rows.append(feature_vector + [true_value])

# Create column names for the new DataFrame
column_names = [f'p{j+1}' for j in range(5)] + ['true_value_next_day']

# Create the 'main_data' DataFrame
main_data = pd.DataFrame(main_data_rows, columns=column_names)

# Set the DataFrame index to start from 1
main_data.index = range(1, len(main_data) + 1)

#Saving "Main Data Table"
main_data.to_csv('../results/main_data_table_2023.csv', index_label='Vectors')
print("Saved 'main_data_table_2023.csv'.")

Saved 'main_data_table_2023.csv'.


After forming the "difference" column as vectors, calcultaing the Euclidean Distance. And do statics of the positive and negative outcomes.

In [4]:
def writeInText(accepted_results_df, B, threshold, positive_percentage, negative_percentage):
  with open('../results/final_result_df_2023.txt', 'a') as f:
    f.write("L set:\n")
    for index, row in accepted_results_df.iterrows():
        # Assuming column 0 is the feature vector list and column 1 is v_target
        feature_vector_str = ', '.join(map(str, row[0]))
        v_target_val = row[1]
        f.write(f"[{feature_vector_str}, {v_target_val}]\n")

    f.write("B vector:\n")
    # Explicitly format B as comma-separated string without spaces
    f.write(f"[{','.join(map(str, B))}]")

    f.write("\nτ value:\n")
    f.write(f"{threshold:.1f}")

    f.write("\n% of positive vtarget:\n")
    f.write(f"{positive_percentage:.2f}")

    f.write("\n% of negative vtarget:\n")
    f.write(f"{negative_percentage:.2f}")

    f.write("\n\n-------------------------------\n\n")
print("Formatted results will be saved to 'final_result_df.txt' upon execution.")

Formatted results will be saved to 'final_result_df.txt' upon execution.


In [5]:
import numpy as np
import pandas as pd
from datetime import datetime
import os

# Define the filename for storing generated base vectors
base_vectors_file = '../results/base_vectors/generated_base_vectors.txt'

# Ensure generated_base_vectors is created only once per kernel session and persisted
if os.path.exists(base_vectors_file):
    print(f"Loading generated base vectors from {base_vectors_file}")
    generated_base_vectors = np.loadtxt(base_vectors_file)
    # Reshape if only a single vector was saved (loadtxt might flatten it)
    if generated_base_vectors.ndim == 1:
        generated_base_vectors = generated_base_vectors.reshape(1, -1)
else:
    print("Generating new base vectors and saving them.")
    generated_base_vectors = np.random.uniform(low=-2.0, high=2.0, size=(50, 5))
    np.savetxt(base_vectors_file, generated_base_vectors)


start_time = datetime.now()
print(f"Started at => {start_time}")
print(f"\n-----Calculating Euclidean Distance-----\n")
print("T: ", end="")

# Clear the file before starting to avoid appending to previous runs
with open('../results/final_result_df_2023.txt', 'w') as f:
    pass

for current_threshold in np.arange(0.5, 1.51, 0.1):
    print(f"--> {current_threshold:.1f}", end="")

    # Iterate through each generated base vector
    for gen_base_vec_idx in range(generated_base_vectors.shape[0]):
        current_base_vector = generated_base_vectors[gen_base_vec_idx]
        accepted_results_list = [] # Reset for each base vector within a threshold

        for index, row in main_data.iterrows():
            feature_vector = np.array(row[['p1', 'p2', 'p3', 'p4', 'p5']].tolist())
            euclidean_distance = np.linalg.norm(feature_vector - current_base_vector)
            v_target = 0;

            if euclidean_distance < current_threshold:
                v_target = row['true_value_next_day']

                # Modified: Append a list [feature_vector, v_target]
                accepted_results_list.append([feature_vector.tolist(), v_target])

        if accepted_results_list:
            # Modified: Create DataFrame with explicit column names
            accepted_results_df_without_col =  pd.DataFrame(accepted_results_list)

            accepted_results_df = pd.DataFrame(accepted_results_list, columns=['r1', 'v_target'])
            positive_count = (accepted_results_df['v_target'] > 0).sum()
            negative_count = (accepted_results_df['v_target'] <= 0).sum()
            positive_percentage=(positive_count/len(accepted_results_df))*100
            negative_percentage=(negative_count/len(accepted_results_df))*100

            writeInText(accepted_results_df_without_col, current_base_vector.tolist(), current_threshold, positive_percentage, negative_percentage)

end_time = datetime.now()
print(f"\nFinished at => {end_time}")
print(f"Total time take => {end_time - start_time}")

Loading generated base vectors from ../results/base_vectors/generated_base_vectors.txt
Started at => 2026-02-16 12:55:18.590672

-----Calculating Euclidean Distance-----

T: --> 0.5--> 0.6--> 0.7--> 0.8--> 0.9--> 1.0--> 1.1--> 1.2--> 1.3--> 1.4--> 1.5
Finished at => 2026-02-16 13:30:14.132168
Total time take => 0:34:55.541496


In [6]:
import pandas as pd

def parse_full_info_sorted(file_path):
    with open(file_path, 'r') as f:
        content = f.read()

    # Split content by the separator line
    blocks = content.split('-------------------------------')
    
    data = []

    for block in blocks:
        if not block.strip():
            continue
        
        lines = [line.strip() for line in block.strip().split('\n')]
        
        record = {
            'L_set_count': 0,
            'threshold_value': None,
            'B_vector': None,
            'percent_positive': None,
            'percent_negative': None
        }
        
        # Extract Threshold (τ value)
        if 'τ value:' in lines:
            idx = lines.index('τ value:')
            if idx + 1 < len(lines):
                record['threshold_value'] = lines[idx+1]
        
        # Extract B Vector (Full String)
        if 'B vector:' in lines:
            idx = lines.index('B vector:')
            if idx + 1 < len(lines):
                record['B_vector'] = lines[idx+1]

        # Extract Percentages
        if '% of positive vtarget:' in lines:
            idx = lines.index('% of positive vtarget:')
            if idx + 1 < len(lines):
                record['percent_positive'] = lines[idx+1]
                
        if '% of negative vtarget:' in lines:
            idx = lines.index('% of negative vtarget:')
            if idx + 1 < len(lines):
                record['percent_negative'] = lines[idx+1]

        # Count L Set Vectors
        if 'L set:' in lines and 'B vector:' in lines:
            try:
                l_start = lines.index('L set:')
                b_start = lines.index('B vector:')
                
                count = 0
                # Count lines starting with '[' between labels
                for i in range(l_start + 1, b_start):
                    if lines[i].startswith('['):
                        count += 1
                record['L_set_count'] = count
            except ValueError:
                pass

        # Only add records with valid B_vector
        if record['B_vector'] is not None:
            data.append(record)

    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Sort by count descending
    if not df.empty:
        df_sorted = df.sort_values(by='L_set_count', ascending=False)
        return df_sorted
    return df

# Configure Pandas to show full column width
pd.set_option('display.max_colwidth', None)

# Usage
file_name = '../results/final_result_df_2023.txt'
df_final = parse_full_info_sorted(file_name)

# Select desired columns
columns_to_show = ['L_set_count', 'threshold_value', 'B_vector', 'percent_positive', 'percent_negative']
print(df_final[columns_to_show].head())

# Save to CSV
df_final[columns_to_show].to_csv('../results/sorted_l_set_full_info.csv', index=False)

     L_set_count threshold_value  \
359         2862             1.5   
310         2425             1.4   
264         1975             1.3   
218         1518             1.2   
366         1371             1.5   

                                                                                                 B_vector  \
359   [0.6673177879279355,-0.399743965085662,0.1166337339292971,-0.4774067929209176,-0.38544785593128283]   
310   [0.6673177879279355,-0.399743965085662,0.1166337339292971,-0.4774067929209176,-0.38544785593128283]   
264   [0.6673177879279355,-0.399743965085662,0.1166337339292971,-0.4774067929209176,-0.38544785593128283]   
218   [0.6673177879279355,-0.399743965085662,0.1166337339292971,-0.4774067929209176,-0.38544785593128283]   
366  [-1.1523272049025688,0.5411143749381351,0.1461269041420521,-0.06325676014369375,-0.6721807878932626]   

    percent_positive percent_negative  
359            50.98            49.02  
310            51.51            48.49  
264     