In [1]:
import pandas as pd
import numpy as np
import pywt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Load data
data = pd.read_csv('USDCHF_hourly_20.csv')

data

Unnamed: 0.1,Unnamed: 0,vw,o,c,h,l,n,prev_close,daily_return,abs_daily_return,pct_change,direction,Recovery,outlier_id,day type,day,Date,Time
0,0,1.1095,1.111100,1.10928,1.11121,1.107730,7365.0,1.10928,-0.000117,0.000117,0.011719,Down,fast recovery,1,prior day,Tuesday,2010-05-11,00:00:00
1,1,1.1089,1.109280,1.10915,1.10983,1.107420,4139.0,1.10928,-0.000117,0.000117,0.011719,Down,fast recovery,1,prior day,Tuesday,2010-05-11,01:00:00
2,2,1.1093,1.109100,1.10979,1.11040,1.108100,3464.0,1.10915,0.000577,0.000577,0.057702,Up,fast recovery,1,prior day,Tuesday,2010-05-11,02:00:00
3,3,1.1103,1.109790,1.11042,1.11100,1.109630,2906.0,1.10979,0.000568,0.000568,0.056767,Up,fast recovery,1,prior day,Tuesday,2010-05-11,03:00:00
4,4,1.1112,1.110420,1.10990,1.11253,1.109700,5889.0,1.11042,-0.000468,0.000468,0.046829,Down,fast recovery,1,prior day,Tuesday,2010-05-11,04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53189,53189,0.9258,0.925610,0.92641,0.92675,0.924800,8906.0,0.92560,0.000875,0.000875,0.087511,Up,fast recovery,280,post day,Friday,2023-03-17,19:00:00
53190,53190,0.9263,0.926400,0.92580,0.92740,0.925690,6548.0,0.92641,-0.000658,0.000658,0.065846,Down,fast recovery,280,post day,Friday,2023-03-17,20:00:00
53191,53191,0.9251,0.925900,0.92280,0.92660,0.922755,402.0,0.92580,-0.003240,0.003240,0.324044,Down,fast recovery,280,post day,Friday,2023-03-17,21:00:00
53192,53192,0.9227,0.923000,0.92270,0.92310,0.922344,449.0,0.92280,-0.000108,0.000108,0.010837,Down,fast recovery,280,post day,Friday,2023-03-17,22:00:00


In [2]:
# Select the column to transform
signal = data['daily_return'].dropna()

# Define the wavelet function
wavelet = 'gaus2'
# Define the number of scales
scales = np.logspace(0.1, 1.5, num=20, base=10)  # Logarithmically spaced scales

# Compute the Continuous Wavelet Transform
coefficients, frequencies = pywt.cwt(signal, scales, wavelet)

# For simplicity, we'll use the mean of the coefficients across all scales as a feature
cwt_feature = np.mean(coefficients, axis=0)

# Since the CWT reduces the length of the data due to edge effects, align it:
aligned_cwt_feature = np.full(data.shape[0], np.nan)  # Initialize with NaN
aligned_cwt_feature[:len(cwt_feature)] = cwt_feature  # Fill with CWT results

# Add to data
data['CWT_Mean'] = aligned_cwt_feature

In [3]:
# Drop unnecessary columns and handle any missing values
data.drop(columns=['Unnamed: 0', 'Date', 'Time', 'day'], inplace=True)

# Convert categorical data to numeric
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Separate features and target
X = data.drop('direction', axis=1)
y = data['direction']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
predictions = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))

# Feature importance
importances = model.feature_importances_
feature_names = X.columns
feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance.sort_values(by='Importance', ascending=False, inplace=True)
print(feature_importance)

Accuracy: 0.9977441488861735
             Feature  Importance
7       daily_return    0.949495
13          CWT_Mean    0.007498
9         pct_change    0.006087
8   abs_daily_return    0.005852
2                  c    0.005736
6         prev_close    0.005590
5                  n    0.005023
1                  o    0.004394
4                  l    0.003023
0                 vw    0.002970
3                  h    0.002774
11        outlier_id    0.001385
12          day type    0.000173
10          Recovery    0.000000
