In [None]:
import pandas as pd

# Load the uploaded CSV file
file_path = 'refined_candlestick_patterns1.csv'
candlestick_df = pd.read_csv(file_path)

# Display the first few rows of the dataframe to get an overview
candlestick_df.head()


Unnamed: 0,Start,End,Open,High,Low,Close,Volume,Market Cap,Trend,Candlestick Pattern
0,2017-09-07,2017-09-08,4597.12,4655.04,4491.33,4603.03,1932914000.0,75581470000.0,neutral,Unknown
1,2017-09-08,2017-09-09,4599.88,4661.0,4075.18,4224.81,1993160000.0,73748560000.0,neutral,Unknown
2,2017-09-09,2017-09-10,4228.75,4308.82,4114.11,4232.76,2366425000.0,69792800000.0,neutral,DOJI
3,2017-09-10,2017-09-11,4226.06,4245.44,3951.04,4122.58,1517856000.0,68039150000.0,neutral,Unknown
4,2017-09-11,2017-09-12,4122.94,4261.67,4099.4,4163.2,1617782000.0,68983370000.0,neutral,SPINNING TOP


In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Feature Selection: Remove unnecessary columns ('Start', 'End')
candlestick_df_cleaned = candlestick_df.drop(columns=['Start', 'End'])

# Label Encoding: Encode the 'Candlestick Pattern' column
label_encoder = LabelEncoder()
candlestick_df_cleaned['Candlestick Pattern'] = label_encoder.fit_transform(candlestick_df_cleaned['Candlestick Pattern'])

# Normalization: Apply MinMax scaling to numerical columns
scaler = MinMaxScaler()
numerical_columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Market Cap']
candlestick_df_cleaned[numerical_columns] = scaler.fit_transform(candlestick_df_cleaned[numerical_columns])

# Splitting the data into features (X) and target labels (y)
X = candlestick_df_cleaned.drop(columns=['Candlestick Pattern'])
y = candlestick_df_cleaned['Candlestick Pattern']

# Display the first few rows of the preprocessed data
candlestick_df_cleaned.head()


Unnamed: 0,Open,High,Low,Close,Volume,Market Cap,Trend,Candlestick Pattern
0,0.020624,0.01969,0.022588,0.020722,0.005448,0.014329,neutral,16
1,0.020663,0.019774,0.016503,0.015314,0.005733,0.012993,neutral,16
2,0.015356,0.014769,0.017072,0.015427,0.007499,0.010111,neutral,3
3,0.015317,0.013868,0.014687,0.013852,0.003485,0.008833,neutral,16
4,0.013843,0.014099,0.016857,0.014433,0.003958,0.009521,neutral,13


In [None]:
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from collections import Counter
# Encode the 'Trend' column using LabelEncoder
candlestick_df_cleaned['Trend'] = label_encoder.fit_transform(candlestick_df_cleaned['Trend'])

# Splitting the data into features (X) and target labels (y) again after encoding 'Trend'
X = candlestick_df_cleaned.drop(columns=['Candlestick Pattern'])
y = candlestick_df_cleaned['Candlestick Pattern']

# Applying SMOTE for oversampling the minority classes
try:
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Applying Tomek Links for undersampling the majority classes
    tomek = TomekLinks()
    X_resampled, y_resampled = tomek.fit_resample(X_resampled, y_resampled)

    # Check the distribution of the classes after resampling
    resampled_class_distribution = Counter(y_resampled)
    resampled_class_distribution
except ValueError as e:
    str(e)


In [None]:
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from collections import Counter

# Re-initialize LabelEncoder as the previous session state was lost
label_encoder = LabelEncoder()

# Encode the 'Trend' column using LabelEncoder
candlestick_df_cleaned['Trend'] = label_encoder.fit_transform(candlestick_df_cleaned['Trend'])

# Splitting the data into features (X) and target labels (y) again after encoding 'Trend'
X = candlestick_df_cleaned.drop(columns=['Candlestick Pattern'])
y = candlestick_df_cleaned['Candlestick Pattern']

# Applying SMOTE with a reduced n_neighbors parameter for oversampling the minority classes
smote = SMOTE(random_state=42, k_neighbors=2)  # Set n_neighbors to 2 to handle small class sizes
X_resampled, y_resampled = smote.fit_resample(X, y)

# Applying Tomek Links for undersampling the majority classes
tomek = TomekLinks()
X_resampled, y_resampled = tomek.fit_resample(X_resampled, y_resampled)

# Check the distribution of the classes after resampling
resampled_class_distribution = Counter(y_resampled)
resampled_class_distribution



Counter({16: 891,
         3: 944,
         13: 920,
         4: 991,
         1: 973,
         12: 985,
         7: 971,
         0: 999,
         2: 984,
         6: 984,
         8: 984,
         5: 983,
         11: 991,
         15: 997,
         10: 980,
         9: 999,
         14: 999})

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

# Assuming 'X_resampled' and 'y_resampled' are already preprocessed and available
# Step 1: Scale the features for XGBoost
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)

# Step 2: Define the XGBoost model
def create_xgboost_model(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42):
    return XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate,
                         objective='multi:softmax', num_class=len(np.unique(y_resampled)),
                         use_label_encoder=False, eval_metric='mlogloss', random_state=random_state)

# Hyperparameters (can be tuned further if needed)
n_estimators = 100
max_depth = 6
learning_rate = 0.1

# Cross-validation with TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)
accuracies = []
confusion_matrices = []
classification_reports = []

for train_index, test_index in tscv.split(X_resampled_scaled):
    X_train, X_test = X_resampled_scaled[train_index], X_resampled_scaled[test_index]
    y_train, y_test = y_resampled[train_index], y_resampled[test_index]

    # Create and train the XGBoost model
    model = create_xgboost_model(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate)
    model.fit(X_train, y_train)

    # Evaluate the model
    y_pred = model.predict(X_test)

    # Calculate metrics
    accuracy = np.mean(y_pred == y_test)
    accuracies.append(accuracy)
    confusion_matrices.append(confusion_matrix(y_test, y_pred))
    classification_reports.append(classification_report(y_test, y_pred, output_dict=True))

# Step 3: Report metrics
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy: {average_accuracy:.4f}')

# Display the last classification report and confusion matrix as an example
last_classification_report = classification_reports[-1]
last_confusion_matrix = confusion_matrices[-1]

print('Classification Report for the last fold:')
print(pd.DataFrame(last_classification_report).transpose())

print('Confusion Matrix for the last fold:')
print(last_confusion_matrix)


Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Parameters: { "use_label_encoder" } are not used.



Average Accuracy: 0.3473
Classification Report for the last fold:
              precision    recall  f1-score      support
0              0.000000  0.000000  0.000000     0.000000
1              0.000000  0.000000  0.000000     0.000000
2              0.000000  0.000000  0.000000     0.000000
3              0.000000  0.000000  0.000000     0.000000
4              0.000000  0.000000  0.000000     0.000000
5              0.000000  0.000000  0.000000     0.000000
6              0.000000  0.000000  0.000000     0.000000
7              0.000000  0.000000  0.000000     0.000000
8              0.000000  0.000000  0.000000     0.000000
10             0.000000  0.000000  0.000000     0.000000
11             0.000000  0.000000  0.000000     0.000000
12             0.861905  0.770213  0.813483   235.000000
13             0.237374  0.085455  0.125668   550.000000
14             0.996875  0.640562  0.779951   996.000000
15             1.000000  0.399592  0.571012   981.000000
16             0.00000

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
