In [24]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier
import numpy as np 

eth_data = pd.read_csv('../eth.csv')
eth_data

Unnamed: 0,ID,Date,Price (GBP),Market Cap (GBP),Total Volume,% 24hr Change,% 7d Change,Class
0,1,2015-08-07 00:00:00,1.826173,0.000000e+00,5.844408e+04,0.00,0.00,stay the same
1,2,2015-08-08 00:00:00,0.831835,5.021919e+07,2.300759e+05,-54.45,0.00,decrease
2,3,2015-08-10 00:00:00,0.441134,2.666127e+07,2.569256e+05,-46.97,0.00,decrease
3,4,2015-08-11 00:00:00,0.685117,4.142558e+07,9.749977e+05,55.31,0.00,increase
4,5,2015-08-12 00:00:00,0.804621,4.867198e+07,1.327933e+06,17.44,0.00,increase
...,...,...,...,...,...,...,...,...
2874,2876,2023-06-22 00:00:00,1482.229331,1.781982e+11,9.627401e+09,5.68,13.76,increase
2875,2877,2023-06-23 00:00:00,1470.202639,1.767891e+11,7.720735e+09,-0.81,12.91,decrease
2876,2878,2023-06-24 00:00:00,1487.609046,1.788451e+11,7.317404e+09,1.18,11.16,increase
2877,2879,2023-06-25 00:00:00,1474.800287,1.773235e+11,3.845510e+09,-0.86,9.55,decrease


In [25]:
from sklearn.preprocessing import LabelEncoder

# Assuming you have a target label column named 'target' in your DataFrame
label_encoder = LabelEncoder()
eth_data['Class'] = label_encoder.fit_transform(eth_data['Class'])

# Get the mapping between the encoded labels and the original classes
label_mapping = dict(zip(range(len(label_encoder.classes_)), label_encoder.classes_))

# Print the mapping
for encoded_label, original_class in label_mapping.items():
    print(f"Encoded Label: {encoded_label} --> Original Class: {original_class}")
    
eth_data

Encoded Label: 0 --> Original Class: decrease
Encoded Label: 1 --> Original Class: increase
Encoded Label: 2 --> Original Class: stay the same


Unnamed: 0,ID,Date,Price (GBP),Market Cap (GBP),Total Volume,% 24hr Change,% 7d Change,Class
0,1,2015-08-07 00:00:00,1.826173,0.000000e+00,5.844408e+04,0.00,0.00,2
1,2,2015-08-08 00:00:00,0.831835,5.021919e+07,2.300759e+05,-54.45,0.00,0
2,3,2015-08-10 00:00:00,0.441134,2.666127e+07,2.569256e+05,-46.97,0.00,0
3,4,2015-08-11 00:00:00,0.685117,4.142558e+07,9.749977e+05,55.31,0.00,1
4,5,2015-08-12 00:00:00,0.804621,4.867198e+07,1.327933e+06,17.44,0.00,1
...,...,...,...,...,...,...,...,...
2874,2876,2023-06-22 00:00:00,1482.229331,1.781982e+11,9.627401e+09,5.68,13.76,1
2875,2877,2023-06-23 00:00:00,1470.202639,1.767891e+11,7.720735e+09,-0.81,12.91,0
2876,2878,2023-06-24 00:00:00,1487.609046,1.788451e+11,7.317404e+09,1.18,11.16,1
2877,2879,2023-06-25 00:00:00,1474.800287,1.773235e+11,3.845510e+09,-0.86,9.55,0


In [26]:
selected_columns= eth_data.iloc[:, [2, 3, 4, 5, 6, 7]]

# Split the data into features (X) and target label (y)
X= selected_columns.iloc[:, :-1]  # Features (columns 2, 3, 4)
y= selected_columns.iloc[:, -1]   # Target label (column 7)# Standardize the data

In [27]:
selected_columns

Unnamed: 0,Price (GBP),Market Cap (GBP),Total Volume,% 24hr Change,% 7d Change,Class
0,1.826173,0.000000e+00,5.844408e+04,0.00,0.00,2
1,0.831835,5.021919e+07,2.300759e+05,-54.45,0.00,0
2,0.441134,2.666127e+07,2.569256e+05,-46.97,0.00,0
3,0.685117,4.142558e+07,9.749977e+05,55.31,0.00,1
4,0.804621,4.867198e+07,1.327933e+06,17.44,0.00,1
...,...,...,...,...,...,...
2874,1482.229331,1.781982e+11,9.627401e+09,5.68,13.76,1
2875,1470.202639,1.767891e+11,7.720735e+09,-0.81,12.91,0
2876,1487.609046,1.788451e+11,7.317404e+09,1.18,11.16,1
2877,1474.800287,1.773235e+11,3.845510e+09,-0.86,9.55,0


In [28]:
from imblearn.over_sampling import SMOTE

# Create an instance of SMOTE
smote = SMOTE()

# Apply SMOTE to the dataset
X_resampled, y_resampled = smote.fit_resample(X, y)

In [29]:
import pandas as pd

# Count occurrences of each class in y
y_class_counts = pd.Series(y).value_counts()

# Count occurrences of each class in y_resampled
y_resampled_class_counts = pd.Series(y_resampled).value_counts()

# Print the counts
print("Original Class Counts:\n", y_class_counts)
print("\nResampled Class Counts:\n", y_resampled_class_counts)

Original Class Counts:
 Class
1    1462
0    1394
2      23
Name: count, dtype: int64

Resampled Class Counts:
 Class
2    1462
0    1462
1    1462
Name: count, dtype: int64


In [30]:
from sklearn.preprocessing import MinMaxScaler

# Normalize the data
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X_resampled)

In [31]:
#NB MODEL
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y_resampled, test_size=0.3, random_state=42)

In [34]:
from sklearn.model_selection import cross_val_score

# Initialize a Gaussian Naive Bayes classifier
nb_classifier = GaussianNB()

# Train the classifier
nb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = nb_classifier.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Perform cross-validation
cv_scores = cross_val_score(nb_classifier, X_normalized, y_resampled, cv=10)  # 5-fold cross-validation

# Print the cross-validation scores
print("\nCross-Validation Scores:\n", cv_scores)
print("Mean Accuracy:", np.mean(cv_scores))

Accuracy: 0.8411854103343465

Cross-Validation Scores:
 [0.82687927 0.67653759 0.85649203 0.88610478 0.86560364 0.88610478
 0.78082192 0.69863014 0.76484018 0.84474886]
Mean Accuracy: 0.8086763191562392


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score

# Assuming you have predicted labels stored in y_pred and true labels stored in y_test

# Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)

# Create a heatmap for the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title(f'Confusion Matrix (Accuracy: {accuracy:.3f})')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming you have predicted labels stored in y_pred and true labels stored in y_test

# Calculate precision, recall, and F1 score
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the evaluation metrics
print(f'Precision: {precision:.3f}')
print(f'Recall: {recall:.3f}')
print(f'F1 Score: {f1:.3f}')