In [None]:
# Ignore warnings: the blanket 'ignore' filter below silences every Python
# warning raised for the rest of the session.
import warnings
warnings.filterwarnings('ignore')
# NOTE(review): FutureWarning already falls under the blanket filter above, so
# this second filter looks redundant — confirm before removing.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Data manipulation
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# plt.rcParams['figure.figsize'] = (20,10)
# plt.style.use('fivethirtyeight')

# Preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import (
                                    train_test_split,
                                    RandomizedSearchCV,
                                    TimeSeriesSplit,
                                    cross_val_score
                                    )

# Classifier
from xgboost import XGBClassifier, plot_importance, to_graphviz

# metrics
from sklearn.metrics import (precision_recall_curve,
                             roc_curve,
                             RocCurveDisplay,
                             ConfusionMatrixDisplay
                            )
from sklearn.metrics import (accuracy_score,
                             f1_score,
                             recall_score,
                             precision_score,
                             roc_auc_score,
                             auc
                            )
from sklearn.metrics import (classification_report,
                             confusion_matrix
                            )

In [None]:
# Load the price history and keep only the adjusted close column
df = pd.read_csv('data/spy.csv', index_col=0, parse_dates=True)
df = df[['Adj Close']]

# Log returns and their rolling 20-observation sum
df['Returns'] = np.log(df['Adj Close']).diff()
df['Ret_1M'] = df['Returns'].rolling(window=20).sum()

# Preview the first five rows
df.head()

In [None]:
# Descriptive statistics: summary table for every column of df
df.describe()

In [None]:
# Count the missing entries in each column
df.isna().sum()

In [None]:
# Build the predictor set: rolling sum and rolling standard deviation of the
# returns over windows of 10, 15, ..., 60 observations
features_list = []
for window in range(10, 65, 5):
    ret_col, std_col = f'Ret_{window}', f'Std_{window}'
    rolling_returns = df['Returns'].rolling(window)
    df[ret_col] = rolling_returns.sum()
    df[std_col] = rolling_returns.std()
    features_list.extend([ret_col, std_col])

# The rolling windows leave NaNs at the start of the sample; remove those rows
df.dropna(inplace=True)

In [None]:
# Define Target: 1 when the next row's adjusted close is above 99.5% of the
# current one, else 0.
df['Target'] = np.where(df['Adj Close'].shift(-1) > 0.995 * df['Adj Close'], 1, 0)

# The final row has no next close: shift(-1) yields NaN there, the comparison
# evaluates to False, and the row would silently be labelled 0. Drop it rather
# than keep a made-up label.
# NOTE: this trims one row per execution — re-run from the load cell instead
# of re-running this cell on its own.
df = df.iloc[:-1].copy()

# Check output (tail shows the last labelled rows)
df.tail()

In [None]:
# Predictor matrix: every column except the price, the raw return series,
# the 1M return and the label
X = df.drop(columns=['Adj Close', 'Returns', 'Ret_1M', 'Target'])

# Peek at the underlying NumPy array (X itself remains a DataFrame)
X.to_numpy()

In [None]:
# Define label or target: the 'Target' column of df, displayed for inspection
y = df['Target']
y

In [None]:
# Chronological 80/20 split: shuffle=False keeps the row order, so the test
# set is the final 20% of the rows
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
    random_state=42,
)

# Report how many rows landed in each partition
print(f"Train and Test Size {len(X_train)}, {len(X_test)}")

In [None]:
# Fit a baseline XGBoost classifier on the training features (no scaling is
# applied in this cell).
# `silent` has been superseded by `verbosity` in XGBoost, so only verbosity=0
# is passed to keep the library quiet.
model = XGBClassifier(verbosity=0, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Class probabilities for the test set
y_proba = model.predict_proba(X_test)

# Hard class predictions for the test set
y_pred = model.predict(X_test)

In [None]:
# Accuracy of the baseline model out-of-sample and in-sample
acc_test = accuracy_score(y_true=y_test, y_pred=y_pred)
acc_train = accuracy_score(y_true=y_train, y_pred=model.predict(X_train))

print(f'Train Accuracy: {acc_train:0.4}, Test Accuracy: {acc_test:0.4}')

In [None]:
# Confusion matrix of the baseline model on the test set
disp = ConfusionMatrixDisplay.from_estimator(
    estimator=model,
    X=X_test,
    y=y_test,
    display_labels=model.classes_,
    cmap=plt.cm.Blues,
)
disp.ax_.set_title('Confusion matrix')
plt.show()

In [None]:
# Text report of the per-class metrics for the current test predictions
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# ROC curve of the baseline model, with the dashed diagonal as a reference line
disp_roc = RocCurveDisplay.from_estimator(
    estimator=model,
    X=X_test,
    y=y_test,
    name='XGBoost',
)
disp_roc.ax_.set_title('ROC Curve')
disp_roc.ax_.plot([0, 1], [0, 1], linestyle='--')
plt.show()

In [None]:
# Timeseries CV: preview the train/test indices of each split (4 splits in this sketch)
# tscv = TimeSeriesSplit(n_splits=4, gap=1) # sklearn 1.0
# for train, test in tscv.split(X):
#     print(train, test)

In [None]:
# Cross-validation: time-ordered splitter with 5 splits; gap=1 is passed so
# one sample is left out between each training fold and its validation fold
tscv = TimeSeriesSplit(n_splits=5, gap=1)

In [None]:
# Get params list: show the baseline model's current parameter settings
model.get_params()

In [None]:
# Candidate values for the hyper-parameter search, one list per parameter
param_grid = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth=[3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
)

In [None]:
# Randomized search over param_grid: 100 sampled candidates, scored by F1 on
# the time-series folds. random_state pins which candidates are sampled, so
# the search (and therefore best_params_) is reproducible across runs.
rs = RandomizedSearchCV(
    model,
    param_grid,
    n_iter=100,
    scoring='f1',
    cv=tscv,
    verbose=0,
    random_state=42,
)
# verbose=0 here is forwarded to the estimator's fit call
rs.fit(X_train, y_train, verbose=0)

In [None]:
# best parameters: the parameter combination selected by the randomized search
rs.best_params_

In [None]:
# best score: cross-validated score (scoring='f1') of the selected combination
rs.best_score_

In [None]:
# Refit the XGB Classifier with the best params.
# random_state matches the estimator that was cloned during the search, so the
# refit runs under the same seed the winning parameters were selected with
# (column subsampling via colsample_bytree depends on it).
cls = XGBClassifier(**rs.best_params_, random_state=42)

# eval_set makes fit report the per-round metric on the train and test sets;
# no early stopping is configured in this call.
cls.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        # eval_metric='logloss',
        verbose=True)

In [None]:
# Return the evaluation results
# evals_result = cls.evals_result()
# evals_result

In [None]:
# Cross-validate the tuned classifier on the time-series folds.
# No `scoring` argument is given, so the estimator's default scorer applies.
score = cross_val_score(estimator=cls, X=X_train, y=y_train, cv=tscv)
print(f'Mean CV Score : {score.mean():0.4}')

In [None]:
# Rank the features by the tuned model's importance scores, highest first
feature_imp = (
    pd.DataFrame({'Importance Score': cls.feature_importances_, 'Features': X.columns})
    .sort_values(by='Importance Score', ascending=False)
)

# Horizontal bar chart of the ranked scores, drawn on an explicit axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=feature_imp, x='Importance Score', y='Features', ax=ax)
ax.set_title('Features Importance');

In [None]:
# The Gain is the most relevant attribute to interpret the relative importance of each feature.
# importance_type options accepted by plot_importance:
#   'weight' - number of times a feature appears in the trees
#   'gain'   - average gain of the splits that use the feature
#   'cover'  - average coverage of the splits that use the feature

In [None]:
# feature importance_type = 'gain'
# The call now requests 'gain' (it previously plotted 'weight', contradicting
# the comment). The x-axis label is set explicitly so the chart names the
# quantity being plotted.
plot_importance(cls, importance_type='gain', xlabel='Gain', show_values=False);

In [None]:
# Build a SHAP tree explainer for the tuned classifier and compute SHAP values
# for the test features.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top.
import shap
explainer = shap.TreeExplainer(cls)
shap_values = explainer.shap_values(X_test)

In [None]:
# feature importance summary: bar-type summary plot of the test-set SHAP values
shap.summary_plot(shap_values, X_test, plot_type="bar")

In [None]:
# interpretation plot: default summary plot of the test-set SHAP values
shap.summary_plot(shap_values, X_test)

In [None]:
## Tree Visualization
# Render one tree of the tuned classifier with graphviz.
# change tree number (num_trees) to see the corresponding plot
to_graphviz(cls, num_trees=10, rankdir='UT')

In [None]:
# Test-set predictions from the tuned classifier
y_pred = cls.predict(X_test)

# Out-of-sample and in-sample accuracy
acc_test = accuracy_score(y_true=y_test, y_pred=y_pred)
acc_train = accuracy_score(y_true=y_train, y_pred=cls.predict(X_train))

# Print both figures side by side
print(f'\n Training Accuracy \t: {acc_train :0.4} \n Test Accuracy \t\t: {acc_test :0.4}')

In [None]:
# Display confusion matrix for the tuned classifier on the test set.
# The labels come from cls itself (the estimator being plotted) rather than
# from the baseline `model`, so this cell no longer depends on a different
# estimator's fitted state.
disp = ConfusionMatrixDisplay.from_estimator(
        cls,
        X_test,
        y_test,
        display_labels=cls.classes_,
        cmap=plt.cm.Blues
    )
disp.ax_.set_title('Confusion matrix')
plt.show()

In [None]:
# Text report of the per-class metrics for the current test predictions
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# ROC curve of the tuned classifier, with the dashed diagonal as a reference line
disp_roc = RocCurveDisplay.from_estimator(
    estimator=cls,
    X=X_test,
    y=y_test,
    name='Tuned XGBoost',
)
disp_roc.ax_.set_title('ROC Curve')
disp_roc.ax_.plot([0, 1], [0, 1], linestyle='--')
plt.show()