In [30]:
# Procesamiento de datos
import pandas as pd
# Funciones algebráicas
import numpy as np
# Visualización
import matplotlib.pyplot as plt
import seaborn as sns
## Estilos
plt.style.use('fivethirtyeight')
sns.set_style('whitegrid')
# Contador
from collections import Counter
# Métodos de iteración
from itertools import product
# Machine Learning
## Cross Validation
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
## Separación de datos de entrenamiento
from sklearn.model_selection import train_test_split
## Preprocesamiento de variables
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OrdinalEncoder, RobustScaler, MaxAbsScaler, QuantileTransformer, PowerTransformer
# Modelos de clasificacion
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

## Modelo de máquina de soporte vectorial
from sklearn.svm import SVC
## Clasificador de árbol de decisión
from sklearn.tree import DecisionTreeClassifier
## Clasificador de bosques aleatorios
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
## Naive Bayes
from sklearn.naive_bayes import GaussianNB
## Regresión Logística
from sklearn.linear_model import LogisticRegression
## Muestreado para balancear datos
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
## Métricas
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc, roc_auc_score, ConfusionMatrixDisplay, f1_score, make_scorer, precision_score, recall_score
## Ajuste de hiperparámetros
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# módulo que proporciona soporte en tiempo de ejecución para sugerencias de tipo
from sklearn import datasets
from typing import Dict, Iterable, Any
# Desactivar los avisos
import warnings
warnings.filterwarnings('ignore')
# Importar los datos de yahoo
import yfinance as yf
# Librería para plotear mejor datos financieros
#import cufflinks as cf
#cf.set_config_file(offline = True)

He intentado escoger acciones con poca correlación. La explicación de ellas es:

AAPL: Apple Inc., uno de los fabricantes más grandes de tecnología de consumo, conocido por productos como el iPhone, el iPad y las MacBooks.

RELIANCE.NS: Reliance Industries Limited, una conglomerado multinacional indio que tiene negocios en diversos sectores como energía, petroquímicos, textiles, recursos naturales, y telecomunicaciones. El sufijo ".NS" indica que la acción se negocia en la National Stock Exchange of India.

BHP.AX: BHP Group Limited (anteriormente BHP Billiton), una de las compañías mineras y de recursos más grandes del mundo. El sufijo ".AX" indica que la acción se negocia en la Australian Securities Exchange.

SAN.MC: Banco Santander, S.A., un banco multinacional español y una de las mayores instituciones bancarias de la zona euro. El sufijo ".MC" indica que la acción se negocia en la Bolsa de Madrid.

REP.MC: Repsol, S.A., una empresa energética y petroquímica española. Al igual que SAN.MC, el sufijo ".MC" indica que la acción se negocia en la Bolsa de Madrid.

IBE.MC: Iberdrola, S.A., una empresa española de energía, una de las mayores productoras de energía renovable. También cotiza en la Bolsa de Madrid.

TEF.MC: Telefónica, S.A., una compañía multinacional de telecomunicaciones con sede en España. También cotiza en la Bolsa de Madrid.

Acciones con las que trabajaré: 

["AAPL", "RELIANCE.NS", "BHP.AX", "SAN.MC", "REP.MC", "IBE.MC", "TEF.MC"]

In [31]:
df = pd.read_csv("acciones_preprocesado.csv")
df

Unnamed: 0,accion,date,high,low,price,returns,direction,lag1,lag2,lag3,lag4,lag5,sma200,sma_lower200,sma_upper200,ema_s,ema_l,roll_low,roll_high
0,AAPL,2000-10-24,0.372768,0.335938,0.285350,-0.076468,-1.0,0.043894,0.029268,-0.060818,0.000000,-0.066091,0.786866,0.439835,1.133898,0.331946,0.444854,0.327009,0.437500
1,AAPL,2000-10-25,0.342634,0.329241,0.279680,-0.020069,-1.0,-0.076468,0.043894,0.029268,-0.060818,0.000000,0.784759,0.430590,1.138928,0.323905,0.432619,0.327009,0.415179
2,AAPL,2000-10-26,0.337054,0.312500,0.279680,0.000000,0.0,-0.020069,-0.076468,0.043894,0.029268,-0.060818,0.782862,0.421987,1.143738,0.317101,0.421290,0.312500,0.415179
3,AAPL,2000-10-27,0.342634,0.319196,0.280625,0.003372,1.0,0.000000,-0.020069,-0.076468,0.043894,0.029268,0.780609,0.412876,1.148342,0.311489,0.410870,0.312500,0.415179
4,AAPL,2000-10-30,0.356027,0.334821,0.291963,0.039610,1.0,0.003372,0.000000,-0.020069,-0.076468,0.043894,0.778273,0.404112,1.152433,0.308485,0.402062,0.312500,0.415179
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41652,TEF.MC,2024-02-23,3.776000,3.710000,3.776000,0.008778,1.0,0.015888,0.004625,0.011244,0.018089,-0.015880,3.636597,3.396894,3.876300,3.671950,3.676017,3.541000,3.792000
41653,TEF.MC,2024-02-26,3.794000,3.748000,3.776000,0.000000,0.0,0.008778,0.015888,0.004625,0.011244,0.018089,3.637271,3.396759,3.877783,3.687958,3.683423,3.541000,3.794000
41654,TEF.MC,2024-02-27,3.815000,3.751000,3.810000,0.008964,1.0,0.000000,0.008778,0.015888,0.004625,0.011244,3.638124,3.396375,3.879873,3.706734,3.692799,3.541000,3.815000
41655,TEF.MC,2024-02-28,3.874000,3.814000,3.828000,0.004713,1.0,0.008964,0.000000,0.008778,0.015888,0.004625,3.639109,3.395876,3.882342,3.725390,3.702814,3.541000,3.874000


Lo primero que voy a hacer es transformar la fecha para poder meterla como variable numérica 

In [32]:
df.date = pd.to_datetime(df.date)

# Extraer componentes de la fecha
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['day_of_week'] = df['date'].dt.dayofweek  # 0 es lunes, 6 es domingo
df['day_of_year'] = df['date'].dt.dayofyear
df['week_of_year'] = df['date'].dt.isocalendar().week

# Aplicar codificación cíclica para mes y día de la semana
df['sin_month'] = np.sin(2 * np.pi * df['month']/12)
df['cos_month'] = np.cos(2 * np.pi * df['month']/12)
df['sin_day'] = np.sin(2 * np.pi * df['day']/31)
df['cos_day'] = np.cos(2 * np.pi * df['day']/31)
df['sin_day_of_week'] = np.sin(2 * np.pi * df['day_of_week']/7)
df['cos_day_of_week'] = np.cos(2 * np.pi * df['day_of_week']/7)

# elimino la columna date porque ya la he procesado y la fecha como tal no se puede meter en el modelo
# elimino las otras variables que he transformado con seno y coseno para el tema de los ciclos naturales 
df.drop(["date", "month", "day", "day_of_week"], axis = 1, inplace = True)

df

Unnamed: 0,accion,high,low,price,returns,direction,lag1,lag2,lag3,lag4,...,roll_high,year,day_of_year,week_of_year,sin_month,cos_month,sin_day,cos_day,sin_day_of_week,cos_day_of_week
0,AAPL,0.372768,0.335938,0.285350,-0.076468,-1.0,0.043894,0.029268,-0.060818,0.000000,...,0.437500,2000,298,43,-0.866025,0.5,-0.988468,0.151428,0.781831,0.623490
1,AAPL,0.342634,0.329241,0.279680,-0.020069,-1.0,-0.076468,0.043894,0.029268,-0.060818,...,0.415179,2000,299,43,-0.866025,0.5,-0.937752,0.347305,0.974928,-0.222521
2,AAPL,0.337054,0.312500,0.279680,0.000000,0.0,-0.020069,-0.076468,0.043894,0.029268,...,0.415179,2000,300,43,-0.866025,0.5,-0.848644,0.528964,0.433884,-0.900969
3,AAPL,0.342634,0.319196,0.280625,0.003372,1.0,0.000000,-0.020069,-0.076468,0.043894,...,0.415179,2000,301,43,-0.866025,0.5,-0.724793,0.688967,-0.433884,-0.900969
4,AAPL,0.356027,0.334821,0.291963,0.039610,1.0,0.003372,0.000000,-0.020069,-0.076468,...,0.415179,2000,304,44,-0.866025,0.5,-0.201299,0.979530,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41652,TEF.MC,3.776000,3.710000,3.776000,0.008778,1.0,0.015888,0.004625,0.011244,0.018089,...,3.792000,2024,54,8,0.866025,0.5,-0.998717,-0.050649,-0.433884,-0.900969
41653,TEF.MC,3.794000,3.748000,3.776000,0.000000,0.0,0.008778,0.015888,0.004625,0.011244,...,3.794000,2024,57,9,0.866025,0.5,-0.848644,0.528964,0.000000,1.000000
41654,TEF.MC,3.815000,3.751000,3.810000,0.008964,1.0,0.000000,0.008778,0.015888,0.004625,...,3.815000,2024,58,9,0.866025,0.5,-0.724793,0.688967,0.781831,0.623490
41655,TEF.MC,3.874000,3.814000,3.828000,0.004713,1.0,0.008964,0.000000,0.008778,0.015888,...,3.874000,2024,59,9,0.866025,0.5,-0.571268,0.820763,0.974928,-0.222521


Ahora voy a separar los data frames en:
1.- variables categóricas para hacerle onehotencoding
2.- variables numéricas para aplicarle escaladores
3.- variables seno y cosenoidales que no se transforman ya más
4.- variable objetivo (dirección)

In [33]:
df_cat = df[["accion"]].copy()
columns_to_copy = [col for col in df.columns if col.startswith('sin') or col.startswith('cos')]
df_sincos = df[columns_to_copy].copy()
columns_to_exclude = columns_to_copy + ["accion", "direction"]
columns_to_select = [col for col in df.columns if col not in columns_to_exclude]
df_num = df[columns_to_select].copy()
df_dir = df[["direction"]].copy()

Empiezo haciendo one hot encoding sobre la única variable tipo texto que tengo. El nombre de la acción

In [34]:
df_num.describe()

Unnamed: 0,high,low,price,returns,lag1,lag2,lag3,lag4,lag5,sma200,sma_lower200,sma_upper200,ema_s,ema_l,roll_low,roll_high,year,day_of_year,week_of_year
count,41657.0,41657.0,41657.0,41657.0,41657.0,41657.0,41657.0,41657.0,41657.0,41657.0,41657.0,41657.0,41657.0,41657.0,41657.0,41657.0,41657.0,41657.0,41657.0
mean,113.817012,111.198921,104.566621,0.000393,0.000392,0.000391,0.000387,0.000389,0.000385,97.948491,80.350612,115.54637,104.152315,103.639831,106.274867,117.712044,2011.936721,183.459179,26.623833
std,364.780828,357.03522,357.583899,0.020363,0.020374,0.020379,0.020384,0.020385,0.020388,335.066304,284.553453,388.530257,355.941076,353.971996,342.35445,375.699867,6.769532,105.731326,15.101813
min,0.235536,0.227143,0.198346,-0.77629,-0.77629,-0.77629,-0.77629,-0.77629,-0.77629,0.222776,-0.042618,0.243699,0.205185,0.209982,0.227143,0.257857,2000.0,1.0,1.0
25%,6.572,6.439327,3.534887,-0.009074,-0.009074,-0.009074,-0.009079,-0.009079,-0.009081,3.459248,2.76579,4.141229,3.535472,3.542249,6.16538,6.779,2006.0,91.0,13.0
50%,13.03072,12.763778,6.381726,0.000345,0.000345,0.000345,0.000342,0.000345,0.000338,6.360646,5.090845,7.711815,6.370496,6.365317,12.339062,13.45,2012.0,184.0,27.0
75%,29.645367,29.195,15.948939,0.009999,0.010004,0.010013,0.010013,0.010017,0.010017,15.281023,12.882795,17.797632,15.912398,15.826852,28.056183,30.638929,2018.0,276.0,40.0
max,2999.899902,2966.699951,2987.25,0.782708,0.782708,0.782708,0.782708,0.782708,0.782708,2486.273513,2161.673174,2885.638175,2943.460843,2884.92886,2884.699951,2999.899902,2024.0,366.0,53.0


In [35]:
# drop_first para evitar la colinealidad
df_cat = pd.get_dummies(df_cat, columns=["accion"], drop_first = True).astype(int)
df_cat 


Unnamed: 0,accion_BHP.AX,accion_IBE.MC,accion_RELIANCE.NS,accion_REP.MC,accion_SAN.MC,accion_TEF.MC
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
41652,0,0,0,0,0,1
41653,0,0,0,0,0,1
41654,0,0,0,0,0,1
41655,0,0,0,0,0,1


Ahora aplico escaladores sobre las variables numéricas

In [36]:
# scaler = MinMaxScaler()
# scaled_data = scaler.fit_transform(df_num)

# # Convertir de nuevo a DataFrame si es necesario, manteniendo los mismos nombres de columnas
# df_num = pd.DataFrame(scaled_data, columns=df_num.columns)
# plt.figure(figsize=(10, 6))  # Ajusta el tamaño del gráfico a tu preferencia

# # Boxplot de todos los datos escalados
# sns.boxplot(data=df_num)

# # Dar título al gráfico
# plt.title('Boxplot of Scaled Features')

# # Mejorar los ticks del eje x para evitar la superposición
# plt.xticks(rotation=90)

# plt.show()


In [37]:
# # Histogramas de todos los datos escalados
# df_num.hist(figsize=(20, 15), bins=50, layout=(7, 4))
# plt.show()
# # Me parece rara la distribucion de price, low y high...

Ahora ya tengo el df de categóricas con onehotencoding y las numéricas con los datos escalados. Por tanto los quiero volver a unir y juntar también con las variables seno y cosenoidales. Y tener solo separada la variable objetivo

In [38]:
# por otra parte me guardo la variable objetivo en este dataframe y veo que está balanceado pero que tiene algunos 0, 
# no se bien que hacer con ellos. Significa que la diferencia de precio de un día a otro ha sido 0
df_dir.direction.value_counts()

direction
 1.0    21028
-1.0    19772
 0.0      857
Name: count, dtype: int64

Ahora vamos a entrenar un modelo solo para ver que tal funciona, si me das el ok Alejandro intento meterle un bucle para que pruebe distintos modelos y distintos escaladores. De momento hago la prueba con logistic regression.

In [39]:
df_num[:-1000]

Unnamed: 0,high,low,price,returns,lag1,lag2,lag3,lag4,lag5,sma200,sma_lower200,sma_upper200,ema_s,ema_l,roll_low,roll_high,year,day_of_year,week_of_year
0,0.372768,0.335938,0.285350,-0.076468,0.043894,0.029268,-0.060818,0.000000,-0.066091,0.786866,0.439835,1.133898,0.331946,0.444854,0.327009,0.437500,2000,298,43
1,0.342634,0.329241,0.279680,-0.020069,-0.076468,0.043894,0.029268,-0.060818,0.000000,0.784759,0.430590,1.138928,0.323905,0.432619,0.327009,0.415179,2000,299,43
2,0.337054,0.312500,0.279680,0.000000,-0.020069,-0.076468,0.043894,0.029268,-0.060818,0.782862,0.421987,1.143738,0.317101,0.421290,0.312500,0.415179,2000,300,43
3,0.342634,0.319196,0.280625,0.003372,0.000000,-0.020069,-0.076468,0.043894,0.029268,0.780609,0.412876,1.148342,0.311489,0.410870,0.312500,0.415179,2000,301,43
4,0.356027,0.334821,0.291963,0.039610,0.003372,0.000000,-0.020069,-0.076468,0.043894,0.778273,0.404112,1.152433,0.308485,0.402062,0.312500,0.415179,2000,304,44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40652,4.198000,4.051000,2.990779,0.008543,0.002177,-0.035100,-0.039769,-0.014058,0.067127,4.504122,3.439726,5.568518,3.101504,3.355200,3.533000,4.850000,2020,91,14
40653,4.162000,4.002000,2.880779,-0.037473,0.008543,0.002177,-0.035100,-0.039769,-0.014058,4.493279,3.407250,5.579309,3.067546,3.320057,3.533000,4.850000,2020,92,14
40654,4.094000,3.872000,2.915893,0.012115,-0.037473,0.008543,0.002177,-0.035100,-0.039769,4.482543,3.376884,5.588201,3.044215,3.290119,3.533000,4.850000,2020,93,14
40655,4.078000,3.980000,2.898694,-0.005916,0.012115,-0.037473,0.008543,0.002177,-0.035100,4.471654,3.346782,5.596526,3.021827,3.261125,3.652500,4.850000,2020,94,14


In [40]:
scalers = {
    "MinMax Scaler": MinMaxScaler(),
    "Standard Scaler": StandardScaler(),
    "MaxAbs Scaler": MaxAbsScaler(),
    "Robust Scaler": RobustScaler(),
    "Quant-Normal": QuantileTransformer(output_distribution="normal"),
    "Quant-Uniform": QuantileTransformer(output_distribution="uniform"),
    "PowerTransf-YeoJohnson": PowerTransformer(method='yeo-johnson')
}


# Define classification models
models = {
    "SVC": SVC(gamma="auto", C=10000),
    "Logistic Regression": LogisticRegression(penalty="l2"),
    "KNeighbors Classifier": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "GaussianNB": GaussianNB(),
    "Random Forest Classifier": RandomForestClassifier(n_estimators=100),
    "MLP": MLPClassifier(tol=0.05, max_iter=1000),
    "Gaussian Process Classifier": GaussianProcessClassifier(), # tarda mucho este? comprobar
    "Ada Boost Classifier": AdaBoostClassifier(n_estimators=100),
    "Bagging Classifier": BaggingClassifier(estimator=SVC(), n_estimators=10)
}

In [41]:

# Configurar el método de validación cruzada
cv = StratifiedKFold(n_splits=5)

# Diccionario de métricas de scoring
scoring = {'accuracy': 'accuracy'}

# DataFrame para guardar los resultados finales
results = pd.DataFrame()
df_dir = df_dir[-5000:]
# Bucle para cada escalador
for scaler_name, scaler in scalers.items():
    # Aplicar el escalador a los datos
    X_scaled = pd.DataFrame(scaler.fit_transform(df_num.copy()), columns = df_num.columns)  
    df_variables = pd.concat([df_cat, df_sincos, X_scaled], axis=1)
    df_variables = df_variables[-5000:]
    for model_name, model in models.items():
        # Ejecutar cross_validate
        cv_results = cross_validate(model, df_variables, df_dir, cv=cv, scoring=scoring)
        # Calcular la media de las precisiones de validación cruzada
        mean_accuracy = cv_results['test_accuracy'].mean()
        # Guardar los resultados en el DataFrame
        temp_df = pd.DataFrame({
            'Scaler': [scaler_name],
            'Model': [model_name],
            'Accuracy': [mean_accuracy]
        })
        # Concatenar el resultado al DataFrame de resultados
        results = pd.concat([results, temp_df], ignore_index=True)
        print(scaler_name, ", ", model_name, ": ", mean_accuracy)

# Reorganizar el DataFrame para una mejor visualización
results = results.pivot(index='Scaler', columns='Model', values='Accuracy')
results.index.name = None
results.columns.name = None
results.style.highlight_max(color='green', axis=0)


MinMax Scaler ,  SVC :  0.8960000000000001
MinMax Scaler ,  Logistic Regression :  0.8234
MinMax Scaler ,  KNeighbors Classifier :  0.4838
MinMax Scaler ,  Decision Tree Classifier :  0.9960000000000001
MinMax Scaler ,  GaussianNB :  0.9256
MinMax Scaler ,  Random Forest Classifier :  0.827
MinMax Scaler ,  MLP :  0.492
MinMax Scaler ,  Gaussian Process Classifier :  0.5067999999999999
MinMax Scaler ,  Ada Boost Classifier :  0.9984
MinMax Scaler ,  Bagging Classifier :  0.49960000000000004
Standard Scaler ,  SVC :  0.9543999999999999
Standard Scaler ,  Logistic Regression :  0.9795999999999999
Standard Scaler ,  KNeighbors Classifier :  0.7830000000000001
Standard Scaler ,  Decision Tree Classifier :  1.0
Standard Scaler ,  GaussianNB :  0.9286
Standard Scaler ,  Random Forest Classifier :  0.8298
Standard Scaler ,  MLP :  0.9722
Standard Scaler ,  Gaussian Process Classifier :  0.9232000000000001
Standard Scaler ,  Ada Boost Classifier :  1.0
Standard Scaler ,  Bagging Classifier :  

Unnamed: 0,Ada Boost Classifier,Bagging Classifier,Decision Tree Classifier,Gaussian Process Classifier,GaussianNB,KNeighbors Classifier,Logistic Regression,MLP,Random Forest Classifier,SVC
MaxAbs Scaler,1.0,0.5438,1.0,0.5574,0.9264,0.5234,0.946,0.611,0.8364,0.9666
MinMax Scaler,0.9984,0.4996,0.996,0.5068,0.9256,0.4838,0.8234,0.492,0.827,0.896
PowerTransf-YeoJohnson,1.0,0.9544,1.0,0.913,0.93,0.7674,0.9536,0.9744,0.8382,0.9444
Quant-Normal,1.0,0.9604,1.0,0.9346,0.9858,0.8304,0.9712,0.9684,0.8318,0.955
Quant-Uniform,1.0,0.9684,1.0,0.9618,0.9944,0.7618,0.974,0.957,0.8584,0.9748
Robust Scaler,1.0,0.9604,1.0,0.9356,0.9288,0.8054,0.9672,0.9716,0.8372,0.958
Standard Scaler,1.0,0.955,1.0,0.9232,0.9286,0.783,0.9796,0.9722,0.8298,0.9544
