In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
start_date = '2014-03-01'
end_date = '2025-03-10'
data_path = os.path.join('..', 'data', f'nvidia_data_{start_date}_{end_date}')
nvda_df = pd.read_csv(data_path)

In [5]:
class DataProcessor(object):
    def __init__(self):
        
        self.keep_cols = ['Date', 'Close_NVDA', 'High_NVDA', 'Low_NVDA','Open_NVDA', 'Volume_NVDA',
       'Close_SMH',  'Close_^GSPC', 'Volume_SMH', 'Volume_^GSPC', 'rsi', 'macd', 'macd_diff', 'stoch_k',
       'stoch_d', 'Month', 'Dayofweek', 'Year']
        self.skewed_cols = ['Volume_NVDA', 'Volume_SMH',
       'Volume_^GSPC', 'Close_SMH']
        
    def preprocess_data_for_training(self, df):
        '''
        Aplica transformaciónes matematicas a los datos y selecciona algunas columnas definidas en el notebook 03
        '''
        df = df.copy()
    
        df = df[self.keep_cols]
        df.dropna() #Botar los datos 
        
        #Transformar las columnas
        for col in self.skewed_cols:
            df[col] = np.log1p(df[col])
        #Aplicar logaritmica con signo
        df['macd'] = np.sign(df['macd']) * np.log1p(np.abs(df['macd']))


        return df

    def create_target(self, df):
        '''
        Función que crea el target que es precio de nvidia al dia siguente de los datos de cierre que tenemos
        '''
        df = df.copy() 
        df['target'] = df['Close_NVDA'].shift(-1)
        #df['target'] = np.expm1(df['target'])

        df.dropna(inplace=True) #Elimina el ultimo dato para el training pues no tiene target disponible

        return df
        
        
        

In [6]:
data_processor = DataProcessor()
data_process = data_processor.preprocess_data_for_training(nvda_df)

In [7]:
data_process.skew(numeric_only= True) #Para nuevos procesos de entrenamiento revisar esto

Close_NVDA      2.359190
High_NVDA       2.359441
Low_NVDA        2.362037
Open_NVDA       2.364269
Volume_NVDA     0.140079
Close_SMH       0.163365
Close_^GSPC     0.630853
Volume_SMH     -0.170950
Volume_^GSPC    0.383649
rsi            -0.088880
macd            0.971241
macd_diff      -0.324383
stoch_k        -0.449128
stoch_d        -0.448524
Month          -0.011926
Dayofweek      -0.011044
Year            0.004534
dtype: float64

In [8]:
data_w_target = data_processor.create_target(data_process)

In [9]:
data_w_target.head()

Unnamed: 0,Date,Close_NVDA,High_NVDA,Low_NVDA,Open_NVDA,Volume_NVDA,Close_SMH,Close_^GSPC,Volume_SMH,Volume_^GSPC,rsi,macd,macd_diff,stoch_k,stoch_d,Month,Dayofweek,Year,target
33,2014-04-17,0.439606,0.443633,0.435343,0.43629,18.84942,3.041551,1864.849976,15.51109,21.929665,53.730401,0.00132,0.000213,54.744368,53.964396,4,3,2014,0.443159
34,2014-04-21,0.443159,0.444343,0.437001,0.439843,18.577719,3.046816,1871.890015,14.96798,21.694991,56.305772,0.001754,0.000517,63.846261,57.217143,4,0,2014,0.446949
35,2014-04-22,0.446949,0.449791,0.441501,0.443869,19.357245,3.053514,1879.550049,14.835441,21.89123,58.931403,0.002375,0.000912,76.153434,64.914687,4,1,2014,0.452159
36,2014-04-23,0.452159,0.453344,0.446949,0.447659,19.382251,3.05038,1875.390015,14.640826,21.850051,62.287244,0.003248,0.00143,93.076427,77.69204,4,2,2014,0.456186
37,2014-04-24,0.456186,0.460923,0.455239,0.455239,19.785476,3.057471,1878.609985,15.112045,21.88386,64.688526,0.004215,0.001921,87.341788,85.523883,4,3,2014,0.443633


In [10]:
ABT_path = os.path.join('..', 'data', 'ABTs', 'principal_ABT.csv')
data_w_target.to_csv(ABT_path, index= 0)

In [11]:
data_w_target.shape

(2738, 19)