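***CATEGORICAL AND CONTINUOUS DATA***

This notebook downloads daily AAPL prices with yfinance, derives numeric and label columns from them, and then separates the feature columns into categorical and continuous groups.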

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
%matplotlib inline

import warnings 
warnings.filterwarnings('ignore')

In [3]:
import yfinance as yf

In [5]:
# Download daily price history for a single ticker and flatten the column index.
stock = 'AAPL'
start = '2004-01-01'
end = '2024-01-01'

# auto_adjust is passed explicitly because its default has differed between
# yfinance releases; True matches the adjusted prices (no 'Adj Close' column)
# that this notebook was recorded with.
dataset = yf.download(stock, start=start, end=end, auto_adjust=True)

# An empty frame would otherwise flow through every later cell without any error.
if dataset.empty:
    raise ValueError(f"yfinance returned no rows for {stock} between {start} and {end}")

# Keep only the first column level so columns are addressed as dataset['Close'], etc.
dataset.columns = dataset.columns.get_level_values(0)
dataset.head()

[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-01-02,0.319777,0.326839,0.318274,0.323834,144642400
2004-01-05,0.333151,0.336456,0.32188,0.32188,395018400
2004-01-06,0.331948,0.336907,0.326239,0.334353,509348000
2004-01-07,0.339462,0.343069,0.329544,0.332099,586874400
2004-01-08,0.351033,0.356593,0.340363,0.343219,460303200


In [7]:
# Derive relative price ranges, next-day direction flags, and daily returns.
# Each flag is 1 when the NEXT row's value is strictly greater than the current one.
# NOTE(review): the last row has no next row, so shift(-1) yields NaN there, the
# comparison is False, and all three flags come out as 0 for that row.
dataset = dataset.assign(
    # (Open - Close) as a fraction of Open.
    Open_Close=lambda df: (df['Open'] - df['Close']) / df['Open'],
    # (High - Low) as a fraction of Low.
    High_Low=lambda df: (df['High'] - df['Low']) / df['Low'],
    Increase_Decrease=lambda df: np.where(df['Volume'].shift(-1).gt(df['Volume']), 1, 0),
    Buy_Sell_on_Open=lambda df: np.where(df['Open'].shift(-1).gt(df['Open']), 1, 0),
    Buy_Sell=lambda df: np.where(df['Close'].shift(-1).gt(df['Close']), 1, 0),
    # Close-to-close percentage change; NaN on the first row (no previous close).
    Returns=lambda df: df['Close'].pct_change(),
).dropna()  # removes rows containing NaN, e.g. the first row's Returns
dataset.head()

Price,Close,High,Low,Open,Volume,Open_Close,High_Low,Increase_Decrease,Buy_Sell_on_Open,Buy_Sell,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2004-01-05,0.333151,0.336456,0.32188,0.32188,395018400,-0.035014,0.045284,1,1,0,0.041823
2004-01-06,0.331948,0.336907,0.326239,0.334353,509348000,0.007191,0.032702,1,0,1,-0.003609
2004-01-07,0.339462,0.343069,0.329544,0.332099,586874400,-0.022172,0.041041,0,1,1,0.022636
2004-01-08,0.351033,0.356593,0.340363,0.343219,460303200,-0.022768,0.047683,0,1,0,0.034086
2004-01-09,0.345623,0.362604,0.342467,0.349079,427459200,0.009901,0.058799,1,1,1,-0.015412


In [8]:
# Text labels for next-day direction: the first label applies when the next row's
# value is strictly greater than the current one, the second label otherwise
# (including the last row, which has no next value to compare against).
label_specs = {
    'Increase_Decrease_L': ('Volume', 'Increase', 'Decrease'),
    'Buy_Sell_on_Open_L': ('Open', 'Buy', 'Sell'),
    'Buy_Sell_L': ('Close', 'Buy', 'Sell'),
}
for label_col, (source_col, up_label, down_label) in label_specs.items():
    rises_next = dataset[source_col].shift(-1) > dataset[source_col]
    dataset[label_col] = np.where(rises_next, up_label, down_label)
dataset.head()

Price,Close,High,Low,Open,Volume,Open_Close,High_Low,Increase_Decrease,Buy_Sell_on_Open,Buy_Sell,Returns,Increase_Decrease_L,Buy_Sell_on_Open_L,Buy_Sell_L
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-01-05,0.333151,0.336456,0.32188,0.32188,395018400,-0.035014,0.045284,1,1,0,0.041823,Increase,Buy,Sell
2004-01-06,0.331948,0.336907,0.326239,0.334353,509348000,0.007191,0.032702,1,0,1,-0.003609,Increase,Sell,Buy
2004-01-07,0.339462,0.343069,0.329544,0.332099,586874400,-0.022172,0.041041,0,1,1,0.022636,Decrease,Buy,Buy
2004-01-08,0.351033,0.356593,0.340363,0.343219,460303200,-0.022768,0.047683,0,1,0,0.034086,Decrease,Buy,Sell
2004-01-09,0.345623,0.362604,0.342467,0.349079,427459200,0.009901,0.058799,1,1,1,-0.015412,Increase,Buy,Buy


In [9]:
# Print the index range, per-column dtype and non-null count, and memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5032 entries, 2004-01-05 to 2023-12-29
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Close                5032 non-null   float64
 1   High                 5032 non-null   float64
 2   Low                  5032 non-null   float64
 3   Open                 5032 non-null   float64
 4   Volume               5032 non-null   int64  
 5   Open_Close           5032 non-null   float64
 6   High_Low             5032 non-null   float64
 7   Increase_Decrease    5032 non-null   int64  
 8   Buy_Sell_on_Open     5032 non-null   int64  
 9   Buy_Sell             5032 non-null   int64  
 10  Returns              5032 non-null   float64
 11  Increase_Decrease_L  5032 non-null   object 
 12  Buy_Sell_on_Open_L   5032 non-null   object 
 13  Buy_Sell_L           5032 non-null   object 
dtypes: float64(7), int64(4), object(3)
memory usage: 589.7+ KB


In [10]:
# Separate the integer target column from all remaining columns.
# NOTE(review): 'Buy_Sell_L' is built from the same next-close comparison as
# 'Buy_Sell' and remains in data_feat; if data_feat is later used to fit a model
# this leaks the target — TODO confirm whether it should be dropped here too.
data_label = dataset['Buy_Sell']
data_feat = dataset.drop(columns='Buy_Sell')

In [11]:
# First, take a look at every feature column name.
data_feat.columns.tolist()

['Close',
 'High',
 'Low',
 'Open',
 'Volume',
 'Open_Close',
 'High_Low',
 'Increase_Decrease',
 'Buy_Sell_on_Open',
 'Returns',
 'Increase_Decrease_L',
 'Buy_Sell_on_Open_L',
 'Buy_Sell_L']

In [14]:
# Categorical features: object-dtype columns, plus any column holding exactly two
# distinct values (NaN, if present, counts as a value).
categorical_columns = [
    name
    for name, column in data_feat.items()
    if column.dtype == 'O' or column.nunique(dropna=False) == 2
]

In [15]:
# Continuous features: numeric, non-boolean columns with more than two distinct
# values (NaN, if present, counts as a value).
# The dtype is tested with is_numeric_dtype rather than by matching the names
# 'int64'/'float64', so other numeric widths (e.g. int32, float32) are not
# silently left out of both lists.
continuous_columns = [
    col
    for col in data_feat.columns
    if pd.api.types.is_numeric_dtype(data_feat[col])
    and not pd.api.types.is_bool_dtype(data_feat[col])
    and data_feat[col].nunique(dropna=False) > 2
]

In [16]:
# Print both column groups, separated by a 50-character dashed rule.
sections = [
    ("Categorical Variables:", "categorical columns : ", categorical_columns),
    ("Continuous Variables:", "continuous columns : ", continuous_columns),
]
for position, (heading, prefix, names) in enumerate(sections):
    if position:
        print('-'*50)
    print(heading)
    print(prefix, names)

Categorical Variables:
categorical columns :  ['Increase_Decrease', 'Buy_Sell_on_Open', 'Increase_Decrease_L', 'Buy_Sell_on_Open_L', 'Buy_Sell_L']
--------------------------------------------------
Continuous Variables:
continuous columns :  ['Close', 'High', 'Low', 'Open', 'Volume', 'Open_Close', 'High_Low', 'Returns']
