In [3]:
import pandas as pd
import numpy as np
import pywt
import matplotlib.pyplot as plt

In [4]:
# Load the raw AAPL price history. The path is relative, so this cell depends on
# the notebook's working directory containing an ./Apple/ folder.
dataframe = pd.read_csv("./Apple/AAPL.csv")

In [5]:
# Preview the first rows to confirm the column layout of the loaded file.
dataframe.head(20)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1980-12-12,0.128348,0.128906,0.128348,0.128348,0.09945,469033600
1,1980-12-15,0.12221,0.12221,0.121652,0.121652,0.094261,175884800
2,1980-12-16,0.113281,0.113281,0.112723,0.112723,0.087343,105728000
3,1980-12-17,0.115513,0.116071,0.115513,0.115513,0.089504,86441600
4,1980-12-18,0.118862,0.11942,0.118862,0.118862,0.092099,73449600
5,1980-12-19,0.126116,0.126674,0.126116,0.126116,0.09772,48630400
6,1980-12-22,0.132254,0.132813,0.132254,0.132254,0.102476,37363200
7,1980-12-23,0.137835,0.138393,0.137835,0.137835,0.1068,46950400
8,1980-12-24,0.145089,0.145647,0.145089,0.145089,0.112421,48003200
9,1980-12-26,0.158482,0.15904,0.158482,0.158482,0.122799,55574400


In [6]:
# Inspect the Volume column (dtype and length are shown in the output).
dataframe['Volume']

0        469033600
1        175884800
2        105728000
3         86441600
4         73449600
           ...    
10775     58953100
10776     90370200
10777     84267900
10778     60895800
10779    109205100
Name: Volume, Length: 10780, dtype: int64

In [7]:
# Convert the 'Date' column to pandas datetime values (overwrites the column in place).
dataframe['Date'] = pd.to_datetime(dataframe['Date'])

In [8]:
# True if any cell in any column is missing; a single boolean for the whole frame.
dataframe.isnull().values.any()

False

In [9]:
# Summary statistics for the numeric columns.
# NOTE(review): the output reports a minimum Volume of 0 — confirm whether
# zero-volume rows are valid data before using Volume as a feature.
dataframe.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,10780.0,10780.0,10780.0,10780.0,10780.0,10780.0
mean,18.955535,19.166363,18.751091,18.966825,18.235953,323297000.0
std,39.869572,40.324723,39.43975,39.900695,39.516473,336776600.0
min,0.049665,0.049665,0.049107,0.049107,0.03805,0.0
25%,0.292411,0.300223,0.284765,0.292411,0.240557,117796400.0
50%,0.502232,0.511161,0.493862,0.503348,0.413864,210957600.0
75%,18.500446,18.648571,18.301161,18.53125,16.033411,403299000.0
max,196.240005,198.229996,195.279999,196.449997,196.185074,7421641000.0


In [10]:
# Repeats the 'Date' conversion already performed in In [7]; re-applying
# pd.to_datetime to an already-converted column leaves the values unchanged.
dataframe['Date'] = pd.to_datetime(dataframe['Date'])

In [13]:
import pandas as pd
import numpy as np

# Input is the `dataframe` loaded earlier in the notebook; the alternative
# source below is kept disabled.
# dataframe = pd.read_csv('./Apple/train_dataset_Apple.csv')

# Fail fast if any column needed by the windowing step is missing.
assert all(
    name in dataframe.columns for name in ('Date', 'Adj Close', 'Volume')
), "The CSV should have 'Date', 'Adj Close', and 'Volume' columns"

# Make sure 'Date' holds datetime values before it is used for splitting.
dataframe['Date'] = pd.to_datetime(dataframe['Date'])

# Function to create the dataset with a separate Volume column
def create_dataset_with_separate_volume(dataframe, look_back, label_window):
    """Build sliding-window samples with price and volume kept as separate features.

    For every start offset ``i`` a sample covers rows ``i .. i + look_back - 1``.
    Its label compares the 'Adj Close' value ``label_window`` rows after the
    window's last row with the 'Adj Close' value on that last row.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'Date', 'Adj Close' and 'Volume' columns. Rows are consumed
        in their existing positional order (no sorting is performed here).
    look_back : int
        Number of consecutive rows in each feature window; must be >= 1.
    label_window : int
        Number of rows past the end of the window used for the label; must be >= 1.

    Returns
    -------
    tuple
        ``(price_features, volume_features, labels, dates)`` where

        * ``price_features`` is an ndarray with one row of ``look_back``
          'Adj Close' values per sample,
        * ``volume_features`` is the matching ndarray of 'Volume' values,
        * ``labels`` is an ndarray of ``'Up'`` / ``'Down'`` strings; ``'Up'``
          only when the later price is strictly greater (ties are ``'Down'``),
        * ``dates`` is a list holding the 'Date' value of each window's last row.

        When the frame has fewer than ``look_back + label_window`` rows, the
        arrays and the list are empty.

    Raises
    ------
    ValueError
        If ``look_back`` or ``label_window`` is smaller than 1. A non-positive
        ``look_back`` would otherwise index position -1 and silently read the
        last row of the frame.
    """
    if look_back < 1 or label_window < 1:
        raise ValueError(
            "look_back and label_window must both be >= 1 "
            f"(got look_back={look_back}, label_window={label_window})"
        )

    # Resolve the columns once instead of on every loop iteration.
    prices = dataframe['Adj Close'].to_numpy()
    volumes = dataframe['Volume'].to_numpy()
    date_column = dataframe['Date']

    price_features, volume_features, y, dates = [], [], [], []
    for start in range(len(dataframe) - look_back - label_window + 1):
        window_end = start + look_back      # exclusive end of the feature window
        last_in_window = window_end - 1     # row the label is measured from
        label_row = last_in_window + label_window

        price_features.append(prices[start:window_end])
        volume_features.append(volumes[start:window_end])
        # Each sample is stamped with the last date of its look-back window.
        dates.append(date_column.iloc[last_in_window])

        # Label determination (Up/Down); an unchanged price counts as 'Down'.
        y.append('Up' if prices[label_row] > prices[last_in_window] else 'Down')

    return np.array(price_features), np.array(volume_features), np.array(y), dates

# Window configuration: 60 rows of history per sample, label taken 5 rows ahead.
look_back = 60
label_window = 5

price_X, volume_X, y, date_list = create_dataset_with_separate_volume(
    dataframe, look_back, label_window
)

# Assemble one row per sample. Each window is converted to a plain Python list
# so that a whole window can live inside a single DataFrame cell.
price_features_list = price_X.tolist()
volume_features_list = volume_X.tolist()

dataset_df = pd.DataFrame(
    dict(
        Date=date_list,
        price_features=price_features_list,
        volume_features=volume_features_list,
        labels=y,
    )
)

# # Saving the dataset to a CSV file
# dataset_df.to_csv('./Apple/dataset_lookback_60_labelwindow_5_separate_volume.csv', index=False)


In [16]:
# Samples dated on or before 2018-12-31 form the training/validation split.
train_df = dataset_df.loc[dataset_df['Date'] <= '2018-12-31']

# Everything dated after that cutoff is held out for future validation.
# NOTE(review): 'Date' is the last day of each look-back window, while the label
# is read label_window rows later, so the last few training rows carry labels
# derived from post-cutoff prices — confirm this is acceptable.
future_validation_df = dataset_df.loc[dataset_df['Date'] > '2018-12-31']


In [17]:
# Display the training split (pandas truncates the middle rows in the output).
train_df

Unnamed: 0,Date,price_features,volume_features,labels
0,1981-03-10,"[0.09945, 0.094261, 0.087343, 0.089504, 0.0920...","[469033600, 175884800, 105728000, 86441600, 73...",Up
1,1981-03-11,"[0.094261, 0.087343, 0.089504, 0.092099, 0.097...","[175884800, 105728000, 86441600, 73449600, 486...",Up
2,1981-03-12,"[0.087343, 0.089504, 0.092099, 0.09772, 0.1024...","[105728000, 86441600, 73449600, 48630400, 3736...",Up
3,1981-03-13,"[0.089504, 0.092099, 0.09772, 0.102476, 0.1068...","[86441600, 73449600, 48630400, 37363200, 46950...",Up
4,1981-03-16,"[0.092099, 0.09772, 0.102476, 0.1068, 0.112421...","[73449600, 48630400, 37363200, 46950400, 48003...",Up
...,...,...,...,...
9531,2018-12-24,"[54.049706, 54.413639, 54.897301, 55.565311, 5...","[91717600, 94403200, 99152800, 114619200, 1281...",Up
9532,2018-12-26,"[54.413639, 54.897301, 55.565311, 54.588432, 5...","[94403200, 99152800, 114619200, 128168000, 134...",Down
9533,2018-12-27,"[54.897301, 55.565311, 54.588432, 53.702515, 5...","[99152800, 114619200, 128168000, 134322000, 11...",Down
9534,2018-12-28,"[55.565311, 54.588432, 53.702515, 53.578033, 5...","[114619200, 128168000, 134322000, 118655600, 1...",Down


In [19]:
# Count 'Up' vs 'Down' labels to check the class balance of the training split.
train_df['labels'].value_counts()

Up      5052
Down    4484
Name: labels, dtype: int64

In [20]:
# Save to new CSV files
# The price_features / volume_features cells hold Python lists, which to_csv
# writes out as their string form. NOTE(review): whoever reads this file back
# will presumably need to parse those columns (e.g. ast.literal_eval) — confirm.
# The output path is relative to the working directory, not the ./Apple/ folder
# the input was read from.
train_df.to_csv('train_dataset_Apple_with_volume.csv', index=False)
# future_validation_df.to_csv('future_validation_dataset_Apple.csv', index=False)