In [1]:
# Import the required libraries
import pandas as pd
import numpy as np
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Import data from compiled.csv created by qtdata.ipynb
df = pd.read_csv('compiled.csv', index_col=0)

In [3]:
# Check data
print(df.info())
print(df.describe())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 1296 entries, 0 to 1295
Data columns (total 40 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            1296 non-null   object 
 1   000660.KS       1296 non-null   float64
 2   005930.KS       1296 non-null   float64
 3   AMD             1296 non-null   float64
 4   AMZN            1296 non-null   float64
 5   ASML            1296 non-null   float64
 6   GOOG            1296 non-null   float64
 7   INTC            1296 non-null   float64
 8   MSFT            1296 non-null   float64
 9   MU              1296 non-null   float64
 10  NVDA            1296 non-null   float64
 11  TSM             1296 non-null   float64
 12  ^GSPC           1296 non-null   float64
 13  ^IXIC           1296 non-null   float64
 14  ^SOX            1296 non-null   float64
 15  000660.KS_chg   1295 non-null   float64
 16  005930.KS_chg   1295 non-null   float64
 17  AMD_chg         1295 non-null   float6

In [4]:
# Convert Date to datetime
df['Date'] = pd.to_datetime(df['Date'])

In [5]:
# Create list of column names
column_names = [col for col in df.columns]

# Create first batch of columns to drop
columns_drop1 = column_names[7:8] + column_names[15:37]

# Drop columns
df = df.drop(columns_drop1, axis=1)

# Print to check
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 1296 entries, 0 to 1295
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           1296 non-null   datetime64[ns]
 1   000660.KS      1296 non-null   float64       
 2   005930.KS      1296 non-null   float64       
 3   AMD            1296 non-null   float64       
 4   AMZN           1296 non-null   float64       
 5   ASML           1296 non-null   float64       
 6   GOOG           1296 non-null   float64       
 7   MSFT           1296 non-null   float64       
 8   MU             1296 non-null   float64       
 9   NVDA           1296 non-null   float64       
 10  TSM            1296 non-null   float64       
 11  ^GSPC          1296 non-null   float64       
 12  ^IXIC          1296 non-null   float64       
 13  ^SOX           1296 non-null   float64       
 14  Interest Rate  1296 non-null   float64       
 15  50D SMA        1257 non-nu

In [6]:
# Create list of column names
pvd_names = column_names[1:7] + column_names[8:15] + column_names[37:40]

# Create columns for previous day
for i, column in enumerate(pvd_names):
    df[f'{column}_pvd'] = df[column].shift(1)

# Print to check
print(df.info())
display(df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 1296 entries, 0 to 1295
Data columns (total 33 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               1296 non-null   datetime64[ns]
 1   000660.KS          1296 non-null   float64       
 2   005930.KS          1296 non-null   float64       
 3   AMD                1296 non-null   float64       
 4   AMZN               1296 non-null   float64       
 5   ASML               1296 non-null   float64       
 6   GOOG               1296 non-null   float64       
 7   MSFT               1296 non-null   float64       
 8   MU                 1296 non-null   float64       
 9   NVDA               1296 non-null   float64       
 10  TSM                1296 non-null   float64       
 11  ^GSPC              1296 non-null   float64       
 12  ^IXIC              1296 non-null   float64       
 13  ^SOX               1296 non-null   float64       
 14  Interest Rate

Unnamed: 0,Date,000660.KS,005930.KS,AMD,AMZN,ASML,GOOG,MSFT,MU,NVDA,...,MSFT_pvd,MU_pvd,NVDA_pvd,TSM_pvd,^GSPC_pvd,^IXIC_pvd,^SOX_pvd,Interest Rate_pvd,50D SMA_pvd,200D SMA_pvd
0,2019-05-10,70162.523438,37687.65625,27.959999,94.499001,189.950531,58.213501,121.014084,38.244068,41.918369,...,,,,,,,,,,
1,2019-05-13,69313.796875,37468.03125,26.24,91.134003,182.017609,56.601501,117.415924,36.711952,39.343468,...,121.014084,38.244068,41.918369,37.501423,2881.399902,7916.939941,1478.869995,2.38,43.9478,48.7636
2,2019-05-14,70351.148438,37468.03125,27.32,92.005997,189.281448,56.021999,118.729538,37.821766,40.234871,...,117.415924,36.711952,39.343468,35.890335,2811.870117,7647.02002,1408.949951,2.38,43.9837,48.6581
3,2019-05-15,70256.835938,37380.1875,27.58,93.557503,191.756927,58.210499,120.40152,37.723545,39.62405,...,118.729538,37.821766,40.234871,36.748409,2834.409912,7734.490234,1442.829956,2.38,43.9939,48.546
4,2019-05-16,67804.921875,36501.683594,28.01,95.378502,193.20015,58.949001,123.181755,36.6432,39.77552,...,120.40152,37.723545,39.62405,36.748409,2850.959961,7822.149902,1454.5,2.4,44.012,48.427


In [7]:
# Create second batch of columns to drop 
pvd_names.remove('NVDA')

# Drop second batch of columns - price columns
df = df.drop(pvd_names,axis=1)

# Print to check
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 1296 entries, 0 to 1295
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               1296 non-null   datetime64[ns]
 1   NVDA               1296 non-null   float64       
 2   000660.KS_pvd      1295 non-null   float64       
 3   005930.KS_pvd      1295 non-null   float64       
 4   AMD_pvd            1295 non-null   float64       
 5   AMZN_pvd           1295 non-null   float64       
 6   ASML_pvd           1295 non-null   float64       
 7   GOOG_pvd           1295 non-null   float64       
 8   MSFT_pvd           1295 non-null   float64       
 9   MU_pvd             1295 non-null   float64       
 10  NVDA_pvd           1295 non-null   float64       
 11  TSM_pvd            1295 non-null   float64       
 12  ^GSPC_pvd          1295 non-null   float64       
 13  ^IXIC_pvd          1295 non-null   float64       
 14  ^SOX_pvd     

In [8]:
# Forward fill nulls for SMAs
df[['50D SMA_pvd', '200D SMA_pvd']] = df[['50D SMA_pvd', '200D SMA_pvd']].ffill(axis=0)

# Print to check
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 1296 entries, 0 to 1295
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               1296 non-null   datetime64[ns]
 1   NVDA               1296 non-null   float64       
 2   000660.KS_pvd      1295 non-null   float64       
 3   005930.KS_pvd      1295 non-null   float64       
 4   AMD_pvd            1295 non-null   float64       
 5   AMZN_pvd           1295 non-null   float64       
 6   ASML_pvd           1295 non-null   float64       
 7   GOOG_pvd           1295 non-null   float64       
 8   MSFT_pvd           1295 non-null   float64       
 9   MU_pvd             1295 non-null   float64       
 10  NVDA_pvd           1295 non-null   float64       
 11  TSM_pvd            1295 non-null   float64       
 12  ^GSPC_pvd          1295 non-null   float64       
 13  ^IXIC_pvd          1295 non-null   float64       
 14  ^SOX_pvd     

In [9]:
# Drop residual nulls
df.dropna(inplace=True)

# Print to check
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 1295 entries, 1 to 1295
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               1295 non-null   datetime64[ns]
 1   NVDA               1295 non-null   float64       
 2   000660.KS_pvd      1295 non-null   float64       
 3   005930.KS_pvd      1295 non-null   float64       
 4   AMD_pvd            1295 non-null   float64       
 5   AMZN_pvd           1295 non-null   float64       
 6   ASML_pvd           1295 non-null   float64       
 7   GOOG_pvd           1295 non-null   float64       
 8   MSFT_pvd           1295 non-null   float64       
 9   MU_pvd             1295 non-null   float64       
 10  NVDA_pvd           1295 non-null   float64       
 11  TSM_pvd            1295 non-null   float64       
 12  ^GSPC_pvd          1295 non-null   float64       
 13  ^IXIC_pvd          1295 non-null   float64       
 14  ^SOX_pvd     

In [10]:
# Check data statistics before Winsorisation
print(df.describe())

                                Date         NVDA  000660.KS_pvd  \
count                           1295  1295.000000    1295.000000   
mean   2021-11-07 19:03:06.254826240   228.308272  102327.054202   
min              2019-05-13 00:00:00    33.255775   59506.125000   
25%              2020-08-10 12:00:00   112.363861   80727.453125   
50%              2021-11-09 00:00:00   165.487259   96285.210938   
75%              2023-02-04 12:00:00   274.092072  118912.234375   
max              2024-05-07 00:00:00   950.020020  188400.000000   
std                              NaN   192.968385   25087.640616   

       005930.KS_pvd      AMD_pvd     AMZN_pvd     ASML_pvd     GOOG_pvd  \
count    1295.000000  1295.000000  1295.000000  1295.000000  1295.000000   
mean    61841.672765    88.336517   133.521317   536.966141   105.119391   
min     36194.207031    26.240000    81.820000   178.710602    51.811501   
25%     52908.197266    58.920000   101.130001   355.026459    75.580250   
50%    

In [11]:
# Create list of columns to Winsorise
columns_win = ['NVDA', '50D SMA_pvd', '200D SMA_pvd',
               '000660.KS_pvd', 'AMD_pvd', 'MU_pvd',
               'NVDA_pvd']

# Apply Winsorisation
df[columns_win] = df[columns_win].apply(lambda x: winsorize(x, limits=[0, 0.03]))

# Print to check
print(df.describe())

                                Date         NVDA  000660.KS_pvd  \
count                           1295  1295.000000    1295.000000   
mean   2021-11-07 19:03:06.254826240   227.177254  101966.162913   
min              2019-05-13 00:00:00    33.255775   59506.125000   
25%              2020-08-10 12:00:00   112.363861   80727.453125   
50%              2021-11-09 00:00:00   165.487259   96285.210938   
75%              2023-02-04 12:00:00   274.092072  118912.234375   
max              2024-05-07 00:00:00   852.330017  164626.984375   
std                              NaN   189.109033   24051.443582   

       005930.KS_pvd      AMD_pvd     AMZN_pvd     ASML_pvd     GOOG_pvd  \
count    1295.000000  1295.000000  1295.000000  1295.000000  1295.000000   
mean    61841.672765    87.990286   133.521317   536.966141   105.119391   
min     36194.207031    26.240000    81.820000   178.710602    51.811501   
25%     52908.197266    58.920000   101.130001   355.026459    75.580250   
50%    

In [12]:
# Create list of column names without Date
updcolumn_names = [col for col in df.columns]
columns_nodate = updcolumn_names[1:18]

# Create new DataFrame excluding Date column
nodate_df = df.drop('Date', axis=1)

# Create new DataFrame with only Date column
date_df = df['Date']

# Set MinMax scaling range between 0 and 1 
scaler = MinMaxScaler(feature_range=(0, 1)) 

# Apply MinMax scaling
scaled_data = scaler.fit_transform(nodate_df)

# Convert scaled data back to DataFrame
scaled_df = pd.DataFrame(scaled_data, columns=columns_nodate)

In [13]:
# Reset indexes
date_df = date_df.reset_index(drop=True)
scaled_df = scaled_df.reset_index(drop=True)

# Print to check
print(date_df.info())
print(scaled_df.info())

<class 'pandas.core.series.Series'>
RangeIndex: 1295 entries, 0 to 1294
Series name: Date
Non-Null Count  Dtype         
--------------  -----         
1295 non-null   datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 10.2 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1295 entries, 0 to 1294
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   NVDA               1295 non-null   float64
 1   000660.KS_pvd      1295 non-null   float64
 2   005930.KS_pvd      1295 non-null   float64
 3   AMD_pvd            1295 non-null   float64
 4   AMZN_pvd           1295 non-null   float64
 5   ASML_pvd           1295 non-null   float64
 6   GOOG_pvd           1295 non-null   float64
 7   MSFT_pvd           1295 non-null   float64
 8   MU_pvd             1295 non-null   float64
 9   NVDA_pvd           1295 non-null   float64
 10  TSM_pvd            1295 non-null   float64
 11  ^GSPC_pvd          1295 non-

In [14]:
# Combine date_df with scaled_df
final_df = pd.concat([date_df, scaled_df], axis=1)

# Print to check
print(final_df.info())
print(final_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1295 entries, 0 to 1294
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               1295 non-null   datetime64[ns]
 1   NVDA               1295 non-null   float64       
 2   000660.KS_pvd      1295 non-null   float64       
 3   005930.KS_pvd      1295 non-null   float64       
 4   AMD_pvd            1295 non-null   float64       
 5   AMZN_pvd           1295 non-null   float64       
 6   ASML_pvd           1295 non-null   float64       
 7   GOOG_pvd           1295 non-null   float64       
 8   MSFT_pvd           1295 non-null   float64       
 9   MU_pvd             1295 non-null   float64       
 10  NVDA_pvd           1295 non-null   float64       
 11  TSM_pvd            1295 non-null   float64       
 12  ^GSPC_pvd          1295 non-null   float64       
 13  ^IXIC_pvd          1295 non-null   float64       
 14  ^SOX_pvd

In [15]:
# Drop nulls
final_df.dropna(inplace=True)

# Print to check
print(final_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1295 entries, 0 to 1294
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               1295 non-null   datetime64[ns]
 1   NVDA               1295 non-null   float64       
 2   000660.KS_pvd      1295 non-null   float64       
 3   005930.KS_pvd      1295 non-null   float64       
 4   AMD_pvd            1295 non-null   float64       
 5   AMZN_pvd           1295 non-null   float64       
 6   ASML_pvd           1295 non-null   float64       
 7   GOOG_pvd           1295 non-null   float64       
 8   MSFT_pvd           1295 non-null   float64       
 9   MU_pvd             1295 non-null   float64       
 10  NVDA_pvd           1295 non-null   float64       
 11  TSM_pvd            1295 non-null   float64       
 12  ^GSPC_pvd          1295 non-null   float64       
 13  ^IXIC_pvd          1295 non-null   float64       
 14  ^SOX_pvd

In [16]:
# Export df to csv
# final_df.to_csv('processed.csv')

In [17]:
# Import new data
ndata_df = pd.read_csv('newdata.csv', index_col=0)
print(ndata_df.info())
print(ndata_df)
print(ndata_df.describe())

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, 2 to 7
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Date               6 non-null      object 
 1   NVDA               6 non-null      float64
 2   000660.KS_pvd      6 non-null      float64
 3   005930.KS_pvd      6 non-null      float64
 4   AMD_pvd            6 non-null      float64
 5   AMZN_pvd           6 non-null      float64
 6   ASML_pvd           6 non-null      float64
 7   GOOG_pvd           6 non-null      float64
 8   MSFT_pvd           6 non-null      float64
 9   MU_pvd             6 non-null      float64
 10  NVDA_pvd           6 non-null      float64
 11  TSM_pvd            6 non-null      float64
 12  ^GSPC_pvd          6 non-null      float64
 13  ^IXIC_pvd          6 non-null      float64
 14  ^SOX_pvd           6 non-null      float64
 15  Interest Rate_pvd  6 non-null      float64
 16  50D SMA_pvd        6 non-null      

In [18]:
# Create list of column names
ndata_column_names = [col for col in ndata_df.columns]

# Create list of columns without date
ndata_columns_nodate = ndata_column_names[1:18]

In [19]:
# Create new DataFrame excluding Date column
ndata_df_nodate = ndata_df.drop('Date', axis=1)

# Create new DataFrame with only Date column
ndata_df_date = ndata_df['Date']

# Print to check
print(ndata_df_date.info())
print(ndata_df_nodate.info())

<class 'pandas.core.series.Series'>
Index: 6 entries, 2 to 7
Series name: Date
Non-Null Count  Dtype 
--------------  ----- 
6 non-null      object
dtypes: object(1)
memory usage: 96.0+ bytes
None
<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, 2 to 7
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   NVDA               6 non-null      float64
 1   000660.KS_pvd      6 non-null      float64
 2   005930.KS_pvd      6 non-null      float64
 3   AMD_pvd            6 non-null      float64
 4   AMZN_pvd           6 non-null      float64
 5   ASML_pvd           6 non-null      float64
 6   GOOG_pvd           6 non-null      float64
 7   MSFT_pvd           6 non-null      float64
 8   MU_pvd             6 non-null      float64
 9   NVDA_pvd           6 non-null      float64
 10  TSM_pvd            6 non-null      float64
 11  ^GSPC_pvd          6 non-null      float64
 12  ^IXIC_pvd          6 non-null 

In [20]:
# Scale new data
nscaled_data = scaler.transform(ndata_df_nodate)

# Convert scaled data back to DataFrame
nscaled_df = pd.DataFrame(nscaled_data, columns=ndata_columns_nodate)

In [21]:
# Reset indexes
ndata_df_date = ndata_df_date.reset_index(drop=True)
nscaled_df = nscaled_df.reset_index(drop=True)

# Print to check
print(ndata_df_date.info())
print(nscaled_df.info())

<class 'pandas.core.series.Series'>
RangeIndex: 6 entries, 0 to 5
Series name: Date
Non-Null Count  Dtype 
--------------  ----- 
6 non-null      object
dtypes: object(1)
memory usage: 180.0+ bytes
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   NVDA               6 non-null      float64
 1   000660.KS_pvd      6 non-null      float64
 2   005930.KS_pvd      6 non-null      float64
 3   AMD_pvd            6 non-null      float64
 4   AMZN_pvd           6 non-null      float64
 5   ASML_pvd           6 non-null      float64
 6   GOOG_pvd           6 non-null      float64
 7   MSFT_pvd           6 non-null      float64
 8   MU_pvd             6 non-null      float64
 9   NVDA_pvd           6 non-null      float64
 10  TSM_pvd            6 non-null      float64
 11  ^GSPC_pvd          6 non-null      float64
 12  ^IXIC_pvd          

In [22]:
# Combine ndata_df_date with nscaled_df
ndata_final_df = pd.concat([ndata_df_date, nscaled_df], axis=1)

# Print to check
print(ndata_final_df.info())
print(ndata_final_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Date               6 non-null      object 
 1   NVDA               6 non-null      float64
 2   000660.KS_pvd      6 non-null      float64
 3   005930.KS_pvd      6 non-null      float64
 4   AMD_pvd            6 non-null      float64
 5   AMZN_pvd           6 non-null      float64
 6   ASML_pvd           6 non-null      float64
 7   GOOG_pvd           6 non-null      float64
 8   MSFT_pvd           6 non-null      float64
 9   MU_pvd             6 non-null      float64
 10  NVDA_pvd           6 non-null      float64
 11  TSM_pvd            6 non-null      float64
 12  ^GSPC_pvd          6 non-null      float64
 13  ^IXIC_pvd          6 non-null      float64
 14  ^SOX_pvd           6 non-null      float64
 15  Interest Rate_pvd  6 non-null      float64
 16  50D SMA_pvd        6 non-null 

In [24]:
# Export df to csv
ndata_final_df.to_csv('processed_new.csv')