In [1]:
pip install yfinance --upgrade --no-cache-dir

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install alpha-vantage

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Import relevant libraries
import numpy as np
import pandas as pd
import yfinance as yf
import requests
from datetime import datetime, timedelta
from scipy.stats.mstats import winsorize

In [4]:
# Set tickers
ticker_target = 'NVDA'
ticker_comp = 'AMD 005930.KS' # AMD, Samsung
ticker_supp = 'TSM ASML 000660.KS MU' # TSMC, ASML, SK Hynix
ticker_other = 'AMZN MSFT GOOG' # Amazon, Microsoft, Google
ticker_mkt = '^IXIC ^GSPC ^SOX'# Indices - Nasdaq composite, S&P500, PHLX Semiconductor
ticker_all  = " ".join([ticker_target, ticker_comp, ticker_supp, ticker_other, ticker_mkt])

# Print to check 
print(ticker_all)

NVDA AMD 005930.KS TSM ASML 000660.KS MU AMZN MSFT GOOG ^IXIC ^GSPC ^SOX


In [6]:
# Set end date
end = datetime(2019, 5, 10).date()
end_date = end.strftime('%Y-%m-%d')

# Set start date
timeframe = 365 * 5 # Number of days 
start = end - timedelta(days=timeframe)
start_date = start.strftime('%Y-%m-%d')

# Print to check
print(end_date)
print(start_date)

2019-05-10
2014-05-11


In [7]:
# Load stock data into a DataFrame
df = yf.download(ticker_all, start=start_date, end=end_date)

# Print to check
print(df.tail())
print(df.info())

[*********************100%%**********************]  13 of 13 completed

Price          Adj Close                                                  \
Ticker         000660.KS     005930.KS        AMD       AMZN        ASML   
Date                                                                       
2019-05-03  75820.804688  39796.058594  28.219999  98.123001  199.450897   
2019-05-06           NaN           NaN  27.420000  97.527496  196.096161   
2019-05-07  75349.289062  39400.734375  26.660000  96.050003  191.814316   
2019-05-08  75726.484375  38873.632812  27.090000  95.888496  192.521561   
2019-05-09  71671.390625  37292.335938  27.209999  94.993500  190.189484   

Price                                                               ...  \
Ticker           GOOG        MSFT         MU       NVDA        TSM  ...   
Date                                                                ...   
2019-05-03  59.270000  122.698921  42.555614  45.441769  39.147530  ...   
2019-05-06  59.469501  121.985001  41.377071  44.657127  38.315704  ...   
2019-05-07  58.7




In [8]:
# Create new DataFrame with only close prices
new_df = df['Adj Close']

# Print to check
print(new_df.tail())
print(new_df.info())

Ticker         000660.KS     005930.KS        AMD       AMZN        ASML  \
Date                                                                       
2019-05-03  75820.804688  39796.058594  28.219999  98.123001  199.450897   
2019-05-06           NaN           NaN  27.420000  97.527496  196.096161   
2019-05-07  75349.289062  39400.734375  26.660000  96.050003  191.814316   
2019-05-08  75726.484375  38873.632812  27.090000  95.888496  192.521561   
2019-05-09  71671.390625  37292.335938  27.209999  94.993500  190.189484   

Ticker           GOOG        MSFT         MU       NVDA        TSM  \
Date                                                                 
2019-05-03  59.270000  122.698921  42.555614  45.441769  39.147530   
2019-05-06  59.469501  121.985001  41.377071  44.657127  38.315704   
2019-05-07  58.705002  119.481522  39.540474  42.983585  38.315704   
2019-05-08  58.313499  119.472023  39.039589  43.184715  38.263172   
2019-05-09  58.118999  119.462486  38.568180  4

In [9]:
# Set url to retrieve data
url_ffr = 'https://www.alphavantage.co/query?function=FEDERAL_FUNDS_RATE&interval=daily&apikey=WTH3FILIUYV3ONU8'

# Retrieve data
r = requests.get(url_ffr)
ffr_data = r.json()

In [10]:
# Check keys
print(ffr_data.keys())

dict_keys(['name', 'interval', 'unit', 'data'])


In [11]:
# Convert json into a DataFrame
ffr_df = pd.DataFrame(ffr_data['data'])

# Print to check
print(ffr_df)
print(ffr_df.info())

             date  value
0      2024-05-13   5.33
1      2024-05-12   5.33
2      2024-05-11   5.33
3      2024-05-10   5.33
4      2024-05-09   5.33
...           ...    ...
25515  1954-07-05   0.88
25516  1954-07-04  1.250
25517  1954-07-03  1.250
25518  1954-07-02  1.250
25519  1954-07-01   1.13

[25520 rows x 2 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25520 entries, 0 to 25519
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    25520 non-null  object
 1   value   25520 non-null  object
dtypes: object(2)
memory usage: 398.9+ KB
None


In [12]:
# Convert Date to datetime format
ffr_df['date'] = pd.to_datetime(ffr_df['date'])

# Rename columns
ffr_df.rename(columns={'date':'Date','value':'Interest Rate'},inplace=True)

# Print to check
print(ffr_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25520 entries, 0 to 25519
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           25520 non-null  datetime64[ns]
 1   Interest Rate  25520 non-null  object        
dtypes: datetime64[ns](1), object(1)
memory usage: 398.9+ KB
None


In [13]:
# Merge Interest Rate column into new_df
new_df = pd.merge(new_df, ffr_df, on='Date', how='left')

# Print to check
print(new_df.info())
print(new_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1295 entries, 0 to 1294
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           1295 non-null   datetime64[ns]
 1   000660.KS      1224 non-null   float64       
 2   005930.KS      1224 non-null   float64       
 3   AMD            1258 non-null   float64       
 4   AMZN           1258 non-null   float64       
 5   ASML           1258 non-null   float64       
 6   GOOG           1258 non-null   float64       
 7   MSFT           1258 non-null   float64       
 8   MU             1258 non-null   float64       
 9   NVDA           1258 non-null   float64       
 10  TSM            1258 non-null   float64       
 11  ^GSPC          1258 non-null   float64       
 12  ^IXIC          1258 non-null   float64       
 13  ^SOX           1258 non-null   float64       
 14  Interest Rate  1295 non-null   object        
dtypes: datetime64[ns](1),

In [14]:
# Set url to retrieve data
url_sma50 = f'https://www.alphavantage.co/query?function=SMA&symbol=NVDA&interval=daily&time_period=50&series_type=open&apikey=WTH3FILIUYV3ONU8'

#Retrieve data
r = requests.get(url_sma50)
sma50_data = r.json()

In [15]:
# Check keys
print(sma50_data.keys())

dict_keys(['Meta Data', 'Technical Analysis: SMA'])


In [16]:
# Convert json into a DataFrame
sma50_df = pd.DataFrame(sma50_data['Technical Analysis: SMA'])

# Print to check
print(sma50_df)
print(sma50_df.info())

    2024-05-15 2024-05-14 2024-05-13 2024-05-10 2024-05-09 2024-05-08  \
SMA   882.6617   881.2157   880.1211   878.0248   875.7819   873.1994   

    2024-05-07 2024-05-06 2024-05-03 2024-05-02  ... 2000-01-25 2000-01-24  \
SMA   871.1784   868.8980   867.1773   864.6238  ...     0.7754     0.7699   

    2000-01-21 2000-01-20 2000-01-19 2000-01-18 2000-01-14 2000-01-13  \
SMA     0.7632     0.7570     0.7500     0.7439     0.7375     0.7311   

    2000-01-12 2000-01-11  
SMA     0.7248     0.7173  

[1 rows x 6125 columns]
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, SMA to SMA
Columns: 6125 entries, 2024-05-15 to 2000-01-11
dtypes: object(6125)
memory usage: 47.9+ KB
None


In [17]:
# Melt sma50_df
sma50_df = pd.melt(sma50_df)

# Print to check
print(sma50_df)
print(sma50_df.info())

        variable     value
0     2024-05-15  882.6617
1     2024-05-14  881.2157
2     2024-05-13  880.1211
3     2024-05-10  878.0248
4     2024-05-09  875.7819
...          ...       ...
6120  2000-01-18    0.7439
6121  2000-01-14    0.7375
6122  2000-01-13    0.7311
6123  2000-01-12    0.7248
6124  2000-01-11    0.7173

[6125 rows x 2 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6125 entries, 0 to 6124
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   variable  6125 non-null   object
 1   value     6125 non-null   object
dtypes: object(2)
memory usage: 95.8+ KB
None


In [18]:
# Rename columns
sma50_df.rename(columns={'variable':'Date','value':'50D SMA'},inplace=True)

# Convert Date to datetime format
sma50_df['Date'] = pd.to_datetime(sma50_df['Date'])

# Print to check
print(sma50_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6125 entries, 0 to 6124
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     6125 non-null   datetime64[ns]
 1   50D SMA  6125 non-null   object        
dtypes: datetime64[ns](1), object(1)
memory usage: 95.8+ KB
None


In [19]:
# Set url to retrieve data
url_sma200 = f'https://www.alphavantage.co/query?function=SMA&symbol=NVDA&interval=daily&time_period=200&series_type=open&apikey=WTH3FILIUYV3ONU8'

# Retrieve data
r = requests.get(url_sma200)
sma200_data = r.json()

In [20]:
# Convert json into a DataFrame
sma200_df = pd.DataFrame(sma200_data['Technical Analysis: SMA'])

# Print to check
print(sma200_df)
print(sma200_df.info())

    2024-05-15 2024-05-14 2024-05-13 2024-05-10 2024-05-09 2024-05-08  \
SMA   606.6404   604.3526   602.2055   600.0071   597.7924   595.5125   

    2024-05-07 2024-05-06 2024-05-03 2024-05-02  ... 2000-08-28 2000-08-25  \
SMA   593.2744   591.0084   588.8638   586.8470  ...     1.6549     1.6432   

    2000-08-24 2000-08-23 2000-08-22 2000-08-21 2000-08-18 2000-08-17  \
SMA     1.6314     1.6200     1.6093     1.5980     1.5854     1.5737   

    2000-08-16 2000-08-15  
SMA     1.5622     1.5500  

[1 rows x 5975 columns]
<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, SMA to SMA
Columns: 5975 entries, 2024-05-15 to 2000-08-15
dtypes: object(5975)
memory usage: 46.7+ KB
None


In [21]:
# Melt sma200_df
sma200_df = pd.melt(sma200_df)

# Print to check
print(sma200_df)
print(sma200_df.info())

        variable     value
0     2024-05-15  606.6404
1     2024-05-14  604.3526
2     2024-05-13  602.2055
3     2024-05-10  600.0071
4     2024-05-09  597.7924
...          ...       ...
5970  2000-08-21    1.5980
5971  2000-08-18    1.5854
5972  2000-08-17    1.5737
5973  2000-08-16    1.5622
5974  2000-08-15    1.5500

[5975 rows x 2 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5975 entries, 0 to 5974
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   variable  5975 non-null   object
 1   value     5975 non-null   object
dtypes: object(2)
memory usage: 93.5+ KB
None


In [22]:
# Rename columns
sma200_df.rename(columns={'variable':'Date','value':'200D SMA'},inplace=True)

# Convert Date to datetime format
sma200_df['Date'] = pd.to_datetime(sma200_df['Date'])

# Print to check
print(sma200_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5975 entries, 0 to 5974
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Date      5975 non-null   datetime64[ns]
 1   200D SMA  5975 non-null   object        
dtypes: datetime64[ns](1), object(1)
memory usage: 93.5+ KB
None


In [23]:
# Merge 50d SMA and 200d SMA columns into new_df
new_df = pd.merge(new_df, sma50_df, on='Date', how='left')
new_df = pd.merge(new_df, sma200_df, on='Date', how='left')

# Print to check
print(new_df.info())
print(new_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1295 entries, 0 to 1294
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           1295 non-null   datetime64[ns]
 1   000660.KS      1224 non-null   float64       
 2   005930.KS      1224 non-null   float64       
 3   AMD            1258 non-null   float64       
 4   AMZN           1258 non-null   float64       
 5   ASML           1258 non-null   float64       
 6   GOOG           1258 non-null   float64       
 7   MSFT           1258 non-null   float64       
 8   MU             1258 non-null   float64       
 9   NVDA           1258 non-null   float64       
 10  TSM            1258 non-null   float64       
 11  ^GSPC          1258 non-null   float64       
 12  ^IXIC          1258 non-null   float64       
 13  ^SOX           1258 non-null   float64       
 14  Interest Rate  1295 non-null   object        
 15  50D SMA        1258 n

In [26]:
# Forward fill price and SMA columns
new_df = new_df.ffill(axis=0)

In [27]:
# Print to check
print(new_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1295 entries, 0 to 1294
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           1295 non-null   datetime64[ns]
 1   000660.KS      1295 non-null   float64       
 2   005930.KS      1295 non-null   float64       
 3   AMD            1295 non-null   float64       
 4   AMZN           1295 non-null   float64       
 5   ASML           1295 non-null   float64       
 6   GOOG           1295 non-null   float64       
 7   MSFT           1295 non-null   float64       
 8   MU             1295 non-null   float64       
 9   NVDA           1295 non-null   float64       
 10  TSM            1295 non-null   float64       
 11  ^GSPC          1295 non-null   float64       
 12  ^IXIC          1295 non-null   float64       
 13  ^SOX           1295 non-null   float64       
 14  Interest Rate  1295 non-null   object        
 15  50D SMA        1295 n

In [28]:
# Create list of column names without date
column_names = list(new_df.select_dtypes(include=['float64', 'object']))

# Create columns for previous day
for i, column in enumerate(column_names):
    new_df[f'{column}_pvd'] = new_df[column].shift(1)

# Print to check
print(new_df.info())
display(new_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1295 entries, 0 to 1294
Data columns (total 33 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               1295 non-null   datetime64[ns]
 1   000660.KS          1295 non-null   float64       
 2   005930.KS          1295 non-null   float64       
 3   AMD                1295 non-null   float64       
 4   AMZN               1295 non-null   float64       
 5   ASML               1295 non-null   float64       
 6   GOOG               1295 non-null   float64       
 7   MSFT               1295 non-null   float64       
 8   MU                 1295 non-null   float64       
 9   NVDA               1295 non-null   float64       
 10  TSM                1295 non-null   float64       
 11  ^GSPC              1295 non-null   float64       
 12  ^IXIC              1295 non-null   float64       
 13  ^SOX               1295 non-null   float64       
 14  Interest

Unnamed: 0,Date,000660.KS,005930.KS,AMD,AMZN,ASML,GOOG,MSFT,MU,NVDA,...,MSFT_pvd,MU_pvd,NVDA_pvd,TSM_pvd,^GSPC_pvd,^IXIC_pvd,^SOX_pvd,Interest Rate_pvd,50D SMA_pvd,200D SMA_pvd
0,2014-05-12,36227.230469,21956.066406,3.97,15.143,74.723244,26.423454,33.846798,27.018345,4.401841,...,,,,,,,,,,
1,2014-05-13,36489.75,22145.890625,4.03,15.232,73.877426,26.58152,34.469318,27.136204,4.330766,...,33.846798,27.018345,4.401841,15.387698,1896.650024,4143.859863,586.47998,0.08,4.3683,3.8441
2,2014-05-14,36839.769531,22383.169922,3.98,14.881,73.659134,26.260403,34.315819,26.379965,4.288121,...,34.469318,27.136204,4.330766,15.42506,1897.449951,4130.169922,581.880005,0.09,4.37,3.8494
3,2014-05-15,36752.265625,22304.076172,3.96,14.7595,73.377213,25.927814,33.770039,25.397835,4.26443,...,34.315819,26.379965,4.288121,15.380223,1888.530029,4100.629883,576.799988,0.08,4.3696,3.8545
4,2014-05-16,36971.019531,22588.814453,4.02,14.885,73.731903,25.960226,33.966183,25.564798,4.254953,...,33.770039,25.397835,4.26443,15.440014,1870.849976,4069.290039,570.0,0.09,4.3677,3.8593


In [29]:
# Create list of columns to drop 
column_names.remove('NVDA')

# Drop columns 
new_df = new_df.drop(column_names,axis=1)

# Print to check
print(new_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1295 entries, 0 to 1294
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               1295 non-null   datetime64[ns]
 1   NVDA               1295 non-null   float64       
 2   000660.KS_pvd      1294 non-null   float64       
 3   005930.KS_pvd      1294 non-null   float64       
 4   AMD_pvd            1294 non-null   float64       
 5   AMZN_pvd           1294 non-null   float64       
 6   ASML_pvd           1294 non-null   float64       
 7   GOOG_pvd           1294 non-null   float64       
 8   MSFT_pvd           1294 non-null   float64       
 9   MU_pvd             1294 non-null   float64       
 10  NVDA_pvd           1294 non-null   float64       
 11  TSM_pvd            1294 non-null   float64       
 12  ^GSPC_pvd          1294 non-null   float64       
 13  ^IXIC_pvd          1294 non-null   float64       
 14  ^SOX_pvd

In [30]:
# Drop nulls 
new_df = new_df.dropna()

# Print to check
print(new_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 1294 entries, 1 to 1294
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               1294 non-null   datetime64[ns]
 1   NVDA               1294 non-null   float64       
 2   000660.KS_pvd      1294 non-null   float64       
 3   005930.KS_pvd      1294 non-null   float64       
 4   AMD_pvd            1294 non-null   float64       
 5   AMZN_pvd           1294 non-null   float64       
 6   ASML_pvd           1294 non-null   float64       
 7   GOOG_pvd           1294 non-null   float64       
 8   MSFT_pvd           1294 non-null   float64       
 9   MU_pvd             1294 non-null   float64       
 10  NVDA_pvd           1294 non-null   float64       
 11  TSM_pvd            1294 non-null   float64       
 12  ^GSPC_pvd          1294 non-null   float64       
 13  ^IXIC_pvd          1294 non-null   float64       
 14  ^SOX_pvd     

In [31]:
# Check data statistics before Winsorisation
print(new_df.describe())

                                Date         NVDA  000660.KS_pvd  \
count                           1294  1294.000000    1294.000000   
mean   2016-11-09 02:38:01.298299648    26.173167   49387.879573   
min              2014-05-13 00:00:00     4.013763   23050.371094   
25%              2015-08-12 06:00:00     5.486200   35978.044922   
50%              2016-11-09 12:00:00    17.717702   42553.990234   
75%              2018-02-07 18:00:00    44.246037   67032.117188   
max              2019-05-09 00:00:00    71.702995   87629.101562   
std                              NaN    21.290309   17993.582321   

       005930.KS_pvd      AMD_pvd     AMZN_pvd     ASML_pvd     GOOG_pvd  \
count    1294.000000  1294.000000  1294.000000  1294.000000  1294.000000   
mean    29993.246168     9.605951    46.023384   123.553482    41.442461   
min     17137.892578     1.620000    14.347500    70.987434    24.560070   
25%     20906.735840     2.760000    25.194000    90.091259    31.122125   
50%    

In [32]:
# Create list of columns to Winsorise
columns_win = ['NVDA', '50D SMA_pvd', '200D SMA_pvd',
               '000660.KS_pvd', 'AMD_pvd', 'MU_pvd',
               'NVDA_pvd']

# Apply Winsorisation
new_df[columns_win] = new_df[columns_win].apply(lambda x: winsorize(x, limits=[0, 0.03]))

# Print to check
print(new_df.describe())

                                Date         NVDA  000660.KS_pvd  \
count                           1294  1294.000000    1294.000000   
mean   2016-11-09 02:38:01.298299648    26.111841   49316.112638   
min              2014-05-13 00:00:00     4.013763   23050.371094   
25%              2015-08-12 06:00:00     5.486200   35978.044922   
50%              2016-11-09 12:00:00    17.717702   42553.990234   
75%              2018-02-07 18:00:00    44.246037   67032.117188   
max              2019-05-09 00:00:00    65.593605   80548.898438   
std                              NaN    21.171284   17859.992322   

       005930.KS_pvd      AMD_pvd     AMZN_pvd     ASML_pvd     GOOG_pvd  \
count    1294.000000  1294.000000  1294.000000  1294.000000  1294.000000   
mean    29993.246168     9.548029    46.023384   123.553482    41.442461   
min     17137.892578     1.620000    14.347500    70.987434    24.560070   
25%     20906.735840     2.760000    25.194000    90.091259    31.122125   
50%    

In [33]:
# Export new_df to csv
new_df.to_csv('pastdata.csv')