## Preamble

In [9]:
import numpy as np
import os
import requests
from ediblepickle import pickle
from bs4 import BeautifulSoup
import pandas as pd
import csv
import sys
import subprocess
from urllib.parse import quote
from retrying import retry
from time import sleep
from ediblepickle import checkpoint

In [2]:
# Alpha Vantage API key used to build the request URL in load_data().
# Prefer the ALPHAVANTAGE_API_KEY environment variable so the credential does not
# have to live in the notebook; the literal is kept only as a backward-compatible
# fallback.
# TODO(review): this key is committed in plain text -- rotate it and drop the fallback.
API_key = os.environ.get("ALPHAVANTAGE_API_KEY") or "MY2P7WF6CWPVBE7O"

In [3]:
# Directory (relative to the working directory) passed to @checkpoint as work_dir.
cache_dir = 'cache'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(cache_dir, exist_ok=True)

In [4]:
@checkpoint(key=lambda args, kwargs: quote(args[0]+'_'+args[1]) + '.pkl', work_dir=cache_dir)
@retry(stop_max_attempt_number=10, wait_fixed=15000)
def load_data(symbol,month):
    """Download one slice of 1-minute intraday history for ``symbol`` from Alpha Vantage.

    Parameters
    ----------
    symbol : str
        Ticker symbol, e.g. ``"INTC"``.
    month : str
        Slice name sent as the ``slice`` query parameter, e.g. ``"year1month1"``.

    Returns
    -------
    pandas.DataFrame
        One row per CSV line after the header; column names come from the header
        line. Every value is a string because the payload is parsed with
        ``csv.reader``.

    Raises
    ------
    IOError
        If the response has fewer than 10 lines, which is treated as a failed
        download (presumably an error or rate-limit message instead of data).
    requests.RequestException
        On a network failure, timeout or non-2xx HTTP status.

    Notes
    -----
    * ``@retry`` now pauses 15 s between attempts and gives up after 10 attempts,
      re-raising the last error. The bare ``@retry`` used before retried forever
      with no pause, so a symbol with no data never returned.
    * The ``@checkpoint`` key is built from ``args[0]`` and ``args[1]``, so both
      arguments must be passed positionally.
    """
    # Let requests build and escape the query string instead of concatenating it by hand.
    query = {
        'function': 'TIME_SERIES_INTRADAY_EXTENDED',
        'symbol': symbol,
        'interval': '1min',
        'slice': month,
        'apikey': API_key,
    }
    with requests.Session() as s:
        # A timeout keeps a stalled connection from blocking the notebook indefinitely.
        download = s.get('https://www.alphavantage.co/query', params=query, timeout=60)
        download.raise_for_status()
        decoded_content = download.content.decode('utf-8')

    data = list(csv.reader(decoded_content.splitlines(), delimiter=','))
    # Validate before building the frame: a short or empty payload would otherwise
    # fail inside the DataFrame constructor with an unrelated error.
    if len(data) < 10:
        raise IOError("Failed attempt")
    return pd.DataFrame(data[1:],columns=data[0])

In [5]:
# Read the snp500_list.csv table and keep its Symbol column as the ticker lookup.
snp500_csv = '/Users/josht/Documents/GitHub/erdos_twitter_project/data/Stock_indices/snp500_list.csv'
df = pd.read_csv(snp500_csv)
tickers = df['Symbol']

## Data Extraction

In [14]:
tickers[250: 376]

250    INTC
251     ICE
252     IBM
253      IP
254     IPG
       ... 
371     PFE
372      PM
373     PSX
374     PNW
375     PXD
Name: Symbol, Length: 126, dtype: object

Let's try aggregating the first one: `INTC`

In [36]:
df = load_data(tickers[250], "year1month1")

In [37]:
# Fetch the remaining slices for tickers[250] and append them to `df`, which already
# holds year1month1. range(3-j, 13) starts year 1 at month 2 (month 1 is already
# loaded) and year 2 at month 1.
slice_names = [f'year{j}month{k}' for j in [1,2] for k in range(3-j,13)]
# Concatenate once: re-concatenating inside the loop copies the growing frame on
# every iteration.
df = pd.concat(
    [df] + [load_data(tickers[250], string) for string in slice_names],
    ignore_index=True,
)

In [38]:
df.to_csv("/Users/josht/Documents/GitHub/erdos_twitter_project/Stock_data_get/Stock_data_250_375/" + tickers[250] + ".csv")


Now, let's try on tickers 251 - 260

In [11]:
# Download all 24 monthly slices for tickers 251..260 and write one CSV per ticker.
for i in range(251, 261):
    # Collect the slices in a list and concatenate once per ticker; concatenating
    # inside the loop re-copies the growing frame on every iteration.
    frames = [load_data(tickers[i], "year1month1")]
    for j in [1,2]:
        # range(3-j, 13): year 1 starts at month 2 (month 1 is loaded above),
        # year 2 starts at month 1.
        for k in range(3-j,13):
            string = f'year{j}month{k}'
            frames.append(load_data(tickers[i], string))
            # Progress message every fourth month (4, 8, 12).
            if k % 4 == 0:
                print(tickers[i] + " " + string + " finished")
    df = pd.concat(frames, ignore_index=True)
    df.to_csv("/Users/josht/Documents/GitHub/erdos_twitter_project/Stock_data_get/Stock_data_250_375/" + tickers[i] + ".csv")
    
    

ICE year1month4 finished
ICE year1month8 finished
ICE year1month12 finished
ICE year2month4 finished
ICE year2month8 finished
ICE year2month12 finished
IBM year1month4 finished
IBM year1month8 finished
IBM year1month12 finished
IBM year2month4 finished
IBM year2month8 finished
IBM year2month12 finished
IP year1month4 finished
IP year1month8 finished
IP year1month12 finished
IP year2month4 finished
IP year2month8 finished
IP year2month12 finished
IPG year1month4 finished
IPG year1month8 finished
IPG year1month12 finished
IPG year2month4 finished
IPG year2month8 finished
IPG year2month12 finished
IFF year1month4 finished
IFF year1month8 finished
IFF year1month12 finished
IFF year2month4 finished
IFF year2month8 finished
IFF year2month12 finished
INTU year1month4 finished
INTU year1month8 finished
INTU year1month12 finished
INTU year2month4 finished
INTU year2month8 finished
INTU year2month12 finished
ISRG year1month4 finished
ISRG year1month8 finished
ISRG year1month12 finished
ISRG year

They're huge. Let's turn them into parquet files. 

In [12]:
# Convert the per-ticker CSV files for tickers 250..260 to parquet.
for i in range(250, 261):
    base_path = "/Users/josht/Documents/GitHub/erdos_twitter_project/Stock_data_get/Stock_data_250_375/" + tickers[i]
    df = pd.read_csv(base_path + ".csv")
    # The CSVs were written with their row index, which read_csv brings back as an
    # "Unnamed: 0" column. Drop it so it does not end up in the parquet file.
    df = df.loc[:, ~df.columns.str.startswith("Unnamed:")]
    df.to_parquet(base_path + ".parquet")
    

Now, let's do the rest, also saving the tables as parquet files. 

In [7]:
# Download all 24 monthly slices for tickers 360..374 and write one parquet file per
# ticker; `ii` is the 1-based progress counter shown in the log line.
for ii, i in enumerate(range(360, 375), start=1):
    # year1month1 first, then year 1 months 2..12 and year 2 months 1..12
    # (range(3-j, 13) skips the month already loaded).
    slice_names = ["year1month1"] + [f'year{j}month{k}' for j in [1,2] for k in range(3-j,13)]
    # Concatenate once per ticker rather than re-copying the growing frame per slice.
    df = pd.concat([load_data(tickers[i], string) for string in slice_names], ignore_index=True)
    df.to_parquet("/Users/josht/Documents/Stock_data_250_375/" + tickers[i] + ".parquet")
    print(str(ii) + ". " + tickers[i] + " finished")


1. PCAR finished
2. PKG finished
3. PH finished
4. PAYX finished
5. PAYC finished
6. PYPL finished
7. PENN finished
8. PNR finished
9. PBCT finished
10. PEP finished
11. PKI finished
12. PFE finished
13. PM finished
14. PSX finished
15. PNW finished


In [6]:
tickers[358]

'OGN'

In [6]:
tickers[359]

'OTIS'

In [8]:
tickers[374]

'PNW'

In [20]:
Problematic_indices = [[270, "KSU"], [358, "OGN"], [359, "OTIS"]]
print(Problematic_indices)

[[270, 'KSU'], [358, 'OGN'], [359, 'OTIS']]


## Alternative Extraction

In [16]:
df = pd.read_csv("/Users/josht/Downloads/OTIS1.csv")

In [17]:
# Append OTIS2.csv .. OTIS20.csv to the frame already read from OTIS1.csv.
# Concatenate once: re-concatenating inside a loop copies the growing frame each time.
df = pd.concat(
    [df] + [pd.read_csv("/Users/josht/Downloads/OTIS" + str(i) + ".csv") for i in range(2, 21)],
    ignore_index = True,
)

In [18]:
df

Unnamed: 0,time,open,high,low,close,volume
0,2021-11-19 16:03:00,86.260000,86.260000,86.260000,86.260000,15603
1,2021-11-19 16:02:00,86.260000,86.260000,86.260000,86.260000,189
2,2021-11-19 16:00:00,86.130000,86.280000,86.120000,86.250000,85685
3,2021-11-19 15:59:00,86.145000,86.150000,86.110000,86.130000,21897
4,2021-11-19 15:58:00,86.100000,86.190000,86.075000,86.145000,22086
...,...,...,...,...,...,...
161178,2020-04-03 08:38:00,42.663714,42.663714,42.663714,42.663714,175
161179,2020-04-03 08:29:00,42.174452,42.174452,42.174452,42.174452,389
161180,2020-04-03 08:22:00,41.518840,41.518840,41.518840,41.518840,400
161181,2020-04-03 08:15:00,42.017888,42.017888,42.017888,42.017888,100


Could it be that Alpha Vantage only has the data for this index back to Sep 18, 2020? The index has existed for a longer time, but after several tries to extract it manually, nothing comes out.

In [19]:
df.to_parquet("/Users/josht/Documents/Stock_data_250_375/OTIS.parquet")

## Scratch Work

In [2]:
# Alpha Vantage API key for the scratch cells below.
# Prefer the ALPHAVANTAGE_API_KEY environment variable so the credential does not
# have to live in the notebook; the literal is kept only as a backward-compatible
# fallback.
# TODO(review): this key is committed in plain text -- rotate it and drop the fallback.
API_key = os.environ.get("ALPHAVANTAGE_API_KEY") or "CSBIGB32V8U4NLBV"

In [5]:
# Absolute path of the current working directory. os.getcwd() replaces spawning a
# `pwd` subprocess: no child process, and it also works where `pwd` is not an
# executable (e.g. Windows).
local_path = os.getcwd()

In [6]:
local_path

'/Users/josht/Documents/GitHub/erdos_twitter_project/Stock_data_get'

In [3]:
# Directory (relative to the working directory) passed to @checkpoint as work_dir.
cache_dir = 'cache'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(cache_dir, exist_ok=True)

In [4]:
os.path.exists(cache_dir)

True

In [5]:
@checkpoint(key=lambda args, kwargs: quote(args[0]+'_'+args[1]) + '.pkl', work_dir=cache_dir)
@retry(stop_max_attempt_number=10, wait_fixed=15000)
def load_data(symbol,month):
    """Download one slice of 1-minute intraday history for ``symbol`` from Alpha Vantage.

    NOTE(review): this repeats the ``load_data`` definition from the Preamble
    section; whichever cell runs last replaces the other.

    Parameters
    ----------
    symbol : str
        Ticker symbol, e.g. ``"INTC"``.
    month : str
        Slice name sent as the ``slice`` query parameter, e.g. ``"year1month1"``.

    Returns
    -------
    pandas.DataFrame
        One row per CSV line after the header; column names come from the header
        line. Every value is a string because the payload is parsed with
        ``csv.reader``.

    Raises
    ------
    IOError
        If the response has fewer than 10 lines, which is treated as a failed
        download (presumably an error or rate-limit message instead of data).
    requests.RequestException
        On a network failure, timeout or non-2xx HTTP status.

    Notes
    -----
    * ``@retry`` now pauses 15 s between attempts and gives up after 10 attempts,
      re-raising the last error. The bare ``@retry`` used before retried forever
      with no pause, so a symbol with no data never returned.
    * The ``@checkpoint`` key is built from ``args[0]`` and ``args[1]``, so both
      arguments must be passed positionally.
    """
    # Let requests build and escape the query string instead of concatenating it by hand.
    query = {
        'function': 'TIME_SERIES_INTRADAY_EXTENDED',
        'symbol': symbol,
        'interval': '1min',
        'slice': month,
        'apikey': API_key,
    }
    with requests.Session() as s:
        # A timeout keeps a stalled connection from blocking the notebook indefinitely.
        download = s.get('https://www.alphavantage.co/query', params=query, timeout=60)
        download.raise_for_status()
        decoded_content = download.content.decode('utf-8')

    data = list(csv.reader(decoded_content.splitlines(), delimiter=','))
    # Validate before building the frame: a short or empty payload would otherwise
    # fail inside the DataFrame constructor with an unrelated error.
    if len(data) < 10:
        raise IOError("Failed attempt")
    return pd.DataFrame(data[1:],columns=data[0])

In [6]:
# Read the snp500_list.csv table and keep its Symbol column as the ticker lookup.
snp500_csv = '/Users/josht/Documents/GitHub/erdos_twitter_project/data/Stock_indices/snp500_list.csv'
df = pd.read_csv(snp500_csv)
tickers = df['Symbol']

In [14]:
tickers[250]

'INTC'

In [15]:
directory = os.fsencode(local_path+'/cache/')

In [16]:
directory

b'/Users/josht/Documents/GitHub/erdos_twitter_project/Stock_data_get/cache/'

In [17]:
def aggregate(symbol):
    """Merge every cached slice of ``symbol`` into ``<local_path>/cache/<symbol>.csv``.

    Reads ``<symbol>_year1month2.pkl`` first, then every other
    ``<symbol>_*.pkl`` file found in ``directory``, concatenates them, writes the
    result as CSV (without the index) and finally deletes the merged pickles.

    Parameters
    ----------
    symbol : str
        Ticker symbol whose cached slices should be merged.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If ``<symbol>_year1month2.pkl`` is missing from the cache directory; in
        that case nothing is written or deleted.

    Notes
    -----
    Relies on the module-level ``local_path`` and ``directory`` variables.
    ``pd.read_pickle`` must only be used on cache files produced by this
    notebook -- unpickling untrusted files can execute arbitrary code.
    """
    cache_path = os.path.join(local_path, 'cache')
    seed_name = symbol + '_year1month2.pkl'

    # Load the seed slice first; if it is missing we fail before touching anything.
    frames = [pd.read_pickle(os.path.join(cache_path, seed_name))]
    merged_files = [seed_name]

    # sorted() makes the merge order deterministic (os.listdir order is arbitrary).
    for file in sorted(os.listdir(directory)):
        filename = os.fsdecode(file)
        if not (filename.startswith(f'{symbol}_') and filename.endswith('.pkl')):
            continue
        if '_year1month2.pkl' in filename:
            continue  # already loaded as the seed; skip to avoid double counting
        frames.append(pd.read_pickle(os.path.join(cache_path, filename)))
        merged_files.append(filename)

    # Write the CSV once, after all slices are collected (previously it was
    # rewritten on every loop iteration).
    pd.concat(frames).to_csv(os.path.join(cache_path, f'{symbol}.csv'), index=False)

    # Delete the pickles only after the CSV has been written, so a failure above
    # never loses cached data. The seed path previously lacked the '/' between
    # local_path and 'cache', so its removal always failed.
    for filename in merged_files:
        os.remove(os.path.join(cache_path, filename))
    return None

In [18]:
os.listdir(directory)

[]

In [19]:
pd.read_pickle(local_path+'/cache/INTC_year1month2.pkl')

FileNotFoundError: [Errno 2] No such file or directory: '/Users/josht/Documents/GitHub/erdos_twitter_project/Stock_data_get/cache/INTC_year1month2.pkl'

In [22]:
# For each selected ticker: request all 24 monthly slices (pausing one second
# between requests), then merge the results with aggregate().
for ticker in tickers[:1]:  #set the ticker range here
    all_slices = [f'year{j}month{k}' for j in [1,2] for k in range(1,13)]
    for string in all_slices:
        load_data(ticker,string)
        sleep(1)

    aggregate(ticker)

KeyboardInterrupt: 

In [23]:
sleep(1)

In [24]:
# Sanity check: print the 24 slice names that would be requested for each ticker.
for ticker in tickers[0:1]:  #set the ticker range here
    for string in [f'year{j}month{k}' for j in [1,2] for k in range(1,13)]:
        print(string)

year1month1
year1month2
year1month3
year1month4
year1month5
year1month6
year1month7
year1month8
year1month9
year1month10
year1month11
year1month12
year2month1
year2month2
year2month3
year2month4
year2month5
year2month6
year2month7
year2month8
year2month9
year2month10
year2month11
year2month12


In [26]:
type(string)

str

In [27]:
df_test = load_data("TSLA", "year1month1")

In [28]:
df_test

Unnamed: 0,time,open,high,low,close,volume
0,2021-11-17 20:00:00,1087.75,1088.0,1087.6,1087.9999,2581
1,2021-11-17 19:59:00,1087.45,1087.96,1087.45,1087.6001,1407
2,2021-11-17 19:58:00,1087.0,1087.47,1087.0,1087.47,1160
3,2021-11-17 19:55:00,1086.79,1086.79,1086.79,1086.79,762
4,2021-11-17 19:54:00,1087.35,1087.35,1087.35,1087.35,675
...,...,...,...,...,...,...
17465,2021-10-19 04:05:00,875.7,876.0,875.5,875.5,2210
17466,2021-10-19 04:04:00,875.61,876.0,875.61,876.0,1764
17467,2021-10-19 04:03:00,875.8,875.8,875.8,875.8,1602
17468,2021-10-19 04:02:00,875.7,875.7,875.7,875.7,1110


In [30]:
df_test.to_csv("/Users/josht/Documents/GitHub/erdos_twitter_project/Stock_data_get/tesla_test.csv")