# Data Collection Testing

### Grab the api key for NASDAQ

In [2]:
import os
from dotenv import load_dotenv # if missing this module, simply run `pip install python-dotenv`

# Load the variables declared in the .env file, then look up the NASDAQ key
load_dotenv()
# Resolves to None when NASDAQ_API_KEY is not set
API_KEY = os.environ.get('NASDAQ_API_KEY')

### Import the relevant modules and packages

In [3]:
import requests

### Take a quick look at the data to see its structure

In [4]:
# Specify the URL and parameters
# Taking the data source from the data wrangling miniproject
# https://github.com/rubenren/mec-mini-projects
url = 'https://data.nasdaq.com/api/v3/datasets/FSE/AFX_X/data.json'
params = {'api_key': API_KEY}

# Create the response object; the timeout keeps a stalled connection from hanging the kernel
r = requests.get(url, params=params, timeout=30)
# Stop here on an HTTP error (e.g. a rejected API key) instead of parsing an error payload
r.raise_for_status()

dataObject = dict(r.json())

In [9]:
# Inspecting the JSON object we got
print(dataObject.keys())
print(type(dataObject['dataset_data']))
print(dataObject['dataset_data'].keys())

# Type of every field in the payload
print()
for item, value in dataObject['dataset_data'].items():
    print(item + ':', type(value))

# Probably want to look at everything other than data for now.
# Skip 'data' while walking the dict in its own order; a set difference
# (keys() - ['data']) would print the fields in an arbitrary order.
print()
for item, value in dataObject['dataset_data'].items():
    if item == 'data':
        continue
    print(item + ':', value)

dict_keys(['dataset_data'])
<class 'dict'>
dict_keys(['limit', 'transform', 'column_index', 'column_names', 'start_date', 'end_date', 'frequency', 'data', 'collapse', 'order'])

limit: <class 'NoneType'>
transform: <class 'NoneType'>
column_index: <class 'NoneType'>
column_names: <class 'list'>
start_date: <class 'str'>
end_date: <class 'str'>
frequency: <class 'str'>
data: <class 'list'>
collapse: <class 'NoneType'>
order: <class 'NoneType'>

collapse: None
limit: None
order: None
column_names: ['Date', 'Open', 'High', 'Low', 'Close', 'Change', 'Traded Volume', 'Turnover', 'Last Price of the Day', 'Daily Traded Units', 'Daily Turnover']
column_index: None
transform: None
end_date: 2020-12-01
frequency: daily
start_date: 2000-06-07


In [16]:
# Peek at the raw rows: container type, row count, header, and the first two rows
dataset = dataObject['dataset_data']
rows = dataset['data']

print(type(rows))
print(len(rows))
print(dataset['column_names'])
for row_index in (0, 1):
    print(rows[row_index])

<class 'list'>
5268
['Date', 'Open', 'High', 'Low', 'Close', 'Change', 'Traded Volume', 'Turnover', 'Last Price of the Day', 'Daily Traded Units', 'Daily Turnover']
['2020-12-01', 112.2, 112.2, 111.5, 112.0, None, 51.0, 5703.0, None, None, None]
['2020-11-30', 111.0, 113.6, 111.0, 112.1, None, 315.0, 35111.5, None, None, None]


## Save the data into a json file

Define the imports and variables first.

In [18]:
import json

# Relative path of the JSON file that the FSE/AFX_X response is written to
file_name = 'data/FSE_AFX_X_2000-2020.json'

In [20]:
# Write the data to the file.
# open() does not create missing folders, so make sure the target directory
# exists first; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(os.path.dirname(file_name) or '.', exist_ok=True)
with open(file_name, 'w') as outfile:
    json.dump(dataObject, outfile)

# Collecting Data 

### Starting out with historical stock data

I saw a blog post on algotrading101.com that recommended using the yahoo_fin library for Python.

I will consider using this library, as it makes retrieving and formatting the data orders of magnitude easier.
It seems to work by scraping the prices.

In [1]:
# need to install it first
!pip install yahoo_fin
!pip install requests_html

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting yahoo_fin
  Downloading yahoo_fin-0.8.9.1-py3-none-any.whl (10 kB)
Collecting requests-html
  Downloading requests_html-0.10.0-py3-none-any.whl (13 kB)
Collecting w3lib
  Downloading w3lib-1.22.0-py2.py3-none-any.whl (20 kB)
Collecting parse
  Downloading parse-1.19.0.tar.gz (30 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting fake-useragent
  Downloading fake-useragent-0.1.11.tar.gz (13 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pyppeteer>=0.0.14
  Downloading pyppeteer-1.0.2-py3-none-any.whl (83 kB)
     ---------------------------------------- 83.4/83.4 KB 4.9 MB/s eta 0:00:00
Collecting pyquery
  Downloading pyquery-1.4.3-py3-none-any.whl (22 kB)
Collecting websockets<11.0,>=10.0
  Downloading websockets-10.2-cp37-cp37m-win_amd64.whl (97 kB)
     -------------

In [4]:
import os
import json
import requests
import pandas
from yahoo_fin.stock_info import get_data
from dotenv import load_dotenv # Now I can grab my keys


Note that the current plan uses only each ticker's own data for evaluation. It may be worth looking into how to let the model consider either a wider range of tickers or the entire market.

In [29]:
# Symbols whose price history will be downloaded and later predicted
ticker_list = ["amzn", "aapl", "btc-usd", "spy"]

# Date window and sampling interval for the download
start_date, end_date = "1/1/2000", "12/31/2020"
interval = "1d"

# Filled in later, one entry per ticker
historical_data = dict()

# Output path for the combined JSON dump
file_name = 'data/AMZN_AAPL_BTCUSD_SPY.json'

In [11]:
# Download the price history for every ticker, using the same settings for each
fetch_options = dict(
    start_date=start_date,
    end_date=end_date,
    interval=interval,
    index_as_date=False,
)
for ticker in ticker_list:
    historical_data[ticker] = get_data(ticker, **fetch_options)

# Show the bitcoin frame as a sanity check
historical_data['btc-usd']

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker
0,2014-09-17,465.864014,468.174011,452.421997,457.334015,457.334015,21056800,BTC-USD
1,2014-09-18,456.859985,456.859985,413.104004,424.440002,424.440002,34483200,BTC-USD
2,2014-09-19,424.102997,427.834991,384.532013,394.795990,394.795990,37919700,BTC-USD
3,2014-09-20,394.673004,423.295990,389.882996,408.903992,408.903992,36863600,BTC-USD
4,2014-09-21,408.084991,412.425995,393.181000,398.821014,398.821014,26580100,BTC-USD
...,...,...,...,...,...,...,...,...
2293,2020-12-27,26439.373047,28288.839844,25922.769531,26272.294922,26272.294922,66479895605,BTC-USD
2294,2020-12-28,26280.822266,27389.111328,26207.640625,27084.808594,27084.808594,49056742893,BTC-USD
2295,2020-12-29,27081.810547,27370.720703,25987.298828,27362.437500,27362.437500,45265946774,BTC-USD
2296,2020-12-30,27360.089844,28937.740234,27360.089844,28840.953125,28840.953125,51287442704,BTC-USD


In [21]:
# Preview the first 200 characters of the index-oriented JSON serialisation
historical_data['btc-usd'].to_json(orient='index')[:200]

'{"0":{"date":1410912000000,"open":465.8640136719,"high":468.1740112305,"low":452.4219970703,"close":457.3340148926,"adjclose":457.3340148926,"volume":21056800,"ticker":"BTC-USD"},"1":{"date":141099840'

In [24]:
# Serialise the bitcoin frame to JSON text, then parse it back into plain dicts
btc_json_text = historical_data['btc-usd'].to_json(orient='index')
test_dict = json.loads(btc_json_text)

# Number of records that survived the round trip
len(test_dict)

2298

In [27]:
# Look at a single record; after the JSON round trip the row labels are string keys
test_dict['0']

{'date': 1410912000000,
 'open': 465.8640136719,
 'high': 468.1740112305,
 'low': 452.4219970703,
 'close': 457.3340148926,
 'adjclose': 457.3340148926,
 'volume': 21056800,
 'ticker': 'BTC-USD'}

In [28]:
# Plain-dict copy of every ticker's frame, ready to be written with json.dump
json_data = dict()

for ticker in ticker_list:
    # DataFrame -> JSON text -> nested dicts keyed by row label
    frame_as_text = historical_data[ticker].to_json(orient='index')
    json_data[ticker] = json.loads(frame_as_text)

In [30]:
# now for the storing part
# open() does not create missing folders, so make sure the target directory
# exists first; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(os.path.dirname(file_name) or '.', exist_ok=True)
with open(file_name, 'w') as outfile:
    json.dump(json_data, outfile)

I'm now realizing that the data I have is probably not enough if I'm doing this alone; I might want to consider a much wider scope to get more data. That might bring issues of its own, such as whether the additional data is relevant.

I just don't think 3-4 MB of data is going to cut it

### Now for the social media data

I will use the Twitter API alone at first to see how much data I can get from it; later I will examine other sites such as Reddit or Yahoo.

In [73]:
# Load the .env variables and look up the Twitter bearer token (None when unset)
load_dotenv()
bearer_token = os.environ.get("TWTTR_BEAR_TKN")

# Endpoint that the search requests are sent to
search_url = "https://api.twitter.com/2/tweets/search/recent"

# Search query plus the extra tweet fields to include in each result
query_params = {
    'query': '(bitcoin OR #BTC OR #bitcoin) -is:retweet -"sign up" -"for sale" -"No referral" -shop -click lang:en',
    'tweet.fields': 'entities',
}

def bearer_oauth(r):
    """
    Method required by bearer token authentication.

    Sets the Authorization header (built from the module-level
    ``bearer_token``) and the User-Agent header on the request ``r``,
    then returns that same request object.
    """
    r.headers.update({
        "Authorization": f"Bearer {bearer_token}",
        "User-Agent": "v2DataCollectionPython",
    })
    return r

def connect_to_endpoint(url, params, timeout=30):
    """
    Send an authenticated GET request and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Endpoint to query.
    params : dict
        Query-string parameters sent with the request.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot block the notebook indefinitely.

    Returns
    -------
    The parsed JSON payload of the response.

    Raises
    ------
    Exception
        If the status code is not 200; carries the status code and the
        response text. A timeout or connection failure surfaces as the
        corresponding ``requests`` exception.
    """
    response = requests.get(url, auth=bearer_oauth, params=params, timeout=timeout)
    print("Response code:", response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()



In [74]:
# Fetch one page of results and pretty-print it with sorted keys
json_response = connect_to_endpoint(search_url, query_params)
pretty_response = json.dumps(json_response, indent=4, sort_keys=True)
print(pretty_response)

Response code: 200
{
    "data": [
        {
            "entities": {
                "mentions": [
                    {
                        "end": 14,
                        "id": "455937214",
                        "start": 0,
                        "username": "LayahHeilpern"
                    },
                    {
                        "end": 27,
                        "id": "56562803",
                        "start": 15,
                        "username": "PeterSchiff"
                    }
                ]
            },
            "id": "1498462702162915331",
            "text": "@LayahHeilpern @PeterSchiff One more question. How many people would take their paycheck in Bitcoin?"
        },
        {
            "entities": {
                "urls": [
                    {
                        "display_url": "pic.twitter.com/LyhEKBkDSi",
                        "end": 131,
                        "expanded_url": "https://twitter.com/InteractiveZ0ne/status

In [76]:
# Number of tweets returned by the single request above
len(json_response['data'])

10

In [77]:
# Create a list to store the accumulated results
all_tweets = []

# Upper bound on the number of pages requested in one run
max_pages = 100
pages_processed = 0

# Page through a copy of the query so that query_params is never mutated;
# re-running this cell then always starts again from the first page.
page_params = dict(query_params)

for count in range(max_pages):
    json_response = connect_to_endpoint(search_url, page_params)
    pages_processed = count + 1

    # 'data' can be missing when a page carries no matching tweets
    all_tweets += json_response.get('data', [])

    # The final page has no 'next_token' entry in its meta object, so look it
    # up with .get() instead of indexing (which would raise KeyError there).
    next_token = json_response.get('meta', {}).get('next_token')
    if next_token is None:
        break
    page_params['next_token'] = next_token

# Report the number of requests actually made (the loop index is zero-based)
print(pages_processed, "pages were processed")
    

Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response code: 200
Response cod

Now for placing all the tweets in a file

In [82]:
# Destination file and the payload: every collected tweet under a single 'data' key
filename = "data/BTC_tweets.json"
json_data = {'data': all_tweets}

# open() does not create missing folders, so make sure the target directory
# exists first; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
with open(filename, 'w') as outfile:
    json.dump(json_data, outfile)

Before I begin collecting tweets for other companies, I need to solve some of my questions about this data first:

 - How much data do I need to build and evaluate an effective model?
 - I am limited in the number of tweets I can obtain, so I need to pick them carefully. How can I ensure that I have quality data (i.e. no random advertisements)?
 - Will I have a similar number of tweets for other companies, and if I do not, how will I handle that?
 - What kinds of problems in the data can I solve through model design instead of data engineering?