<a href="https://colab.research.google.com/github/thevipulsharma/bondai/blob/master/test/Test_Dataset_Framework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Test_Dataset_Framework

### Data
1. Company Financials
2. Company Stock Prices

In [0]:
# DB Files for S&P 500 Companies
from google.colab import drive

# Mount Google Drive so the /content/gdrive/... paths below exist on a fresh
# runtime. Colab asks for authorization the first time this runs.
drive.mount("/content/gdrive")

# Files to process:
#   do_file   - tickers still waiting to be downloaded (one per line)
#   done_file - tickers that have already been downloaded (one per line)
do_file = "/content/gdrive/My Drive/Colab Notebooks/Data/Bondai/Test/do_file.txt"
done_file = "/content/gdrive/My Drive/Colab Notebooks/Data/Bondai/Test/done_file.txt"

In [0]:
# Reading the files
import pandas as pd
def _read_tickers(path):
    """Load a one-ticker-per-line text file into a single-column DataFrame.

    Parameters
    ----------
    path : str or file-like
        Location of the ticker list (no header row).

    Returns
    -------
    pandas.DataFrame
        A frame with one string column named "Tickers". An empty file
        yields an empty frame instead of raising.
    """
    try:
        # dtype=str + keep_default_na=False keep symbols such as "NA" or
        # "TRUE" as plain text instead of being parsed as NaN / booleans.
        return pd.read_csv(
            path,
            header=None,
            names=["Tickers"],
            dtype=str,
            keep_default_na=False,
        )
    except pd.errors.EmptyDataError:
        # A list with nothing in it (e.g. no ticker processed yet, or none
        # left to process) is a legitimate state, not an error.
        return pd.DataFrame(columns=["Tickers"])


# Reuse the paths declared in the setup cell rather than repeating the literals.
do_df = _read_tickers(do_file)
done_df = _read_tickers(done_file)

In [0]:
# Collapse each ticker column into a set: duplicates disappear and the
# download loop can add/discard symbols cheaply.
pending_tickers = do_df["Tickers"].tolist()
finished_tickers = done_df["Tickers"].tolist()
do_set = set(pending_tickers)
done_set = set(finished_tickers)

In [0]:
# URL Paths for Stockrow Website

# A download URL is assembled as: company prefix + <ticker> + annual[<report>].
stockrow_url_paths = {"company": "https://stockrow.com/api/companies/"}
stockrow_url_paths["annual"] = dict(
    [
        ("income-statement", "/financials.xlsx?dimension=MRY&section=Income%20Statement&sort=desc"),
        ("balance-sheet", "/financials.xlsx?dimension=MRY&section=Balance%20Sheet&sort=desc"),
        ("cashflow-statement", "/financials.xlsx?dimension=MRY&section=Cash%20Flow&sort=desc"),
        ("metrics", "/financials.xlsx?dimension=MRY&section=Metrics&sort=desc"),
        ("growth", "/financials.xlsx?dimension=MRY&section=Growth&sort=desc"),
    ]
)

In [0]:
# Stockrow Downloader
import requests
def stockrow_download(ticker):
    """Download the five annual Stockrow workbooks for one company.

    Parameters
    ----------
    ticker : str
        Company symbol, inserted into the URL between the company prefix
        and the per-report query suffix from ``stockrow_url_paths``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(income_statement, balance_sheet, cashflow_statement, metrics,
        growth)``, in that order.
    """
    company_url = stockrow_url_paths['company'] + ticker
    annual_paths = stockrow_url_paths['annual']
    # Order matters: it fixes the order of the returned tuple.
    reports = (
        "income-statement",
        "balance-sheet",
        "cashflow-statement",
        "metrics",
        "growth",
    )
    # No explicit engine: the endpoint serves .xlsx, which xlrd >= 2.0 can no
    # longer read. pandas picks a suitable reader on its own when engine is
    # omitted (and older pandas defaulted to xlrd anyway).
    return tuple(pd.read_excel(company_url + annual_paths[report]) for report in reports)

In [0]:
# Modified "Get Yahoo Quotes" script, originally by Brad Lucas

# Attribution metadata carried over from the original script.
__author__ = "Brad Lucas"  # spelling aligned with __copyright__ / __maintainer__
__copyright__ = "Copyright 2017, Brad Lucas"
__license__ = "MIT"
__version__ = "1.0.0"
__maintainer__ = "Brad Lucas"
__email__ = "brad@beaconhill.com"
__status__ = "Production"

import re
import sys
import time
import datetime
# import requests


def split_crumb_store(v):
    """Return the third colon-separated field of ``v`` without surrounding double quotes."""
    fields = v.split(':')
    quoted_crumb = fields[2]
    return quoted_crumb.strip('"')


def find_crumb_store(lines):
    """Return the first line containing ``CrumbStore``.

    The target looks like:  ,"CrumbStore":{"crumb":"9q.A4D1c.b9
    When no line matches, a notice is printed and None is returned.
    """
    crumb_line = next((line for line in lines if 'CrumbStore' in line), None)
    if crumb_line is None:
        print("Did not find CrumbStore")
    return crumb_line


def get_cookie_value(r):
    """Build a one-entry cookie dict holding the ``B`` cookie of response ``r``."""
    b_cookie = r.cookies['B']
    return dict(B=b_cookie)


def get_page_data(symbol):
    """Fetch the Yahoo Finance quote page for ``symbol``.

    Returns a ``(cookie, lines)`` pair: the cookie dict built by
    ``get_cookie_value`` and the page text cut into a list of fragments.
    """
    page_url = "https://finance.yahoo.com/quote/%s/?p=%s" % (symbol, symbol)
    response = requests.get(page_url)
    session_cookie = get_cookie_value(response)

    # Decoding with 'unicode-escape' turns sequences such as \u002F into the
    # real character, e.g.  "crumb":"FWP\u002F5EFll3U"  ->  FWP/5EFll3U
    page_text = response.content.decode('unicode-escape').strip()
    # Every '}' becomes a line break, so the page splits into short fragments.
    fragments = page_text.replace('}', '\n').split('\n')
    return session_cookie, fragments


def get_cookie_crumb(symbol):
    """Return the ``(cookie, crumb)`` pair scraped from the quote page of ``symbol``."""
    cookie, page_lines = get_page_data(symbol)
    crumb_line = find_crumb_store(page_lines)
    return cookie, split_crumb_store(crumb_line)


def get_data(symbol, start_date, end_date, cookie, crumb):
    """Send the Yahoo Finance history-download request and return the raw response.

    ``start_date`` and ``end_date`` fill the ``period1`` / ``period2`` query
    fields, ``crumb`` fills the ``crumb`` field, and ``cookie`` is sent as the
    request cookies. The response is handed back untouched; nothing is
    written to disk here.
    """
    url_fields = (symbol, start_date, end_date, crumb)
    download_url = "https://query1.finance.yahoo.com/v7/finance/download/%s?period1=%s&period2=%s&interval=1d&events=history&crumb=%s" % url_fields
    return requests.get(download_url, cookies=cookie)


def get_now_epoch():
    """Return the current time as whole seconds since the epoch."""
    # @see https://www.linuxquestions.org/questions/programming-9/python-datetime-to-epoch-4175520007/#post5244109
    seconds_now = time.time()
    return int(seconds_now)


def download_quotes(symbol):
    """Download the price history of ``symbol`` as a DataFrame.

    The request covers epoch second 0 up to the current time.

    Parameters
    ----------
    symbol : str
        Ticker to download.

    Returns
    -------
    pandas.DataFrame
        The CSV body of the download response, parsed by ``pandas.read_csv``.

    Raises
    ------
    requests.HTTPError
        If the download endpoint answers with an error status.
    """
    # Imported locally so this function does not rely on a later cell having
    # already executed ``import io``.
    import io

    start_date = 0
    end_date = get_now_epoch()
    cookie, crumb = get_cookie_crumb(symbol)
    response = get_data(symbol, start_date, end_date, cookie, crumb)
    # Fail loudly on 4xx/5xx rather than parsing an error body as price data.
    response.raise_for_status()
    return pd.read_csv(io.StringIO(response.content.decode('utf-8')))

In [0]:
import os
import io
def main():
    """Download and store data for every ticker still pending in ``do_set``.

    For each ticker, the five Stockrow tables and the price history are
    written as separate sheets of ``<ticker>.xlsx``. The ticker then moves
    from ``do_set`` to ``done_set`` and both list files are rewritten, so
    an interrupted run can resume where it stopped.
    """
    output_dir = '/content/gdrive/My Drive/Colab Notebooks/Data/Bondai/Test/'

    # sorted() takes a snapshot, so do_set can be mutated inside the loop,
    # and the processing order is reproducible.
    for ticker in sorted(do_set):
        print("Downloading data for: " + ticker)
        income_statement, balance_sheet, cashflow_statement, metrics, growth = stockrow_download(ticker)
        historical_prices = download_quotes(ticker)
        with pd.ExcelWriter(output_dir + ticker + '.xlsx') as writer:
            historical_prices.to_excel(writer, sheet_name="historical_prices")
            balance_sheet.to_excel(writer, sheet_name="balance_sheet")
            income_statement.to_excel(writer, sheet_name="income_statement")
            cashflow_statement.to_excel(writer, sheet_name="cashflow_statement")
            metrics.to_excel(writer, sheet_name="metrics")
            growth.to_excel(writer, sheet_name="growth")

        done_set.add(ticker)
        do_set.discard(ticker)
        # Checkpoint: the finished list goes to done_file and the remaining
        # list to do_file. (Both writes used to target do_file.txt and
        # referenced the undefined names done_list / do_list.)
        pd.DataFrame(sorted(done_set)).to_csv(done_file, header=False, index=False)
        pd.DataFrame(sorted(do_set)).to_csv(do_file, header=False, index=False)

In [89]:
# Run the download loop over every ticker currently in do_set.
main()

Downloading data for: AAPL




Downloading data for: IBM
Downloading data for: MSFT


In [0]:
# Fetch the five annual Stockrow tables for AAPL so they can be inspected interactively.
income_statement, balance_sheet, cashflow_statement, metrics, growth = stockrow_download("AAPL")

In [57]:
# df = pd.ExcelFile("https://stockrow.com/api/companies/AAPL/financials.xlsx?dimension=MRQ&section=Balance%20Sheet&sort=desc", engine="xlrd")
# pd.read_excel(df)
import io
# download_quotes already returns the parsed DataFrame, so there is no
# response body left to decode here.
historical_prices = download_quotes("AAPL")
df = historical_prices



Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1980-12-12,0.513393,0.515625,0.513393,0.513393,0.023007,117258400.0
1,1980-12-15,0.488839,0.488839,0.486607,0.486607,0.021807,43971200.0
2,1980-12-16,0.453125,0.453125,0.450893,0.450893,0.020206,26432000.0
3,1980-12-17,0.462054,0.464286,0.462054,0.462054,0.020706,21610400.0
4,1980-12-18,0.475446,0.477679,0.475446,0.475446,0.021307,18362400.0
5,1980-12-19,0.504464,0.506696,0.504464,0.504464,0.022607,12157600.0
6,1980-12-22,0.529018,0.531250,0.529018,0.529018,0.023707,9340800.0
7,1980-12-23,0.551339,0.553571,0.551339,0.551339,0.024708,11737600.0
8,1980-12-24,0.580357,0.582589,0.580357,0.580357,0.026008,12000800.0
9,1980-12-26,0.633929,0.636161,0.633929,0.633929,0.028409,13893600.0


In [68]:
# Display the AAPL income statement returned by stockrow_download("AAPL").
income_statement

Unnamed: 0,2018-09-29 00:00:00,2017-09-30 00:00:00,2016-09-24 00:00:00,2015-09-26 00:00:00,2014-09-27 00:00:00,2013-09-28 00:00:00,2012-09-29 00:00:00,2011-09-24 00:00:00,2010-09-25 00:00:00,2009-09-26 00:00:00
Revenue,265595000000.0,229234000000.0,215639000000.0,233715000000.0,182795000000.0,170910000000.0,156508000000.0,108249000000.0,65225000000.0,42905000000.0
Revenue Growth,0.1586,0.063,-0.0773,0.2786,0.0695,0.092,0.4458,0.6596,0.5202,0.1444
Cost of Revenue,163756000000.0,141048000000.0,131376000000.0,140089000000.0,112258000000.0,106606000000.0,87846000000.0,64431000000.0,39541000000.0,25683000000.0
Gross Profit,101839000000.0,88186000000.0,84263000000.0,93626000000.0,70537000000.0,64304000000.0,68662000000.0,43818000000.0,25684000000.0,17222000000.0
R&D Expenses,14236000000.0,11581000000.0,10045000000.0,8067000000.0,6041000000.0,4475000000.0,3381000000.0,2429000000.0,1782000000.0,1333000000.0
SG&A Expense,16705000000.0,15261000000.0,14194000000.0,14329000000.0,11993000000.0,10830000000.0,10040000000.0,7599000000.0,5517000000.0,4149000000.0
Operating Expenses,30941000000.0,26842000000.0,24239000000.0,22396000000.0,18034000000.0,15305000000.0,13421000000.0,10028000000.0,7299000000.0,5482000000.0
Operating Income,70898000000.0,61344000000.0,60024000000.0,71230000000.0,52503000000.0,48999000000.0,55241000000.0,33790000000.0,18385000000.0,11740000000.0
Interest Expense,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Earnings before Tax,72903000000.0,64089000000.0,61372000000.0,72515000000.0,53483000000.0,50155000000.0,55763000000.0,34205000000.0,18540000000.0,12066000000.0


In [0]:
# Scratch check that several frames can be written to one workbook.
# historical_prices is the DataFrame returned by download_quotes, so it is
# written directly instead of being re-parsed from a response body.
with pd.ExcelWriter('/content/gdrive/My Drive/Colab Notebooks/Data/Bondai/Test/multi.xlsx') as writer:
    historical_prices.to_excel(writer, sheet_name="stocks")
    balance_sheet.to_excel(writer, sheet_name="weather")

In [0]:
# Scratch write of the finished-ticker set; done_set is the name this notebook
# actually defines (done_list exists nowhere in it).
pd.DataFrame(list(done_set)).to_csv("/content/gdrive/My Drive/Colab Notebooks/Data/Bondai/Test/test.txt", header=None, index=False)