# Scraping discrete stock and cryptocurrency trading data

### importing the libraries

In [39]:
import pandas as pd
import numpy as np

from pandas_datareader import data as pdr
from selenium import webdriver
from cryptocmd import CmcScraper

import requests
from bs4 import BeautifulSoup

from time import sleep
from random import randint
import time

import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
import holidays

%config InlineBackend.figure_format = 'retina'

In [2]:
# NASDAQ: headless Chrome session; `driver` is the module-level browser
# handle that scrape_nasdaq uses to load pages.
# NOTE(review): no driver.quit() is visible in this section — confirm the
# browser is closed somewhere once scraping is done.
chrome_flags = (
    '--ignore-certificate-errors',
    '--incognito',
    '--headless',
)
options = webdriver.ChromeOptions()
for chrome_flag in chrome_flags:
    options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=options)

### Function to get stock data from Yahoo Finance using pandas-datareader

In [123]:
def stock_returns(ticker_list, start, end):
    """Fetch price history for each ticker via ``pdr.get_data_yahoo``.

    Args:
        ticker_list: Iterable of ticker symbols (strings).
        start: Start of the requested range, passed through to
            ``pdr.get_data_yahoo`` unchanged.
        end: End of the requested range, passed through unchanged.

    Returns:
        A ``(list_of_df, not_found)`` tuple. ``list_of_df`` holds one
        DataFrame per ticker that was fetched, with the index moved into a
        regular column and a ``'ticker'`` column added. ``not_found`` lists
        the tickers whose lookup raised ``ValueError``.
    """
    list_of_df = []
    not_found = []

    for ticker in ticker_list:
        try:
            df = pdr.get_data_yahoo(ticker, start, end)
        except ValueError:
            print("Not Found: "+ticker)
            not_found.append(ticker)
        else:
            # The date index becomes an ordinary column. Its name is left as
            # delivered (shown as 'Date' in this notebook's outputs); the
            # previous `df.rename(...)` call here discarded its result and
            # therefore never renamed anything, so it has been dropped.
            df = df.reset_index()
            df['ticker'] = ticker
            list_of_df.append(df)

    return list_of_df, not_found

### function to scrape historical stocks data from NASDAQ

In [4]:
def _parse_history_rows(page_source):
    """Return the stripped <td> texts of each table row in the history page."""
    soup = BeautifulSoup(page_source, 'lxml')

    rows = []
    for container in soup.find_all('div', id="historicalContainer"):
        for tr in container.find_all('tr'):
            rows.append([td.text.strip() for td in tr.find_all('td')])

    # The first two rows are discarded, as before (presumably header rows
    # without price cells — TODO confirm against the page markup).
    return rows[2:]


def scrape_nasdaq(ticker_list):
    """Scrape historical prices for each ticker from old.nasdaq.com.

    Uses the module-level Selenium ``driver``. For every ticker the
    historical page is opened, the '10 Years' entry of the ``ddlTimeFrame``
    drop-down is clicked when present, and the rows found inside the
    ``historicalContainer`` div are collected.

    Args:
        ticker_list: Iterable of ticker symbols (strings).

    Returns:
        A DataFrame with the columns ``date, Open, High, Low, Close, Volume,
        ticker``. All values are strings as scraped, except that '/' in the
        date is replaced by '-' and ',' is removed from the volume. Tickers
        whose page has no time-frame drop-down are reported with a
        "Not Found" message and contribute no rows.
    """
    columns = ['date', 'Open', 'High', 'Low', 'Close', 'Volume', 'ticker']
    tckr_data = {}
    count = 1

    for ticker in ticker_list:
        print("Scraping count: " + str(count))

        url = 'https://old.nasdaq.com/symbol/'+ticker+'/historical'
        driver.get(url)

        # Choose 10 year data from a drop down.
        # NOTE(review): the find_elements_by_* helpers are not available in
        # recent Selenium 4 releases — confirm the installed version.
        data_range = driver.find_elements_by_name('ddlTimeFrame')
        if not data_range:
            # Same message format as stock_returns.
            print("Not Found: " + ticker)
            continue

        for option in data_range[0].find_elements_by_tag_name('option'):
            if option.text == '10 Years':
                option.click()
                break
        # Fixed pause after the selection before the page source is read.
        time.sleep(5)

        # Keep the first result if the same ticker appears more than once.
        tckr_data.setdefault(ticker, _parse_history_rows(driver.page_source))

        # Time pause to prevent blocking
        print("Random Sleep")
        sleep(randint(2, 4))
        count += 1

    # Generating the frame
    records = []
    for key, rows in tckr_data.items():
        for row in rows:
            if len(row) < 6:
                # Not a complete date/open/high/low/close/volume row; such
                # rows used to abort the whole scrape with an IndexError.
                continue
            records.append((
                row[0].replace('/', '-'),
                row[1],
                row[2],
                row[3],
                row[4],
                row[5].replace(',', ''),
                key,
            ))

    return pd.DataFrame(records, columns=columns)

### Function to get stock data by combining the two functions above

In [5]:
def get_stocks_data(ticker_set, start, end):
    """Collect price history from Yahoo, falling back to NASDAQ scraping.

    Every ticker is first requested through ``stock_returns``; the tickers it
    reports as not found are then scraped with ``scrape_nasdaq``.

    Args:
        ticker_set: Iterable of ticker symbols.
        start: Start of the range, forwarded to ``stock_returns``.
        end: End of the range, forwarded to ``stock_returns``.

    Returns:
        A single DataFrame. When no fallback was needed this is the plain
        concatenation of the Yahoo frames (date column as delivered, shown as
        'Date' in this notebook). When the fallback ran, the frame has the
        columns ``date, Open, High, Low, Close, Volume, ticker``.

    Raises:
        ValueError: from ``pd.concat`` when ``ticker_set`` is empty, because
            there is nothing to concatenate.
    """
    # Stocks data from Yahoo
    yahoo_frames, not_found = stock_returns(ticker_set, start, end)

    if not not_found:
        return pd.concat(yahoo_frames)

    price_cols = ['Open', 'High', 'Low', 'Close', 'Volume']

    # Stocks data for the remaining tickers from Nasdaq
    nasdaq = scrape_nasdaq(not_found)
    nasdaq = nasdaq.rename(columns={'Ticker': 'ticker'})
    # Scraped values are strings; convert the numeric columns.
    nasdaq[price_cols] = nasdaq[price_cols].astype(float)

    if not yahoo_frames:
        # Nothing came back from Yahoo: pd.concat([]) would raise, so the
        # scraped frame is the whole result.
        return nasdaq

    # Bring the Yahoo data to the same column layout before merging.
    yahoo = pd.concat(yahoo_frames)
    yahoo = yahoo.rename(columns={'Date': 'date'})
    yahoo = yahoo[['date'] + price_cols + ['ticker']]
    yahoo['date'] = yahoo['date'].dt.date

    return pd.concat([yahoo, nasdaq])


### Function to get cryptocurrency trading data

In [6]:
# Cryptocurrency ticker symbols to download.
# NOTE: the name `crypto` is rebound further down to the DataFrame read back
# from Crypto_Data.csv.
crypto = ['BTC', 'ETH', 'LTC', 'XEM', 'XMR', 'XRP']

In [7]:
def crypto_scrape(ticker):
    """Return the history DataFrame for one crypto ticker, tagged with it.

    The scraper is created with the ticker only (no time interval), and the
    returned frame gets an extra 'crypto_ticker' column holding `ticker`.
    """
    history = CmcScraper(ticker).get_dataframe()
    # Label every row with the symbol it belongs to.
    history['crypto_ticker'] = ticker
    return history

In [8]:
# One history DataFrame per crypto ticker, in the order of `crypto`.
crypto_data = [crypto_scrape(symbol) for symbol in crypto]

In [9]:
# Stack the per-ticker frames into one DataFrame; `crypto_data` changes from
# a list of frames to a single frame here.
crypto_data = pd.concat(crypto_data)

In [10]:
# Save the combined history. The row index is written as well (to_csv
# default) and shows up as an 'Unnamed: 0' column when the file is reloaded.
# NOTE(review): consider index=False if nothing relies on that column.
crypto_data.to_csv('Crypto_Data.csv')

### Generate data (keeping the Open, High, Low and Close values plus the ticker)

In [11]:
# Reload the saved history; `crypto` now refers to this DataFrame instead of
# the ticker list defined earlier.
crypto = pd.read_csv('Crypto_Data.csv')

In [13]:
# Show the earliest row per ticker: sort by Date ascending, then keep only
# the first row seen for each crypto_ticker. The result is displayed, not
# stored, so `crypto` itself is unchanged.
crypto.sort_values('Date', ascending=True).drop_duplicates('crypto_ticker')

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Volume,Market Cap,crypto_ticker
6203,2344,2013-04-28,4.3,4.4,4.18,4.35,,74636938,LTC
2344,2344,2013-04-28,135.3,135.98,132.1,134.21,,1488566728,BTC
12048,2246,2013-08-04,0.005874,0.005927,0.005874,0.005882,,45983577,XRP
9801,1955,2014-05-21,2.47,2.65,1.23,1.6,246540.0,1382563,XMR
7845,1641,2015-04-01,0.0004,0.000458,0.00017,0.000242,65525.0,2180943,XEM
3858,1513,2015-08-07,2.83,3.54,2.52,2.77,164329.0,166610555,ETH


In [14]:
# Keep only the date, the OHLC prices and the ticker; the remaining columns
# are dropped.
crypto = crypto[['Date', 'Open', 'High', 'Low', 'Close', 'crypto_ticker']]

In [15]:
# Spot-check four random rows of the trimmed crypto data.
crypto.sample(4)

Unnamed: 0,Date,Open,High,Low,Close,crypto_ticker
11014,2016-06-03,0.005766,0.00587,0.005741,0.005827,XRP
7393,2016-06-26,0.007393,0.007393,0.006326,0.006817,XEM
11063,2016-04-15,0.006303,0.006473,0.006229,0.006424,XRP
8650,2017-07-16,31.97,32.53,28.83,29.43,XMR


In [16]:
# Stock symbols to download, then fetch their history for the given range.
tickers = [
    'AAPL', 'XOM', 'VMC', 'BA',
    'AMZN', 'TGT', 'WMT', 'KO',
    'UNH', 'JPM', 'GOOGL', 'STT',
    'MSFT', 'VZ', 'XEL', 'SPG',
]
df = get_stocks_data(
    tickers,
    start='2005-01-01',
    end='2019-08-01',
)

In [17]:
# get_stocks_data names the date column 'Date' when every ticker came from
# Yahoo and 'date' when the NASDAQ fallback ran. Normalise to 'Date' first so
# the selection below works in both cases instead of raising KeyError.
df = df.rename(columns={'date': 'Date'})
# Keep only the date, the OHLC prices and the ticker (Volume is dropped).
df = df[['Date', 'Open', 'High', 'Low', 'Close', 'ticker']]

In [18]:
# Spot-check five random rows of the stock data.
df.sample(5)

Unnamed: 0,Date,Open,High,Low,Close,ticker
363,2006-06-13,21.59,21.625,21.379999,21.455,KO
2838,2016-04-13,81.110001,81.669998,80.489998,81.559998,TGT
2094,2013-04-30,249.369995,254.679993,248.559998,253.809998,AMZN
1670,2011-08-19,50.029999,50.91,49.759998,49.919998,TGT
3021,2017-01-03,62.790001,62.84,62.130001,62.580002,MSFT


In [19]:
# Save the stock data. The row index is written as the first column
# (to_csv default).
# NOTE(review): consider index=False if the index is not needed downstream.
df.to_csv('stocks.csv')