# Get data through scraping

This Jupyter notebook was used to get all the historical data of all the closed-end funds. Only funds with a history of at least 5 years were kept. Please note, the cells were not run in order as there was some major clean-up to do after scraping the data from a website. The final data file was written to '.data/all_cefs.csv'.

In [1]:
import pandas as pd
import requests
from pandas.compat import StringIO, BytesIO
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pyperclip
import csv
import matplotlib.pyplot as plt
import time
import numpy as np

In [2]:
tickers = pd.read_csv('tickers.csv', header=None, names=['ticker'])

In [41]:
# check for non adjusted closes
not_adj = []
for t in tickers.ticker:
    df = pd.read_csv('./data/' + t + '.csv')
    if df.iloc[-1]['Close'] == df.iloc[-1]['Adj Close']:
        not_adj.append(t)

In [52]:
# clean data in csv files
for t in not_adj:
    df = pd.read_csv('./data/' + t + '.csv')
    df.dropna(subset=(['NAV'])).to_csv('./data/' + t + '.csv')

In [60]:
# check for non adjusted closes again
not_adj5 = []
for t in not_adj:
    df = pd.read_csv('./data/' + t + '.csv')
    if df.iloc[-1]['Close'] == df.iloc[-1]['Adj Close']:
        not_adj5.append(t)

In [61]:
np.array(not_adj5)

array(['NRGX', 'OCCI', 'PHYS', 'PNI', 'PSLV', 'SPPP', 'TURN'], dtype='<U4')

In [70]:
df = pd.DataFrame()

for t in tickers.ticker:
    dft = pd.read_csv('./data/'+ t + '.csv', parse_dates=True)
    
    # only append fund if at least 5 years old
    if dft.iloc[-1]['Date'] < '2014-03-02':
        dft['ticker'] = t
        df = df.append(dft)
    
df.to_csv('./data/all_cefs.csv')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [75]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [78]:
df = df[['ticker', 'Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'NAV', 'Adj NAV', 'Volume']]

In [81]:
df.rename(index=str, columns={'Adj Close': 'Adj_Close', 'Adj NAV': 'Adj_NAV'}, inplace=True)

In [82]:
df.head()

Unnamed: 0,ticker,Date,Open,High,Low,Close,Adj_Close,NAV,Adj_NAV,Volume
0,ACP,2019-03-05,11.86,11.9,11.82,11.89,11.89,12.85,12.85,54156
1,ACP,2019-03-04,11.91,11.95,11.84,11.88,11.88,12.84,12.84,66729
2,ACP,2019-03-01,11.91,11.93,11.85,11.89,11.89,12.88,12.88,80627
3,ACP,2019-02-28,11.81,11.87,11.79,11.87,11.87,12.89,12.89,67968
4,ACP,2019-02-27,11.87,11.9,11.74,11.82,11.82,12.87,12.87,123751


In [83]:
df.to_csv('./data/all_cefs.csv')

In [59]:
# cell to create csv after copying manually
t = pyperclip.paste()
df = pd.read_csv(StringIO(t), sep='\t', header=0, index_col=0)
df.to_csv('./data/TWN.csv')

In [None]:
# scrape the prices for each ticker
# write a csv file for each ticker
driver = webdriver.Chrome()

for ticker in tickers.ticker:
    driver.get('https://cefanalyzer.com/app/quote/' + ticker + '/historical')
    WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.CLASS_NAME, "ReactVirtualized__Table")))
    time.sleep(10)
    driver.find_element_by_xpath("//button[@class='btn btn-default' and text() = ' Copy Prices']").click()
    t = pyperclip.paste()
    df = pd.read_csv(StringIO(t), sep='\t', header=0, index_col=0)
    df.to_csv('./data/' + ticker + '.csv')
driver.quit()

In [6]:
# scrape the distributions for each ticker
# write a csv file of the distribution for each ticker
driver = webdriver.Chrome()

for ticker in tickers.ticker:
    driver.get('https://cefanalyzer.com/app/quote/' + ticker + '/historical')
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "ReactVirtualized__Table")))
    driver.find_element_by_xpath("//button[@class='btn btn-default' and text() = ' Copy Distributions']").click()
    t = pyperclip.paste()
    df = pd.read_csv(StringIO(t), sep='\t', header=0, index_col=0)
    df.to_csv('./data/' + ticker + '_d.csv')
driver.quit()

In [9]:
driver.quit()
