In [37]:
import numpy as np
import pandas as pd
import Quandl
import os

Read Quandl data into a pandas DataFrame, tidy up the column labels, verify data integrity, and store it in an SQL database.

In [6]:
# Quandl dataset code of the target ticker symbol
ticker = "YAHOO/INDEX_GSPC"
# The API token is read from the environment so it never appears in the notebook.
# In the shell, before starting the notebook: export QUANDL_TOKEN="secret-key"
token = os.getenv('QUANDL_TOKEN')  # None when the variable is not set

In [20]:
# Download the dataset through the Quandl module, starting from 1990-01-01
df = Quandl.get(
    ticker,
    trim_start='1990-01-01',
    authtoken=token,
)

In [23]:
# Development only: keep an independent copy of the downloaded dataframe so a
# mistake further down can be undone without downloading again.
# To restore the original dataframe, run: df = df_devBackup.copy()
df_devBackup = df.copy(deep=True)

In [21]:
df.head(n=5)  # Preview the first five rows of the downloaded dataframe

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1990-01-02,353.399994,359.690002,351.980011,359.690002,162070000,359.690002
1990-01-03,359.690002,360.589996,357.890015,358.76001,192330000,358.76001
1990-01-04,358.76001,358.76001,352.890015,355.670013,177000000,355.670013
1990-01-05,355.670013,355.670013,351.350006,352.200012,158530000,352.200012
1990-01-08,352.200012,354.23999,350.540009,353.790009,140110000,353.790009


In [22]:
# Build a mapping from the current column labels to ticker-prefixed labels
# without spaces (e.g. 'Adj Close' -> 'GSPC_AdjClose').
old_columns = list(df.columns.values)  # Current column labels

# Derive the prefix from the symbol itself: drop everything up to the last '/',
# then keep the text after the last '_'. "YAHOO/INDEX_GSPC" gives "GSPC_"; a
# code with no underscore such as "WIKI/AAPL" gives "AAPL_" rather than
# "WIKI/AAPL_", which would put a slash into every column label.
ticker_tag = ticker.split('/')[-1].split('_')[-1] + '_'

# Labels that already carry the prefix are kept unchanged, so re-running this
# cell after the rename cell does not produce 'GSPC_GSPC_Open'-style labels.
new_labels = [
    label if label.startswith(ticker_tag) else ticker_tag + label.replace(' ', '')
    for label in old_columns
]
new_columns = dict(zip(old_columns, new_labels))  # {old label: new label}
new_columns  # Show the column label dictionary

{'Adj Close': 'GSPC_AdjClose',
 'Close': 'GSPC_Close',
 'High': 'GSPC_High',
 'Low': 'GSPC_Low',
 'Open': 'GSPC_Open',
 'Volume': 'GSPC_Volume'}

In [25]:
# Relabel the columns; any label missing from the dictionary is left unchanged
df = df.rename(columns=lambda label: new_columns.get(label, label))

In [26]:
df.head(n=5)  # Preview the first five rows to confirm the new column labels

Unnamed: 0_level_0,GSPC_Open,GSPC_High,GSPC_Low,GSPC_Close,GSPC_Volume,GSPC_AdjClose
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1990-01-02,353.399994,359.690002,351.980011,359.690002,162070000,359.690002
1990-01-03,359.690002,360.589996,357.890015,358.76001,192330000,358.76001
1990-01-04,358.76001,358.76001,352.890015,355.670013,177000000,355.670013
1990-01-05,355.670013,355.670013,351.350006,352.200012,158530000,352.200012
1990-01-08,352.200012,354.23999,350.540009,353.790009,140110000,353.790009


In [35]:
# Collect every row that fails the integrity check.
# np.isreal() returns True for NaN, so on its own it lets missing values
# through; the explicit isnull() mask catches those rows as well.
not_real = ~df.applymap(np.isreal).all(axis=1)  # Rows with a cell np.isreal rejects
has_missing = df.isnull().any(axis=1)  # Rows with at least one missing cell
nulls = df[not_real | has_missing]
nulls  # Show any rows in the dataframe with non-numeric or missing values

Unnamed: 0_level_0,GSPC_Open,GSPC_High,GSPC_Low,GSPC_Close,GSPC_Volume,GSPC_AdjClose
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [36]:
# Stop here if the integrity check above flagged any rows
if nulls.shape[0] != 0:
    raise ValueError('Dataframe contains non-numeric values')