In [2]:
import numpy as np
import pandas as pd
from scipy.io import loadmat
%matplotlib inline

In [3]:
import urllib
import tempfile
def loadmat_from_url(url):
    """
    Load a MATLAB .mat file from a URL via a temporary file.

    Parameters
    ----------
    url : str
        Location of the .mat file; passed straight to
        ``urllib.request.urlopen``.

    Returns
    -------
    dict
        Whatever ``scipy.io.loadmat`` returns for the downloaded bytes.

    Raises
    ------
    urllib.error.URLError
        If the URL cannot be opened.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # import it explicitly instead of relying on some other library having
    # pulled it in already.
    import urllib.request

    # Read the whole payload, then release the connection right away.
    with urllib.request.urlopen(url) as response:
        data = response.read()

    # Spool the bytes into an anonymous temp file and hand that file object
    # to loadmat; the file is removed when the ``with`` block exits.
    with tempfile.TemporaryFile() as fp:
        fp.write(data)
        fp.seek(0)  # rewind so loadmat reads from the start
        mat = loadmat(fp)
    return mat

In [4]:
def clean_mat(mat):
    """
    Remove non-array data, squeeze arrays, and fix dates.

    Parameters
    ----------
    mat : dict
        Mapping of variable name to value, e.g. the result of
        ``scipy.io.loadmat``.

    Returns
    -------
    dict
        A new dict in which ``'date'`` (if present) is a list of
        ``datetime.datetime`` objects, one per row of ``mat['date']``; every
        other value that has a ``squeeze`` method is replaced by its squeezed
        form; all remaining entries are dropped.
    """
    # ``pd.datetime`` was deprecated in pandas 1.0 and removed in 2.0. It was
    # only an alias of the standard-library class, so use that directly.
    from datetime import datetime

    newmat = {}
    for key, value in mat.items():
        if key == 'date':
            # Each row is unpacked positionally into datetime(...).
            newmat[key] = [datetime(*row) for row in value]
            continue
        try:
            newmat[key] = value.squeeze()
        except AttributeError:
            # not an array (no .squeeze method) -- deliberately left out
            pass
    return newmat

In [None]:
# One TSG file per year, 2001 through 2012 inclusive.
years = range(2001, 2013)
urls = [f"http://oleander.bios.edu/files/Ol_TSG_{y}.mat" for y in years]

# Download and clean each yearly file into its own DataFrame, then
# concatenate them in year order.
dataframes = []
for url in urls:
    yearly_mat = clean_mat(loadmat_from_url(url))
    dataframes.append(pd.DataFrame(yearly_mat))
df = pd.concat(dataframes)

In [7]:
# save to hdf5 so we don't have to download again
# (`key` is passed by keyword: recent pandas deprecates positional arguments
# after the path and makes them keyword-only from 3.0)
df.to_hdf('Ol_TSG_2001-2012.pandas.h5', key='Ol_TSG')