In [55]:
from functools import total_ordering
import calendar
import zipfile
import os
import pandas as pd
import datetime
import matplotlib.pyplot as plt

In [56]:
@total_ordering
class FolderName:
    """Sortable view of a folder name shaped like ``<prefix>_<year>_<month name>``.

    The name is split on ``'_'``; the second piece is parsed as the integer
    year and the third piece is kept as the month name.  Instances order
    chronologically (year first, then calendar position of the month), so a
    list of them can simply be sorted.

    Raises:
        ValueError: if the name has fewer than three ``'_'``-separated pieces,
            the year is not an integer, or the month is not one of the names
            in ``calendar.month_name``.
    """

    # Month name -> 1..12.  calendar.month_name[0] is an empty placeholder and
    # is skipped.  NOTE: calendar.month_name follows the current locale, so the
    # accepted names are English only under an English/C locale.
    month_index = {month: index for index, month in enumerate(calendar.month_name) if month}

    def __init__(self, name: str):
        parts = name.split('_')
        if len(parts) < 3:
            raise ValueError(
                f"Folder name {name!r} does not match '<prefix>_<year>_<month>'"
            )
        self.year = int(parts[1])
        self.month = parts[2]
        # Fail here with a clear message instead of a bare KeyError later,
        # in the middle of a sort.
        if self.month not in self.month_index:
            raise ValueError(f"Unknown month {self.month!r} in folder name {name!r}")

    @staticmethod
    def _is_valid_operand(other):
        """Return True when ``other`` exposes the attributes used for comparison."""
        return hasattr(other, "year") and hasattr(other, "month")

    def __eq__(self, other):
        if not self._is_valid_operand(other):
            return NotImplemented
        return self.year == other.year and self.month == other.month

    def __hash__(self):
        # Defining __eq__ alone would make instances unhashable; hash the same
        # fields that __eq__ compares so equal objects hash equally.
        return hash((self.year, self.month))

    def __lt__(self, other):
        if not self._is_valid_operand(other):
            return NotImplemented
        if self.year != other.year:
            return self.year < other.year
        # Same year: compare by calendar position, not alphabetically.
        return self.month_index[self.month] < self.month_index[other.month]

    def to_string(self):
        """Return the folder name, always rebuilt with the fixed ``L2`` prefix."""
        return f'L2_{self.year}_{self.month}'

    def __repr__(self):
        return self.to_string()

In [57]:
def unzip_all(archive, unzip_to, skip=False):
    """Extract every ``*.zip`` found directly in ``archive`` into ``unzip_to``.

    Each archive ``<name>.zip`` is extracted into the directory
    ``<unzip_to>/<name>``.  If that path already exists the archive is left
    alone.  Progress is reported with ``print``.

    Args:
        archive: Directory that is scanned for files ending in ``.zip``.
        unzip_to: Directory under which the per-archive folders are created.
        skip: When true the function returns immediately and does nothing.
    """
    if skip:
        return

    zip_names = [entry for entry in os.listdir(archive) if entry.endswith('.zip')]
    total_files = len(zip_names)
    print(f'Total files {total_files}')
    print(zip_names)

    for counter, zip_name in enumerate(zip_names, start=1):
        f_zip = os.path.join(archive, zip_name)
        # Target directory = destination path with the ".zip" suffix removed.
        f_archive, _suffix = os.path.splitext(os.path.join(unzip_to, zip_name))
        print(f'Start unzipping {f_zip} to {f_archive} ({counter}/{total_files})')

        if not os.path.exists(f_archive):
            with zipfile.ZipFile(f_zip, 'r') as zip_ref:
                zip_ref.extractall(f_archive)
            print(f'Finish unzipping {f_zip} to {f_archive} ({counter}/{total_files})')
        else:
            print(f'Path {f_archive} exists. Skipping')

In [58]:
def parse_file(parse_folder, single, max_files=10, asset='AAPL', proces_func=None):
    """Process the daily ``L2_options_*`` CSV files under ``parse_folder`` in date order.

    Sub-folders of ``parse_folder`` are interpreted with ``FolderName`` and
    visited chronologically; inside each folder the files whose name starts
    with ``L2_options_`` are visited in sorted name order.  The date is taken
    from the ``YYYYMMDD`` part of the file name.  Each file is read, reduced
    to a fixed set of columns and to the rows whose ``UnderlyingSymbol``
    equals ``asset``, and handed to ``proces_func``.

    Args:
        parse_folder: Directory containing the month folders.  Entries that
            are not directories are ignored.
        single: When true, only the first file of the earliest folder is
            processed.
        max_files: Maximum number of files to process.  ``None`` means no
            limit; values <= 0 process nothing.
        asset: Symbol to keep from each file.
        proces_func: Optional callback, called as
            ``proces_func(rows, date, asset, i)`` where ``i`` is the 1-based
            position of the file in this run.

    Returns:
        None.  Progress is reported with ``print``.
    """
    columns = ['UnderlyingSymbol', 'UnderlyingPrice', 'Exchange', 'Type', 'Expiration',
               'DataDate', 'Strike', 'Last', 'Bid', 'Ask', 'Volume', 'OpenInterest', 'IV',
               'Delta', 'Gamma', 'Theta', 'Vega']

    # Stray files next to the month folders (e.g. leftover archives) are not
    # folder names and must not be fed to FolderName.
    folders = sorted(
        FolderName(entry)
        for entry in os.listdir(parse_folder)
        if os.path.isdir(os.path.join(parse_folder, entry))
    )
    if single:
        folders = folders[:1]

    # List every folder exactly once.  os.listdir gives no ordering guarantee,
    # so sort the names: the fixed-width YYYYMMDD suffix makes name order
    # equal to date order.
    listing = []
    for folder in folders:
        path = os.path.join(parse_folder, folder.to_string())
        names = sorted(f for f in os.listdir(path) if f.startswith('L2_options_'))
        listing.append((path, names))

    total_files = sum(len(names) for _, names in listing)
    print(f"Total number of files (days): {total_files}")

    if single:
        listing = [(path, names[:1]) for path, names in listing]
    planned = [(path, name) for path, names in listing for name in names]

    # Number of files that will really be processed; also the denominator of
    # the progress output.
    if max_files is None:
        limit = len(planned)
    else:
        limit = max(0, min(max_files, len(planned)))

    for i, (path, file) in enumerate(planned[:limit], start=1):
        d = file.split('_')[2].split('.')[0]
        date = datetime.datetime.strptime(d, "%Y%m%d").date()
        path_file = os.path.join(path, file)

        df = pd.read_csv(path_file, sep=",").filter(columns)
        output = df[df.UnderlyingSymbol == asset]

        if proces_func is not None:
            proces_func(output, date, asset, i)

        print(f"{date} {i}/{limit}")

In [59]:
# Accumulators filled by get_dayliy_spot: index[k] is the date belonging to
# result[k].  They are module-level, so re-running the processing cell without
# re-running this cell keeps appending to the existing contents.
index = []
result = []


def get_dayliy_spot(df: pd.DataFrame, date: datetime.date, asset: str, i: int):
    """Record the underlying price of the first row of ``df`` for ``date``.

    Appends ``date`` to the module-level ``index`` list and the price to the
    module-level ``result`` list.  When ``df`` has no rows, NaN is recorded
    so the two lists keep one entry per processed day.

    Args:
        df: Rows of one day; must contain an ``UnderlyingPrice`` column.
        date: Day the rows belong to.
        asset: Unused; present to match the callback signature.
        i: Unused; present to match the callback signature.

    Raises:
        KeyError: if ``df`` has no ``UnderlyingPrice`` column.
    """
    # Look the value up first: if it fails, neither list is modified and the
    # two stay aligned.
    prices = df['UnderlyingPrice']
    spot = prices.iloc[0] if len(prices) else float('nan')
    index.append(date)
    result.append(spot)

In [60]:
def merge_into_one_file(df: pd.DataFrame, date: datetime.date, asset: str, i: int):
    """Write the rows of one day to ``<asset>.csv`` in the working directory.

    The first call of a run (``i == 1``) starts the file from scratch and
    writes the header row; later calls append rows without a header.
    Starting from scratch matters when the notebook is re-run: appending to a
    file left by an earlier run would duplicate the data and put a second
    header row in the middle of the file.

    Args:
        df: Rows to write (the DataFrame index is written as the first column).
        date: Unused; present to match the callback signature.
        asset: Used as the file name stem.
        i: 1-based position of this day within the run.
    """
    is_first_chunk = i == 1
    df.to_csv(
        f"{asset}.csv",
        mode='w' if is_first_chunk else 'a',
        header=is_first_chunk,
    )

In [61]:
# Directory scanned for *.zip archives, and the directory they are extracted into.
archive_folder = '../HistoricalData/'
unzip_to = '../Unzip/'

# skip=True makes unzip_all return immediately without extracting anything.
unzip_all(archive=archive_folder, unzip_to=unzip_to, skip=True)

# max_files limits how many daily files are processed (the run below reported 1301 available).
# Alternative run collecting the daily spot price; as written, max_files=0 processes no files:
#parse_file(unzip_to, single=False, max_files=0, proces_func=get_dayliy_spot)
parse_file(parse_folder=unzip_to, single=False, max_files=2, asset='MSFT', proces_func=merge_into_one_file)

Total number of files (days): 1301
2016-11-01 1/2
2016-11-02 2/2
