In [2]:
import multiprocessing as mp
import pandas as pd
import os

In [3]:
directory = 'data'
input_files = [o for o in os.listdir(directory) if os.path.isdir(os.path.join(directory,o))]

In [4]:
input_files

['2014q3',
 '2014q4',
 '2018q1',
 '2010q2',
 '2012q1',
 '2016q1',
 '2014q2',
 '2010q3',
 '2010q4',
 '2020q1',
 '2009q3',
 '2009q4',
 '2009q2',
 '2013q2',
 '2011q1',
 '2019q2',
 '2017q3',
 '2017q4',
 '2019q3',
 '2019q4',
 '2013q3',
 '2013q4',
 '2015q1',
 '2017q2',
 '2018q2',
 '2010q1',
 '2012q2',
 '2016q4',
 '2016q3',
 '2012q4',
 '2012q3',
 '2018q4',
 '2018q3',
 '2016q2',
 '2014q1',
 '2009q1',
 '2015q4',
 '2015q3',
 '2013q1',
 '2011q2',
 '2019q1',
 '2015q2',
 '2017q1',
 '2011q4',
 '2011q3']

In [5]:
def convert(df):
    output = dict()
    for index, row in df.iterrows():
        cik = row["cik"]
        feature = row["tag"]
        value = row["value"]
        
        if cik not in output:
            output[cik] = dict()
            init_keys = ["cik", "ddate", "name", 'adsh', "fy", "fp"]
            for k in init_keys:
                output[cik][k] = row[k]

        output[cik][feature] = value
    return output

In [6]:
def parse(input_folder):    
    print(input_folder)
    num_file = "data/" + input_folder + "/num.txt" 
    num = pd.read_csv(num_file, sep="\t", encoding="latin1")
    
    sub_file = "data/" + input_folder + "/sub.txt"
    sub = pd.read_csv(sub_file, sep="\t")
    
    num = num[(num["qtrs"] < 2)]
    num.dropna(subset=["value"], inplace=True)

    sub.rename(columns={'period': 'ddate'}, inplace=True)
    sub = sub[(sub["form"] == "10-K") | (sub["form"] == "10-Q")]
    sub = sub.drop_duplicates(subset="cik", keep=False)
    
    df = num.merge(sub,how='inner',left_on=['adsh','ddate'],right_on=['adsh','ddate'])
    df = df.drop_duplicates(subset=["cik", "tag", "adsh"], keep=False)
    
    features = set(df.tag.value_counts().head(n=1000).keys())
    df = df[(df["tag"].isin(features))]
    
    output = convert(df)
    parsed = pd.DataFrame.from_dict(output, orient='index')
    output_file = input_folder + ".csv"
    parsed.to_csv(output_file, index=False)
    
    return "done: " + input_folder

In [7]:
pool = mp.Pool(processes=8)

2014q4
2012q1
2014q3
2010q2
2018q1
2016q1
2010q3
2014q2
2010q4
2020q1
2009q3
2009q4


  self.run()


2009q2
2013q2
2011q1
2019q2
2017q3
2017q4
2019q3
2019q4
2013q3
2013q4
2015q1


  self.run()


2017q2
2018q2
2010q1
2012q2
2016q4
2016q3
2012q4
2012q3
2018q4
2018q3
2016q2
2014q1
2009q1
2015q4
2015q3


  self.run()


2013q1
2011q2
2019q1
2015q2
2017q1
2011q4
2011q3


In [8]:
results = [pool.apply_async(parse, args=(file,)) for file in input_files]
output = [p.get() for p in results]

In [9]:
output

['done: 2014q3',
 'done: 2014q4',
 'done: 2018q1',
 'done: 2010q2',
 'done: 2012q1',
 'done: 2016q1',
 'done: 2014q2',
 'done: 2010q3',
 'done: 2010q4',
 'done: 2020q1',
 'done: 2009q3',
 'done: 2009q4',
 'done: 2009q2',
 'done: 2013q2',
 'done: 2011q1',
 'done: 2019q2',
 'done: 2017q3',
 'done: 2017q4',
 'done: 2019q3',
 'done: 2019q4',
 'done: 2013q3',
 'done: 2013q4',
 'done: 2015q1',
 'done: 2017q2',
 'done: 2018q2',
 'done: 2010q1',
 'done: 2012q2',
 'done: 2016q4',
 'done: 2016q3',
 'done: 2012q4',
 'done: 2012q3',
 'done: 2018q4',
 'done: 2018q3',
 'done: 2016q2',
 'done: 2014q1',
 'done: 2009q1',
 'done: 2015q4',
 'done: 2015q3',
 'done: 2013q1',
 'done: 2011q2',
 'done: 2019q1',
 'done: 2015q2',
 'done: 2017q1',
 'done: 2011q4',
 'done: 2011q3']