This notebook should be run after the 474 notebook. It takes the scores generated from the 10k reports and puts them in a more manageable csv format.

In [7]:
import pandas as pd
import numpy as np
import os
import pickle
from tqdm.notebook import tqdm

In [51]:
def add_dates_to_scores(cik):
    '''
    Attach filing dates to the similarity scores of one cik and summarise them.

    The scores are read from 10ks/<cik>/metrics/<cik>_sim_scores.csv. The dates
    are recovered from the names of the non-hidden entries directly under
    10ks/<cik> whose name ends in 'html', sorted ascending, and assigned
    row-by-row to a new 'dates' column.

    input: cik number (anything str() accepts)
    output:
        DataFrame of the scores for the cik, with the 'dates' column added
        Dict of stats for the cik: first/last date and the min / max / mean
        of the 'cosine_score' and 'jaccard_score' columns

    Nothing is caught here: a missing folder or csv, a date count that does
    not match the number of score rows, or an empty date list all raise.
    '''
    cik = str(cik)
    cik_dir = '10ks/' + cik
    scores_csv = cik_dir + '/metrics/' + cik + '_sim_scores.csv'

    # Presumably the reports are named "<cik>_<date>.html": dropping the first
    # len(cik)+1 characters and the last 5 leaves only the date text.
    prefix_len = len(cik) + 1
    ordered_dates = []
    for entry in os.listdir(cik_dir):
        if entry.startswith('.'):
            # hidden entries are ignored
            continue
        if entry[-4:] == 'html':
            ordered_dates.append(entry[prefix_len:-5])
    ordered_dates.sort()

    cik_df = pd.read_csv(scores_csv)
    # Raises if the number of dates differs from the number of score rows.
    cik_df['dates'] = ordered_dates

    earliest = ordered_dates[0]
    latest = ordered_dates[-1]
    cosine = cik_df['cosine_score']
    jaccard = cik_df['jaccard_score']

    stats = {}
    stats['cik'] = cik
    stats['first_date'] = earliest
    stats['last_date'] = latest
    stats['min_cos'] = cosine.min()
    stats['max_cos'] = cosine.max()
    stats['avg_cos'] = cosine.mean()
    stats['min_jaccard'] = jaccard.min()
    stats['max_jaccard'] = jaccard.max()
    stats['avg_jaccard'] = jaccard.mean()
    return cik_df, stats

## Testing the workflow
Trying the function on one cik

In [53]:
# Test 1: run the post-processing for a single cik and inspect its stats.
# NOTE(review): the saved output below reports cik '846475', so it appears to
# come from an earlier run with a different argument -- re-run to refresh it.
cik_df, stats = add_dates_to_scores(cik=1750)
stats

{'cik': '846475',
 'first_date': '2009-04-15',
 'last_date': '2020-02-27',
 'min_cos': 0.8254532981560887,
 'max_cos': 0.9333364332564448,
 'avg_cos': 0.8794566044280173,
 'min_jaccard': 0.7025597630632536,
 'max_jaccard': 0.8749433620299049,
 'avg_jaccard': 0.7856938701770915}

In [None]:
# Column order of the per-cik stats table: the identifier, the date range,
# then min / max / mean for each of the two similarity scores.
stat_labels = (
    ['cik']
    + ['first_date', 'last_date']
    + ['min_cos', 'max_cos', 'avg_cos']
    + ['min_jaccard', 'max_jaccard', 'avg_jaccard']
)

In [63]:
# Test 2: turn the stats dict of one cik into a one-row DataFrame.
# The frame is built directly from the record because DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
stats_df = pd.DataFrame([stats], columns=stat_labels)
stats_df

Unnamed: 0,cik,first_date,last_date,min_cos,max_cos,avg_cos,min_jaccard,max_jaccard,avg_jaccard
0,846475,2009-04-15,2020-02-27,0.825453,0.933336,0.879457,0.70256,0.874943,0.785694


In [71]:
# Test 3: parse both date columns and confirm the resulting dtypes.
stats_df['first_date'] = pd.to_datetime(stats_df['first_date'])
stats_df['last_date'] = pd.to_datetime(stats_df['last_date'])
stats_df.dtypes

cik                    object
first_date     datetime64[ns]
last_date      datetime64[ns]
min_cos               float64
max_cos               float64
avg_cos               float64
min_jaccard           float64
max_jaccard           float64
avg_jaccard           float64
dtype: object

## Processing the Data
Running through all the ciks

In [5]:
# Load the ticker -> cik lookup table; the 'Unnamed: 0' column that comes
# with the csv is discarded.
ticker_cik_df = (
    pd.read_csv('tickers/tickers_and_ciks.csv')
    .drop(columns=['Unnamed: 0'])
)
ticker_cik_df.head()

Unnamed: 0,ticker,cik
0,a,1090872
1,aa,1675149
2,aacg,1420529
3,aacq,1802457
4,aaic,1209028


In [73]:
# Run the post-processing for every cik, write its dated scores to
# data/<cik>.csv and collect one stats record per cik.
#
# The records are gathered in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.

# Make sure the output folder exists. Without it every to_csv call fails and,
# because failures are caught below, every cik would silently end up as NaN.
os.makedirs('data', exist_ok=True)

stats_records = []
for cik in tqdm(ticker_cik_df['cik']):
    try:
        # process the stats if there's scores for the cik
        cik_df, stats = add_dates_to_scores(cik)
        cik_df.to_csv('data/'+str(cik)+'.csv')
    except Exception:
        # Best effort: any failure for this cik (missing folder or csv,
        # date/score count mismatch, ...) is recorded as an all-NaN entry.
        # `Exception` rather than a bare `except:` so that interrupting the
        # kernel (KeyboardInterrupt) still stops the loop.
        stats = {'cik': cik, 'first_date': np.nan, 'last_date': np.nan,
                 'min_cos': np.nan,'max_cos': np.nan,'avg_cos': np.nan,
                 'min_jaccard': np.nan,'max_jaccard': np.nan, 'avg_jaccard': np.nan}
    # add the entry to the collected records
    stats_records.append(stats)

stats_df = pd.DataFrame(stats_records, columns=stat_labels)
# convert dates to datetime (NaN entries become NaT)
stats_df['first_date'] = pd.to_datetime(stats_df['first_date'])
stats_df['last_date'] = pd.to_datetime(stats_df['last_date'])
stats_df.head()

HBox(children=(FloatProgress(value=0.0, max=5478.0), HTML(value='')))




Unnamed: 0,cik,first_date,last_date,min_cos,max_cos,avg_cos,min_jaccard,max_jaccard,avg_jaccard
0,1090870.0,NaT,NaT,,,,,,
1,1675150.0,NaT,NaT,,,,,,
2,1420530.0,NaT,NaT,,,,,,
3,1802460.0,NaT,NaT,,,,,,
4,1209028.0,2004-03-15,2020-02-24,0.830653,0.918406,0.879454,0.709508,0.849112,0.785378


## Adding the tickers
Left-join the ticker symbols from the ticker/cik table onto the stats table, matching on `cik`.

In [108]:
# Cast cik to int so it can be matched against ticker_cik_df, then left-join
# the ticker symbols onto the stats.
# This would be worth changing to str next time the notebook is run,
# and doing the same for the ticker_cik_df.
stats_df['cik'] = stats_df['cik'].astype(int)
stats_df = pd.merge(stats_df, ticker_cik_df,
                    how='left', left_on='cik', right_on='cik')
stats_df.head()

Unnamed: 0,cik,first_date,last_date,min_cos,max_cos,avg_cos,min_jaccard,max_jaccard,avg_jaccard,ticker
0,1090872,NaT,NaT,,,,,,,a
1,1675149,NaT,NaT,,,,,,,aa
2,1420529,NaT,NaT,,,,,,,aacg
3,1802457,NaT,NaT,,,,,,,aacq
4,1209028,2004-03-15,2020-02-24,0.830653,0.918406,0.879454,0.709508,0.849112,0.785378,aaic


In [109]:
# Save the per-cik stats table (written to the current working directory).
stats_df.to_csv(path_or_buf='cos_and_jaccard_stats.csv')

## Sorting the values based on the date range
This also automatically drops the null values, because a comparison against a missing date (`NaT`) evaluates to False.

In [98]:
# Keep ciks whose last date is after 2019-01-01 ...
recent = stats_df.loc[stats_df['last_date'] > pd.to_datetime('2019-01-01')]
# ... and, of those, the ones whose first date is before 2005-01-01.
established = recent.loc[recent['first_date'] < pd.to_datetime('2005-01-01')]
# Display only: lowest minimum jaccard score first.
established.sort_values(by='min_jaccard')

Unnamed: 0,cik,first_date,last_date,min_cos,max_cos,avg_cos,min_jaccard,max_jaccard,avg_jaccard
2563,49938,2002-03-26,2019-02-27,0.061823,0.929677,0.856626,0.005905,0.868587,0.780662
1394,28917,2000-04-28,2019-03-29,0.513697,0.967061,0.884845,0.275229,0.936216,0.799788
5457,109380,2002-03-26,2019-02-26,0.535993,0.914394,0.835301,0.296602,0.842277,0.720021
899,22356,2001-03-13,2019-02-21,0.548514,0.949932,0.884663,0.308466,0.904618,0.797882
1204,814676,2001-03-30,2020-03-13,0.530998,0.949580,0.869977,0.327127,0.903985,0.780882
...,...,...,...,...,...,...,...,...,...
3958,1168455,2004-03-29,2020-03-05,0.883937,0.948766,0.913701,0.790191,0.902521,0.841755
4349,1177702,2003-03-14,2019-02-25,0.887873,0.945072,0.920503,0.797835,0.895839,0.852960
3292,1260968,2004-03-29,2020-03-13,0.889080,0.952596,0.924141,0.800100,0.909480,0.859329
4346,1172052,2003-03-31,2019-02-28,0.891777,0.963343,0.936839,0.803800,0.929238,0.881636


In [113]:
# Plain Python list of the ciks that passed the date-range filter.
final_ciks = list(established['cik'])
len(final_ciks)

303

In [117]:
# Write the selected ciks to final_ciks.txt as the str() of the Python list.
# The `with` block closes the file even if the write raises.
with open("final_ciks.txt", "w+") as f:
    f.write(str(final_ciks))