In [49]:
import boto3
import pandas as pd
import numpy as np
import re
import io

In [3]:
AWS_S3_BUCKET = 'esgnie'

def get_s3_companies(s3_client, bucket='esgnie', comp_prefix='companies/'):
    ## get companies
    result = s3_client.list_objects(Bucket=bucket, Prefix=comp_prefix, Delimiter='/')
    l_subdir = [com_prefix.get('Prefix').replace(comp_prefix, '').replace('/', '')
                for com_prefix in result.get('CommonPrefixes')]
    return l_subdir

def get_company_files(s3_client, ticker, bucket='esgnie', filename_pattern=None, comp_prefix='companies'):
    result = s3_client.list_objects(Bucket=bucket, Prefix=f"{comp_prefix}/{ticker}/")
    df_files = pd.DataFrame()
    df_files['file'] = [obj['Key'] for obj in result['Contents']]
    df_files['filename'] = [file.split('/')[-1] for file in df_files['file']]
    if filename_pattern is not None:
        df_files['pattern_match'] = [1 if re.match(filename_pattern, file) else 0 for file in df_files['filename']]
    # df_files[df_files['pattern_match'] == 1]['filename'].tolist()
    return df_files[df_files['pattern_match'] == 1]['file'].tolist()

def write_csv_to_s3(s3_client, df, file, bucket='esgnie'):
    with io.StringIO() as csv_buffer:
        df.to_csv(csv_buffer, index=False)
        response = s3_client.put_object(
            Bucket=bucket, Key=file, Body=csv_buffer.getvalue()
        )
        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
    return status

def get_file_names(s3_client,ticker,bucket = 'esgnie'):
    l_files = get_company_files(s3_client, ticker, filename_pattern = "^df_words")
    files = [re.findall(r"(?<=df_words_).*(?=_pagenum)", l_files[i])[0] for i in range(len(l_files))]
    return list(set(files))

In [4]:
def get_file_names(s3_client,ticker,bucket = 'esgnie'):
    l_files = get_company_files(s3_client, ticker, filename_pattern = "^df_words")
    files = [re.findall(r"(?<=df_words_).*(?=_pagenum)", l_files[i])[0] for i in range(len(l_files))]
    return list(set(files))

In [5]:
s3_client = boto3.client('s3')
ticker = "500355"
file_names = get_file_names(s3_client, ticker)
file_names

['61658713-a999-43b5-b082-b3a3ff4542b7',
 '2dc8a905-4320-4882-9cb6-c73a5890f468',
 '2cb0ee35-8ac4-4bf5-9bbd-19fe797e1a2b',
 '3e716270-48a2-4eec-8797-90502884efcf',
 '5c3699be-4257-4df8-b87c-208aa6271eb7',
 'Boards-Report',
 'Consolidated-Financial-Statements']

In [100]:
def parser(ticker, filename = None):
    
    """
    
    1. Find whether page has director_name_flag > 60
    2. If have, then find whether there are rows with director_names whose numeric_flag is true
    3. If have, then add this page to list of pages to parse
    4. Parse flag ratios for pages
    
    """
    
    if filename != None:
        l_files = get_company_files(s3_client, ticker=ticker, filename_pattern='^df_words_%s'%filename)
    else:
        l_files = get_company_files(s3_client, ticker=ticker, filename_pattern='^df_words')
    
    if len(l_files) == 0:
        print("no files for %s"%ticker)
        return
    
    sums = []
    proportions = []
    pagenum = []
    filenames = []
    tickers = []
    
    
    for i,file in enumerate(l_files):
        obj = s3_client.get_object(Bucket='esgnie', Key=file)
        df = pd.read_csv(obj['Body'])
    
    
        # add numeric flag

        is_numeric = []

        for text in list(df.text):  
            t="".join(c for c in str(text) if c.isalnum())
            is_numeric.append(t.isnumeric())

        df["numeric_flag"] = is_numeric
    
    
        if "director_name_flag" not in df.columns:
            print("no director_name_flag for %s %s"%(ticker, df.filename[0]))
            continue
        
        if (df.director_name_flag > 60).any():
            if (df[df.row_num.isin(df[df.director_name_flag > 60].row_num)].numeric_flag == True).any():
                
                num_words = len(df)

                def flag_sum(df):
                    return df.sum()

                def flag_proportion(df):
                    return len(df[df > 50])

                try:
                    flag_sum = df[df.columns[-10:-2]].apply(flag_sum).sum()
                    flag_proportion = df[df.columns[-10:]].apply(flag_proportion).sum()/num_words
                except:
                    print("error at %s %s %d"%(ticker,df.filename[0],df.pagenum[0]))
                    continue

                sums.append(flag_sum)
                proportions.append(flag_proportion)
                pagenum.append(df.pagenum.iloc[0])
                filenames.append(df.filename.iloc[0])
                tickers.append(ticker)
                
        
        if i % 75 == 0:
            print("%s: %s files parsed"%(ticker, i))
                
    overall = pd.DataFrame({"ticker": tickers, "filename": filenames,
                        "pagenum": pagenum, "flag_sum": sums, "flag_proportion": proportions})

    overall.sort_values(by=['filename','flag_sum', "flag_proportion"], ascending = False, inplace = True)
    
    
    try:    
        if filename == None:
            filename = "all-files"
        status = write_csv_to_s3(s3_client, overall, "governance_data/%s_%s.csv"%(ticker,filename), bucket='esgnie')
        if status == 200:
            print("Successful saving governance data for %s_%s"%(ticker, filename))
    except:
        print("Unsuccessful saving governance data for %s_%s"%(ticker, filename))
        
    return overall
        

In [66]:
l_ticker = get_s3_companies(s3_client)

In [77]:
for ticker in l_ticker[1:6]:
    parser(ticker)

no files for 500114
no files for 500180
no files for 500182
Successful saving governance data for 500209_all-files
Successful saving governance data for 500355_all-files


In [101]:
for ticker in l_ticker[8:12]:
    parser(ticker)

500575: 0 files parsed
500575: 75 files parsed
500575: 150 files parsed
500575: 225 files parsed
error at 500575 SE_Letter_to_Stock_Exchange_Annual_Report_2018-1945482 11
error at 500575 SE_Letter_to_Stock_Exchange_Annual_Report_2018-1945482 12
error at 500575 SE_Letter_to_Stock_Exchange_Annual_Report_2018-1945482 14
500575: 300 files parsed
error at 500575 SE_Letter_to_Stock_Exchange_Annual_Report_2018-1945482 16
error at 500575 SE_Letter_to_Stock_Exchange_Annual_Report_2018-1945482 2
500575: 375 files parsed
error at 500575 SE_Letter_to_Stock_Exchange_Annual_Report_2018-1945482 23
error at 500575 SE_Letter_to_Stock_Exchange_Annual_Report_2018-1945482 28
error at 500575 SE_Letter_to_Stock_Exchange_Annual_Report_2018-1945482 3
500575: 450 files parsed
error at 500575 SE_Letter_to_Stock_Exchange_Annual_Report_2018-1945482 4
error at 500575 SE_Letter_to_Stock_Exchange_Annual_Report_2018-1945482 8
error at 500575 SE_Letter_to_Stock_Exchange_Annual_Report_2018-1945482 81
error at 500575 SE

In [104]:
for ticker in l_ticker[14:18]:
    parser(ticker)

532187: 0 files parsed
Successful saving governance data for 532187_all-files
532540: 0 files parsed
532540: 75 files parsed
532540: 150 files parsed
532540: 225 files parsed
532540: 300 files parsed
532540: 375 files parsed
532540: 450 files parsed
532540: 525 files parsed
532540: 600 files parsed
532540: 675 files parsed
532540: 750 files parsed
532540: 825 files parsed
532540: 900 files parsed
532540: 975 files parsed
Successful saving governance data for 532540_all-files
no files for 600028
no director_name_flag for 600029 3c29183e2343c78126eb686943d79ed0
no director_name_flag for 600029 3c29183e2343c78126eb686943d79ed0
no director_name_flag for 600029 3c29183e2343c78126eb686943d79ed0
no director_name_flag for 600029 3c29183e2343c78126eb686943d79ed0
no director_name_flag for 600029 3c29183e2343c78126eb686943d79ed0
no director_name_flag for 600029 3c29183e2343c78126eb686943d79ed0
no director_name_flag for 600029 3c29183e2343c78126eb686943d79ed0
no director_name_flag for 600029 3c291

no director_name_flag for 600029 f78d4cc566c3d90fb988223c7a4c22e3
no director_name_flag for 600029 f78d4cc566c3d90fb988223c7a4c22e3
no director_name_flag for 600029 f78d4cc566c3d90fb988223c7a4c22e3
no director_name_flag for 600029 f78d4cc566c3d90fb988223c7a4c22e3
no director_name_flag for 600029 f78d4cc566c3d90fb988223c7a4c22e3
no director_name_flag for 600029 f78d4cc566c3d90fb988223c7a4c22e3
no director_name_flag for 600029 f78d4cc566c3d90fb988223c7a4c22e3
no director_name_flag for 600029 f78d4cc566c3d90fb988223c7a4c22e3
no director_name_flag for 600029 f78d4cc566c3d90fb988223c7a4c22e3
no director_name_flag for 600029 f78d4cc566c3d90fb988223c7a4c22e3
no director_name_flag for 600029 f78d4cc566c3d90fb988223c7a4c22e3
no director_name_flag for 600029 f78d4cc566c3d90fb988223c7a4c22e3
no director_name_flag for 600029 f78d4cc566c3d90fb988223c7a4c22e3
no director_name_flag for 600029 f78d4cc566c3d90fb988223c7a4c22e3
no director_name_flag for 600029 f78d4cc566c3d90fb988223c7a4c22e3
no directo