In [12]:
import boto3
import pandas as pd
import numpy as np
import re

In [1]:
AWS_S3_BUCKET = 'esgnie'

def get_s3_companies(s3_client, bucket='esgnie', comp_prefix='companies/'):
    ## get companies
    result = s3_client.list_objects(Bucket=bucket, Prefix=comp_prefix, Delimiter='/')
    l_subdir = [com_prefix.get('Prefix').replace(comp_prefix, '').replace('/', '')
                for com_prefix in result.get('CommonPrefixes')]
    return l_subdir

def get_company_files(s3_client, ticker, bucket='esgnie', filename_pattern=None, comp_prefix='companies'):
    result = s3_client.list_objects(Bucket=bucket, Prefix=f"{comp_prefix}/{ticker}/")
    df_files = pd.DataFrame()
    df_files['file'] = [obj['Key'] for obj in result['Contents']]
    df_files['filename'] = [file.split('/')[-1] for file in df_files['file']]
    if filename_pattern is not None:
        df_files['pattern_match'] = [1 if re.match(filename_pattern, file) else 0 for file in df_files['filename']]
    # df_files[df_files['pattern_match'] == 1]['filename'].tolist()
    return df_files[df_files['pattern_match'] == 1]['file'].tolist()

def write_csv_to_s3(s3_client, df, file, bucket='esgnie'):
    with io.StringIO() as csv_buffer:
        df.to_csv(csv_buffer, index=False)
        response = s3_client.put_object(
            Bucket=bucket, Key=file, Body=csv_buffer.getvalue()
        )
        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
    return status

In [5]:
def get_file_names(s3_client,ticker,bucket = 'esgnie'):
    l_files = get_company_files(s3_client, ticker, filename_pattern = "^df_words")
    files = [re.findall(r"(?<=df_words_).*(?=_pagenum)", l_files[i])[0] for i in range(len(l_files))]
    return list(set(files))

In [4]:
s3_client = boto3.client('s3')

In [6]:
ticker = "500355"

In [17]:
file_names = get_file_names(s3_client, ticker)

In [18]:
file_names

['Consolidated-Financial-Statements',
 '5c3699be-4257-4df8-b87c-208aa6271eb7',
 'Boards-Report',
 '2cb0ee35-8ac4-4bf5-9bbd-19fe797e1a2b',
 '2dc8a905-4320-4882-9cb6-c73a5890f468',
 '61658713-a999-43b5-b082-b3a3ff4542b7',
 '3e716270-48a2-4eec-8797-90502884efcf']

In [54]:
def parse(ticker, filename):
    
    l_files = get_company_files(s3_client, ticker=ticker, filename_pattern='^df_words_%s'%filename)
    
    overall1 = np.empty((0,25))
    
       
    
    
    for file in l_files[0:10]:
        obj = s3_client.get_object(Bucket='esgnie', Key=file)
        df = pd.read_csv(obj['Body'])
        
        
        #add numeric flag
        
        is_numeric = []

        for text in list(df.text):  
            t="".join(c for c in text if c.isalnum())
            is_numeric.append(t.isnumeric())

        df["numeric_flag"] = is_numeric
        
        
        
        mask1 = df["director_name_flag"] > 60
        mask2 = df["board_flag"] > 0        
        
        
        overall1 = np.concatenate((overall1, np.array(df[mask1 & mask2])))
        
    overall1 = pd.DataFrame(overall1, columns = df.columns)
    
    
    
        
    mask3 = overall.row_num.isin(overall[overall.director_name_flag > 60].row_num)
    mask4 = overall.numeric_flag == True
    if (overall[mask3 & mask4].empty
        
    
    
    
    overall1.drop(list(overall.columns[1:9]) + ["file"], axis = 1, inplace = True)
        
    pages = overall1.pagenum.unique()    
    
    
    
    
    sums = []
    proportions = []
    pagenum = []
    filenames = []
    tickers = []

    
    for page in pages:
        
        file = get_company_files(s3_client, ticker=ticker, filename_pattern = '^df_words_%s_pagenum-%s'%(filename, page))[0]
        obj = s3_client.get_object(Bucket='esgnie', Key=file)
        df = pd.read_csv(obj['Body'])
        

        #if (df.director_name_flag == 100).any() == True:
        
        num_words = len(df)

        def flag_sum(df):
            return df.sum()

        def flag_proportion(df):
            return len(df[df > 50])

        flag_sum = df[df.columns[-10:]].apply(flag_sum).sum()
        flag_proportion = df[df.columns[-10:]].apply(flag_proportion).sum()/num_words

        sums.append(flag_sum)
        proportions.append(flag_proportion)
        pagenum.append(df.pagenum.iloc[0])
        filenames.append(df.filename.iloc[0])
        tickers.append(ticker)

        
        
    overall2 = pd.DataFrame({"ticker": tickers, "filename": filenames,
                            "pagenum": pagenum, "flag_sum": sums, "flag_proportion": proportions})

    overall2.sort_values(by=['filename','flag_sum', "flag_proportion"], ascending = False, inplace = True)
    

    
    return overall1, overall2

In [55]:
overall1, overall2 = parse(ticker, file_names[2])

In [81]:
overall1.iloc[9].director_name_flag = 60
overall1

Unnamed: 0,text,x0,x1,top,bottom,upright,direction,xbar,ybar,row_num,...,executive_flag,board_flag,commission_flag,fee_flag,committee_flag,remuneration_flag,meeting_flag,attendance_flag,governance_flag,director_name_flag
0,Bhaskar,473.368,504.48,670.297,679.297,True,1,488.924,674.797,46.0,...,0,50,12,0,0,11,0,24,12,100
1,Bhat,506.343,524.352,670.297,679.297,True,1,515.3475,674.797,46.0,...,15,44,0,0,15,25,18,29,14,100
2,Bhaskar,473.368,504.48,518.219,527.219,True,1,488.924,522.719,31.0,...,0,50,12,0,0,11,0,24,12,100
3,Bhat,506.343,524.352,518.219,527.219,True,1,515.3475,522.719,31.0,...,15,44,0,0,15,25,18,29,14,100
4,Mukundan,145.745,184.84,360.483,369.483,True,1,165.2925,364.983,20.0,...,12,15,22,0,12,50,27,44,33,100
5,Punita,136.349,159.056,376.872,385.872,True,1,147.7025,381.372,21.0,...,27,18,12,0,27,33,15,25,25,100
6,Kumar,161.027,184.426,376.872,385.872,True,1,172.7265,381.372,21.0,...,14,40,13,0,14,24,17,13,13,100
7,Sinha,186.397,206.188,376.872,385.872,True,1,196.2925,381.372,21.0,...,14,20,40,0,14,24,33,27,27,100
8,Sanjiv,137.681,158.642,393.261,402.261,True,1,148.1615,397.761,22.0,...,27,18,25,0,13,22,15,25,25,100
9,Lal,160.613,170.639,393.261,402.261,True,1,165.626,397.761,22.0,...,0,25,0,0,0,13,0,15,15,60


In [86]:
(overall1.director_name_flag == 60).any()

True

In [57]:
overall2

Unnamed: 0,ticker,filename,pagenum,flag_sum,flag_proportion
0,500355,Boards-Report,15,115988,0.131532
2,500355,Boards-Report,18,110591,0.16763
1,500355,Boards-Report,17,87676,0.177778


# Debugging

1. Find whether page has director_name_flag > 60
2. If have, then find whether there are rows with director_names whose numeric_flag is true
3. If have, then add this page to list of pages to parse
2. Parse flag ratios for pages

In [30]:
overall.drop(list(overall.columns[1:9]) + ["file"], axis = 1)

Unnamed: 0,text,row_num,pagenum,company,filename,director_flag,executive_flag,board_flag,commission_flag,fee_flag,committee_flag,remuneration_flag,meeting_flag,attendance_flag,governance_flag,director_name_flag
0,Bhaskar,46.0,15,,Boards-Report,13,0,50,12,0,0,11,0,24,12,100
1,Bhat,46.0,15,,Boards-Report,17,15,44,0,0,15,25,18,29,14,100
2,Bhaskar,31.0,17,,Boards-Report,13,0,50,12,0,0,11,0,24,12,100
3,Bhat,31.0,17,,Boards-Report,17,15,44,0,0,15,25,18,29,14,100
4,Mukundan,20.0,18,,Boards-Report,12,12,15,22,0,12,50,27,44,33,100
5,Punita,21.0,18,,Boards-Report,29,27,18,12,0,27,33,15,25,25,100
6,Kumar,21.0,18,,Boards-Report,15,14,40,13,0,14,24,17,13,13,100
7,Sinha,21.0,18,,Boards-Report,15,14,20,40,0,14,24,33,27,27,100
8,Sanjiv,22.0,18,,Boards-Report,14,27,18,25,0,13,22,15,25,25,100
9,Lal,22.0,18,,Boards-Report,0,0,25,0,0,0,13,0,15,15,100


In [38]:
(overall.commission_flag == 12).any() == True

True

In [47]:
for i in (overall.pagenum.unique()):
    print(i)

16
15
18
17


In [48]:
filename = "Boards-Report"
pagenum = 18
'^df_words_%s_pagenum-%s'%(filename, pagenum)

'^df_words_Boards-Report_pagenum-18'

In [59]:
file = get_company_files(s3_client, ticker=ticker, filename_pattern = '^df_words_%s_pagenum-%s'%(filename, pagenum))[0]
obj = s3_client.get_object(Bucket='esgnie', Key=file)
df = pd.read_csv(obj['Body'])

In [88]:
(df[df.row_num.isin(df[df.director_name_flag > 60].row_num)].numeric_flag == True). any()

True

In [74]:
mask1 = df.row_num.isin(df[df.director_name_flag > 60].row_num)
mask2 = df.numeric_flag == True
df[mask1 & mask2].empty

False

In [71]:
is_numeric = []

for text in list(df.text):  
    t="".join(c for c in text if c.isalnum())
    is_numeric.append(t.isnumeric())

df["numeric_flag"] = is_numeric
df

Unnamed: 0,text,x0,x1,top,bottom,upright,direction,xbar,ybar,row_num,...,board_flag,commission_flag,fee_flag,committee_flag,remuneration_flag,meeting_flag,attendance_flag,governance_flag,director_name_flag,numeric_flag
0,Company,178.138,218.384,39.452,48.452,True,1,198.2610,43.952,0.0,...,33,47,0,38,32,29,24,35,35,False
1,overview,220.256,261.721,39.452,48.452,True,1,240.9885,43.952,0.0,...,31,22,36,35,30,27,22,56,22,False
2,Statutory,306.346,354.170,39.452,48.452,True,1,330.2580,43.952,0.0,...,29,21,0,22,29,12,32,21,32,False
3,reportS,355.961,393.529,39.452,48.452,True,1,374.7450,43.952,0.0,...,33,24,20,25,42,29,12,24,32,False
4,FinanCial,432.320,474.675,39.452,48.452,True,1,453.4975,43.952,0.0,...,14,21,17,22,29,25,42,42,32,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514,year-on-year,241.631,289.125,671.018,680.018,True,1,265.3780,675.518,41.0,...,50,24,20,25,32,14,24,35,35,False
515,Annual,408.513,438.393,724.249,734.249,True,1,423.4530,729.249,42.0,...,18,12,0,0,22,15,38,25,33,False
516,Report,440.513,468.942,724.249,734.249,True,1,454.7275,729.249,42.0,...,36,12,22,27,44,31,12,25,25,False
517,2020-21,471.062,504.912,724.249,734.249,True,1,487.9870,729.249,42.0,...,0,0,0,0,0,0,0,0,12,True
