# output metric

In [30]:
# Keyword phrases to search for, grouped by the metric they indicate.
# Iteration order of this dict is the order in which metrics are processed.
output_metric = {
    "Emissions": [
        "emission",
        "co2 emission",
        "ghg emission",
        "emission reduction",
        "emission target",
        "Emissions Policy",
    ],
    "Water": [
        "Water Withdrawal",
        "Water Discharged",
        "Water Recycled",
        "water risk",
    ],
    "Energy": [
        "Energy Consumed",
        "Renewable Energy",
    ],
    "Business Ethics": [
        "Women Executives",
        "Women Board Members",
        "Board Members",
        "Committee Independence",
        "ESG Sustainability Reporting",
    ],
    "Labor Practices": [
        "Trade Union",
        "CEO Salary",
        "Average Salary",
        "Employee Turnover",
        "Avg Training Hours",
    ],
    "Employee Engagement, Diversity & Inclusion": [
        "Women Employees",
        "Women Managers",
        "Minority Employees",
    ],
    "Employee Health & Safety": [
        "Lost Time",
        "Injury Rate",
    ],
    "Waste": [
        "Waste",
    ],
}

# tables

In [31]:
import camelot
import os

def chunks(l, n):
    """Lazily yield consecutive slices of ``l``, each at most ``n`` items long."""
    slice_starts = range(0, len(l), n)
    yield from (l[start : start + n] for start in slice_starts)
def get_chunks(filepath, pages, chunk=10):
    """
    Split the pages selected from a PDF into groups of at most ``chunk`` pages.

    filepath : str
        Filepath or URL of the PDF file.
    pages : str
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    chunk : int, optional (default: 10)
        Maximum number of pages per group.

    Returns the list of page groups produced by ``chunks``.
    """
    # camelot's PDFHandler turns the page specification into a page list.
    # Note that _get_pages is a private camelot method.
    selected_pages = camelot.handlers.PDFHandler(filepath)._get_pages(pages=pages)
    return list(chunks(selected_pages, chunk))

In [32]:
def check_information_table(tables):
    """
    Record, for every output metric, the pages of the tables that mention it.

    Each metric's phrases in the global ``output_metric`` are joined into one
    case-insensitive regular expression and searched in every column of every
    table. On the first matching column ``str(table.page)`` is appended to the
    global ``position_record_table[metric]``, i.e. at most once per table and
    metric.

    tables : iterable
        Table objects exposing ``.df`` (a DataFrame whose columns are labelled
        0..n-1) and ``.page``, e.g. the result of ``camelot.read_pdf``.
    """
    # The patterns depend only on output_metric, so build them once instead of
    # once per table column.
    patterns = {metric: "|".join(phrases) for metric, phrases in output_metric.items()}
    for table in tables:
        table_df = table.df
        column_count = table_df.shape[1]
        for metric, pattern in patterns.items():
            for col in range(column_count):
                # na=False makes a missing cell count as "no match"; without it
                # the result holds NaN, which is truthy and looks like a hit.
                matches = table_df[col].str.contains(pattern, case=False, regex=True, na=False)
                if matches.any():
                    position_record_table[metric].append(str(table.page))
                    # One hit is enough for this table; move on to the next metric.
                    break

In [33]:
def parseTable_camelot(input_path, file, output_path, pages = "all"):
    """
    Read the tables of one PDF with camelot, a group of pages at a time, and
    pass every non-empty result to ``check_information_table``.

    input_path : str
        Directory containing the PDF.
    file : str
        File name of the PDF inside ``input_path``.
    output_path : str
        Currently unused (the Excel export below is disabled).
    pages : str, optional (default: "all")
        Page specification forwarded to ``get_chunks``.

    Any exception is caught and printed; nothing is returned.
    """
    print("parsing table from " + file + " at "+ pages + " pages " )
    try:
        filepath = os.path.join(input_path, file)
        drop_brackets = {ord("["): None, ord("]"): None}
        for page_group in get_chunks(filepath, pages=pages):
            # Turn the printed list, e.g. "[1, 2, 3]", into "1, 2, 3".
            page_spec = str(page_group).translate(drop_brackets)
            tables = camelot.read_pdf(filepath, pages=page_spec)
            if len(tables) == 0:
                print("no tables found for " + file)
                continue
            check_information_table(tables)
            # Disabled export:
            # tables.export(os.path.join(output_path, file.replace("pdf", "xlsx")), f='excel')
    except Exception as e:
        print("error parsing table from " + file)
        print(e)

In [34]:
import tabula
import pandas as pd
def parseTable_tabula(input_path, file, output_path, pages="all"):
    """
    Extract the tables of one PDF with tabula and save them to an Excel workbook.

    input_path : str
        Directory containing the PDF.
    file : str
        File name of the PDF inside ``input_path``.
    output_path : str
        Directory that receives ``<file stem>.xlsx``; each table is written to
        its own sheet, named by its index.
    pages : optional (default: "all")
        Page selection forwarded to ``tabula.read_pdf``.

    Any exception is caught and printed; nothing is returned.
    """
    # An f-string also copes with a non-string ``pages`` value.
    print(f"parsing table from {file} at pages {pages}")
    try:
        tables = tabula.read_pdf(os.path.join(input_path, file), pages=pages, lattice=True)
        if len(tables) > 0:
            print(f"saving {len(tables)} tables")
            # Replace only the extension; str.replace("pdf", "xlsx") would also
            # rewrite any "pdf" inside the file name itself.
            workbook_name = os.path.splitext(file)[0] + ".xlsx"
            with pd.ExcelWriter(os.path.join(output_path, workbook_name)) as f_obj:
                for i, table in enumerate(tables):
                    table.to_excel(f_obj, sheet_name=str(i))
        else:
            print("no tables found for " + file)
    except Exception as e:
        print("error parsing table from " + file)
        print(e)


# texts

## normalize 

In [35]:
import nltk
import math
import string
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [36]:
def get_tokens(text):
    """
    Lower-case the text, delete every character in ``string.punctuation`` and
    split the result into tokens with ``nltk.word_tokenize``.
    """
    strip_punctuation = str.maketrans("", "", string.punctuation)
    cleaned = text.lower().translate(strip_punctuation)
    return nltk.word_tokenize(cleaned)
def stem_tokens(tokens, stemmer):
    """
    Apply ``stemmer.stem`` to every token and return the results as a list,
    in the same order.
    """
    return [stemmer.stem(token) for token in tokens]
def normalize(text):
    """
    Tokenize the text, drop English stop words and stem the remaining tokens.

    Returns the stemmed tokens as a list, in their original order.
    """
    tokens = get_tokens(text)
    # Fetch the stop-word list once and keep it in a set. The previous filter
    # called stopwords.words('english') again for every single token and then
    # scanned the returned list linearly.
    english_stopwords = set(stopwords.words('english'))
    filtered = [w for w in tokens if w not in english_stopwords]
    stemmer = SnowballStemmer("english")  # PorterStemmer is the imported alternative
    return stem_tokens(filtered, stemmer)

In [None]:
# Demo: run normalize() on a sample passage; the cell displays the resulting list of stemmed tokens.
normalize("GHG Emissions Disclosure (cont.) 3.Carbon dioxide equivalent (CO2e) emissions are inclusive of carbon dioxide (CO2),  nitrous oxide (N2O), methane (CH4), and industrial gases such as hydrofluorocarbons  (HFCs), and sulfur hexafluoride (SF6).  Perfluorocarbons (PFCs) and nitrogen trifluoride  (NF3) are not emitted by Tesla’s sites. These carbon dioxide equivalent emissions utilize  Global Warming Potentials (GWPs) defined by the Intergovernmental Panel on Climate  Change’s (IPCC) Fifth Assessment Report (AR5 – 100 year) unless a different  Assessment Report is already embedded in the emission factor source. Carbon dioxide  equivalent emissions are calculated by multiplying actual or estimated energy and fuel  usage by the relevant emission factor taking into account the equivalent GWP. All  emission factors are updated annually where applicable. Management Assertion Scope 1 & 2 GHG Emissions")

## synonyms

In [1]:
from nltk.corpus import wordnet as wn

def find_synonyms(word):
    """
    Collect the lemma names of every WordNet synset of ``word``.

    The names are returned in synset order and duplicates are kept.
    """
    return [lemma.name() for synset in wn.synsets(word) for lemma in synset.lemmas()]

## TF-IDF

In [39]:
def tf(word, count):
    """
    Term frequency: the count of ``word`` divided by the total of all counts.

    Returns 0 when the total is not positive (e.g. an empty mapping).
    """
    total = sum(count.values())
    if total > 0:
        return count[word] / total
    return 0
def n_containing(word, count_list):
    """Count how many entries of ``count_list`` contain ``word``."""
    hits = 0
    for document_counts in count_list:
        if word in document_counts:
            hits += 1
    return hits
def idf(word, count_list):
    """
    Inverse-document-frequency weight of ``word`` over ``count_list``.

    Computed as log(N) / (1 + df), where N is the number of documents and df
    the number of documents containing the word.
    NOTE(review): the textbook form is log(N / (1 + df)); confirm that dividing
    the logarithm by (1 + df) is intended.
    """
    document_total = len(count_list)
    documents_with_word = n_containing(word, count_list)
    return math.log(document_total) / (1 + documents_with_word)
def tfidf(word, count, count_list):
    """Product of ``tf(word, count)`` and ``idf(word, count_list)``."""
    term_frequency = tf(word, count)
    inverse_document_frequency = idf(word, count_list)
    return term_frequency * inverse_document_frequency

def cal_tfidf(text, value_word):
    """
    Score ``text`` against a list of keyword phrases.

    Both the text and the phrases are passed through ``normalize``. The score
    is the sum, over the distinct keyword stems, of the stem's term frequency
    in the text times its weight in the global ``keywords_idf``.
    """
    term_counts = Counter(normalize(text))
    keyword_stems = list(set(normalize(" ".join(value_word))))
    return sum(tf(stem, term_counts) * keywords_idf[stem] for stem in keyword_stems)
    
# def cal_tfidf(text_list, key_words=[]):
#     countlist = []
#     for text in text_list:
#         countlist.append(cal_tf(text))
#     for i, count in enumerate(countlist):
#         print("Top words in document {}".format(i + 1))
#         scores = {word: tfidf(word, count, countlist) for word in count}
#         sorted_words = sorted(scores.items(), key = lambda x: x[1], reverse=True)
#         for word, score in sorted_words[:1]:
#             print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

In [40]:
import os
import PyPDF2
import pandas as pd

# Build the IDF weight of every keyword stem from a reference corpus of PDFs.
input_path_idf = "./pdf_files_idf/pdf_files/"
pdf_files = os.listdir(input_path_idf)
# Number of usable documents; decremented for every file that cannot be read.
documents = len(pdf_files)

# Stemmed vocabulary of all keyword phrases in output_metric.
keywords = set(normalize(" ".join(" ".join(value) for value in output_metric.values())))
# stem -> number of corpus documents in which the stem occurs
keywords_dict = {key: 0 for key in keywords}
for file in pdf_files:
    try:
        word_list = set()
        # "reader" rather than "object", which would shadow the builtin for
        # the rest of the notebook.
        reader = PyPDF2.PdfFileReader(os.path.join(input_path_idf, file))
        for i in range(reader.getNumPages()):
            text = reader.getPage(i).extractText().replace('\n', ' ')
            # Remove the first occurrence of the 1-based page number from the text.
            text = text.replace(str(i + 1), "", 1)
            word_list.update(normalize(text))
        for word in keywords_dict:
            if word in word_list:
                keywords_dict[word] += 1
    except Exception as e:
        # Skip unreadable files but say why. A bare "except:" would also
        # swallow KeyboardInterrupt and hide the cause.
        documents -= 1
        print(f"{file} is invalid! ({e})")
# math.log(0) raises; fall back to a zero weight when no document was usable.
log_documents = math.log(documents) if documents > 0 else 0.0
# NOTE(review): the weight is log(N) / (1 + df), not the textbook
# log(N / (1 + df)); kept as-is, confirm this is intended.
keywords_idf = {key: log_documents / (1 + value) for key, value in keywords_dict.items()}

Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.


In [41]:
# Show the keyword stems ordered from highest to lowest IDF weight.
sorted(keywords_idf.items(), key=lambda stem_weight: stem_weight[1], reverse=True)

[('avg', 1.0729586082894003),
 ('discharg', 0.45983940355260006),
 ('co2', 0.24760583268216926),
 ('withdraw', 0.22991970177630003),
 ('union', 0.22991970177630003),
 ('lost', 0.22991970177630003),
 ('salari', 0.22991970177630003),
 ('turnov', 0.21459172165788004),
 ('minor', 0.15327980118420004),
 ('injuri', 0.15327980118420004),
 ('esg', 0.1463125374940091),
 ('trade', 0.1463125374940091),
 ('ghg', 0.13995112282035654),
 ('ceo', 0.13995112282035654),
 ('averag', 0.13411982603617503),
 ('reduct', 0.13411982603617503),
 ('hour', 0.13411982603617503),
 ('consum', 0.13411982603617503),
 ('women', 0.13411982603617503),
 ('independ', 0.13411982603617503),
 ('polici', 0.128755032994728),
 ('recycl', 0.128755032994728),
 ('water', 0.128755032994728),
 ('member', 0.128755032994728),
 ('renew', 0.128755032994728),
 ('committe', 0.128755032994728),
 ('target', 0.128755032994728),
 ('execut', 0.128755032994728),
 ('risk', 0.128755032994728),
 ('train', 0.12380291634108463),
 ('sustain', 0.123802

## tightness

In [None]:
import itertools

def get_distance(key_word, item):
    """
    Measure how far apart two words occur in a token sequence.

    key_word : sequence of two items
        The pair ``(w1, w2)`` to look for.
        NOTE(review): the previous body read undefined names ``w1`` and ``w2``
        and never used ``key_word``; unpacking the pair from ``key_word`` is
        the assumed intent -- confirm against the intended caller.
    item : sequence
        Tokens to search; positions are indexes into this sequence.

    Returns ``{'min': ..., 'avg': ...}`` computed over the absolute index
    differences of every (w1 position, w2 position) pair, or the number 100
    when either word does not occur in ``item``.
    """
    w1, w2 = key_word
    w1_indexes = [index for index, value in enumerate(item) if value == w1]
    w2_indexes = [index for index, value in enumerate(item) if value == w2]
    if not w1_indexes or not w2_indexes:
        # Sentinel for "not both present" (a number, unlike the dict above).
        return 100
    distances = [abs(first - second) for first, second in itertools.product(w1_indexes, w2_indexes)]
    return {'min': min(distances), 'avg': sum(distances) / float(len(distances))}
def get_proximity(distance):
    """Return the inverse of the squared distance, 1 / distance**2."""
    squared = distance * distance
    return 1 / squared

##  calculate the relevance

In [42]:
import re
def score_function(tfidf, tight=1):
    """
    Combine a TF-IDF score with a tightness factor by multiplying them.

    tfidf : number
        TF-IDF score of the text.
    tight : number, optional (default: 1)
        Tightness factor. The default of 1 leaves the TF-IDF score unchanged,
        so callers that have no tightness value can omit it.
    """
    return tfidf * tight
def cal_relevance(page, text):
    """
    Score one page of text against every output metric.

    For each metric in the global ``output_metric`` the tuple
    ``(str(page), score)`` is appended to the global
    ``position_record_text[metric]``, where ``score`` combines the TF-IDF
    score of the text for that metric's keywords with a tightness factor.
    No regular-expression pre-filter is applied: every page gets a score for
    every metric.

    page : int or str
        Page identifier, stored as ``str(page)``.
    text : str
        Extracted text of the page.
    """
    # No tightness value is computed for the page, so pass the neutral factor
    # explicitly; score_function was previously called with this argument
    # missing.
    neutral_tightness = 1
    for key_word, value_words in output_metric.items():
        relevance = score_function(cal_tfidf(text, value_words), neutral_tightness)
        position_record_text[key_word].append((str(page), relevance))

In [43]:
import PyPDF2
def parseText(input_path, file, output_path, pages):
    """
    Extract the text of every page of a PDF and score it with ``cal_relevance``.

    input_path : str
        Directory containing the PDF.
    file : str
        File name of the PDF inside ``input_path``.
    output_path : str
        Currently unused (writing the extracted text to disk is disabled).
    pages : str
        Only used in the progress message; every page is always processed.

    Pages are reported to ``cal_relevance`` with 1-based numbers, the same
    value whose first occurrence is stripped from the page text below.
    NOTE(review): this assumes the table path's ``table.page`` values, which
    the main cell merges with these, are 1-based as well -- confirm.
    """
    print("parsing text from " + file + " at " + pages + " pages ")
    reader = PyPDF2.PdfFileReader(os.path.join(input_path, file))
    for i in range(reader.getNumPages()):
        page_number = i + 1
        text = reader.getPage(i).extractText().replace('\n', ' ')
        # Remove the first occurrence of the page number from the text.
        text = text.replace(str(page_number), "", 1)
        cal_relevance(page_number, text)

In [44]:
# with open("./text_files/2021-tesla-impact-report.text", "r") as f_obj:
#     text_list = f_obj.readlines()
# cal_tfidf(text_list)

# main

In [45]:
import os
import pandas as pd

input_path = "./pdf_files"
output_table_camelot = "./table_files_camelot"
output_table_tabula = "./table_files_tabula"
output_path_text = "./text_files"
output_path_csv = "./data_extraction/"
# Number of best-scoring text pages kept per metric.
top_pages_per_metric = 3

# DataFrame.to_csv does not create missing directories.
os.makedirs(output_path_csv, exist_ok=True)
# Skip anything that is not a PDF (hidden files, sub-directories, ...), which
# would otherwise abort the whole loop inside the PDF reader.
pdf_files = [entry for entry in os.listdir(input_path) if entry.lower().endswith(".pdf")]
name = ["output_metric", "page"]
for file in pdf_files:
    # 1 parse the tables and the text
    # These two dicts are module-level on purpose: check_information_table and
    # cal_relevance append to them as globals, so they are reset per file.
    position_record_table = {key_word: [] for key_word in output_metric}
    position_record_text = {key_word: [] for key_word in output_metric}
    # parseTable_camelot(input_path, file, output_table_camelot, pages="all")
    # parseTable_tabula(input_path, file, output_table_tabula, "all")
    parseText(input_path, file, output_path_text, "all")
    # 2 keep the best-scoring pages of each metric, highest score first
    for keys in position_record_text:
        ranked = sorted(position_record_text[keys], key=lambda x: x[1], reverse=True)
        position_record_text[keys] = [x[0] for x in ranked[:top_pages_per_metric]]
    # 3 save the result
    # dict.fromkeys removes duplicate pages while keeping first-seen order;
    # list(set(...)) discarded the ranking and varied from run to run.
    position_record = {
        key_word: list(dict.fromkeys(position_record_table[key_word] + position_record_text[key_word]))
        for key_word in output_metric
    }
    position_matchs = [[key, " ".join(value)] for key, value in position_record.items()]
    data = pd.DataFrame(columns=name, data=position_matchs)
    # Swap the extension via splitext so ".PDF" and names containing ".pdf"
    # elsewhere are handled too.
    data.to_csv(os.path.join(output_path_csv, os.path.splitext(file)[0] + "_text_tfidf.csv"))

parsing text from 2021-tesla-impact-report.pdf at all pages 
