In [4]:
!pip install PyPDF2
!pip install pycryptodome

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m204.8/232.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting pycryptodome
  Downloading pycryptodome-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycryptodome
Successfully installed pycryptodome-3.20.0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Step 1

In [5]:
### Step 1: Extract text from PDf and store it in a database

from PyPDF2 import PdfReader
import pandas as pd
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')

def tokenize_sentences(text):
    sentences = sent_tokenize(text)
    return sentences

def process_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    num_pages = len(reader.pages)

    # Initialize an empty DataFrame to store the results
    df_combined = pd.DataFrame(columns=['Sentences'])

    for page_number in range(1, num_pages + 1):
        # Extract text from each page
        page = reader.pages[page_number - 1]
        extracted_text = page.extract_text()

        # Tokenize sentences using NLTK
        sentences = tokenize_sentences(extracted_text)

        # Create a DataFrame with the sentences
        df_page = pd.DataFrame({'Sentences': sentences})

        # Concatenate the current page's DataFrame to the combined DataFrame
        df_combined = pd.concat([df_combined, df_page], ignore_index=True)

    return df_combined

# Specify the path for the PDF file
pdf_path = '/content/drive/MyDrive/2024/CAS/shell-sustainability-report-2022-final_unlocked_OCR.pdf'

# Process the PDF and get the combined DataFrame
df_result = process_pdf(pdf_path)

# Specify the path for the final CSV file
csv_path = '/content/drive/MyDrive/2024/CAS/MAIN_sentences_dataframe2022.csv'

# Save the combined DataFrame to a CSV file
df_result.to_csv(csv_path, index=False)

# Display the combined DataFrame
print(df_result.head())
print(f"Tokenized sentences have been saved to: {csv_path}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


                                           Sentences
0  Responsible \nenergyShell pic\nSustainability ...
1  Contents\nSustainability at Shell Powering liv...
2  The online version \nincludes additional infor...
3  In the event of any discrepancy \nbetween the ...
4  This hardcopy version is provided for \nthe re...
Tokenized sentences have been saved to: /content/drive/MyDrive/2024/CAS/MAIN_sentences_dataframe2022.csv


Step 2

In [6]:
###  Step 2: Truncate (shorten) the sentences to 200 words or less
import pandas as pd

def process_text(text):
    max_words_per_chunk = 200
    words = text.split()
    chunks = [words[i:i + max_words_per_chunk] for i in range(0, len(words), max_words_per_chunk)]
    return chunks

def process_csv(input_csv_path, output_csv_path):
    # Read the CSV file
    df = pd.read_csv(input_csv_path)

    # Create a new DataFrame for the updated data
    updated_data = []

    # Process each row in the first column
    for index, row in df.iterrows():
        text = str(row.iloc[0])  # Assuming the first column is the relevant one
        words_count = len(text.split())
        if words_count > 200:
            chunks = process_text(text)
            for chunk in chunks:
                updated_data.append([chunk])

    # Create a new DataFrame with the processed data
    updated_df = pd.DataFrame(updated_data, columns=[df.columns[0]])

    # Concatenate the original DataFrame and the updated DataFrame
    final_df = pd.concat([df, updated_df], ignore_index=True)

    # Save the updated DataFrame to a new CSV file
    final_df.to_csv(output_csv_path, index=False)

# Replace 'input.csv' and 'output.csv' with your actual file paths
process_csv('/content/drive/MyDrive/2024/CAS/MAIN_sentences_dataframe2022.csv', '/content/drive/MyDrive/2024/CAS/Truncated12022.csv')

Step 3


In [7]:
## Step 3 : Generate E, S and G score for the report

import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import torch

# Load the ESG scoring model from Hugging Face
model_name = "yiyanghkust/finbert-esg"
model = BertForSequenceClassification.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)
esg_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Function to get E, S, G scores for a sentence
def get_esg_scores(sentence):
    try:
        scores = esg_pipeline(sentence)
        return {score['label']: score['score'] for score in scores}
    except RuntimeError as e:
        print(f"Error for sentence. Skipping row.")
        return None

# Function to calculate average scores from a DataFrame
def calculate_average_scores(df):
    average_scores = {}
    for label in ['Environmental', 'Social', 'Governance']:
        column_name = f'{label}_Score'
        non_blank_entries = df[column_name].count()
        sum_scores = df[column_name].sum(skipna=True)
        average_score = sum_scores / non_blank_entries if non_blank_entries > 0 else 0
        average_scores[label] = average_score
    return average_scores

# Function to process CSV and calculate average scores
def process_csv_and_calculate_average(csv_path):
    # Load the CSV file
    df = pd.read_csv(csv_path)

    # Iterate over each row and update E, S, G scores
    for index, row in df.iterrows():
        sentence = row['Sentences']  # Replace with the actual column name
        scores = get_esg_scores(sentence)

        if scores is not None:
            for label, score in scores.items():
                df.at[index, f'{label}_Score'] = score

    # Calculate and return average scores
    return calculate_average_scores(df)

# Replace 'input.csv' with your actual file path
csv_path = '/content/drive/MyDrive/2024/CAS/Truncated12022.csv'
average_scores = process_csv_and_calculate_average(csv_path)

# Print average scores
print("Average E Score:", average_scores['Environmental'])
print("Average S Score:", average_scores['Social'])
print("Average G Score:", average_scores['Governance'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/781 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Average E Score: 0.9424051227693033
Average S Score: 0.8876834494226119
Average G Score: 0.778675747367571
