In [None]:
!pip install PyPDF2
!pip install pycryptodome

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m7.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting pycryptodome
  Downloading pycryptodome-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycryptodome
Successfully installed pycryptodome-3.20.0


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; every file path used in the cells below lives under it.
drive.mount('/content/drive')

Mounted at /content/drive


Step 1

In [None]:
### Step 1: Extract text from the PDF, split it into sentences and store them in a CSV file

from PyPDF2 import PdfReader
import pandas as pd
from nltk.tokenize import sent_tokenize
import nltk
# Fetch NLTK's 'punkt' package (presumably the resource sent_tokenize relies on — confirm).
# Per the recorded output, NLTK only reports "already up-to-date" when it is present.
nltk.download('punkt')

def tokenize_sentences(text):
    """Return the result of NLTK's ``sent_tokenize`` applied to ``text``."""
    return sent_tokenize(text)

def process_pdf(pdf_path):
    """Extract the text of every page of a PDF and return its sentences.

    Args:
        pdf_path: Location of the PDF; passed straight to ``PdfReader``.

    Returns:
        A DataFrame with a single ``'Sentences'`` column containing the
        sentences of all pages in page order, indexed 0..n-1. A PDF without
        extractable sentences yields an empty frame with that column.
    """
    reader = PdfReader(pdf_path)

    # Collect the sentences in a plain list and build the DataFrame once:
    # growing a DataFrame with pd.concat inside the loop re-copies all earlier
    # rows for every page and concatenates with an initially empty frame.
    all_sentences = []
    for page in reader.pages:
        # Fall back to an empty string in case extract_text() yields None/"".
        extracted_text = page.extract_text() or ""
        all_sentences.extend(tokenize_sentences(extracted_text))

    return pd.DataFrame({'Sentences': all_sentences}, columns=['Sentences'])

# Input report (PDF) and output location (CSV) on the mounted Drive
pdf_path = '/content/drive/MyDrive/Colab Notebooks/shell-sustainability-report-2020_unlocked.pdf'
csv_path = '/content/drive/MyDrive/Colab Notebooks/shell_2020.csv'

# Tokenize the whole report into sentences, then persist them without the index column
df_result = process_pdf(pdf_path)
df_result.to_csv(csv_path, index=False)

# Preview the first rows and confirm where the file was written
print(df_result.head())
print(f"Tokenized sentences have been saved to: {csv_path}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                           Sentences
0  SUSTAINABILITY REPORT 2020\nROYAL DUTCH SHELL ...
1  Our strategy to accelerate the \ntransition to...
2  Powering Progress is designed to create value ...
3  The \nstrategy seeks to accelerate Shell’s tra...
4  Powering Progress is designed to create value ...
Tokenized sentences have been saved to: /content/drive/MyDrive/Colab Notebooks/shell_2020.csv


Step 2

In [None]:
### Step 2: Split sentences longer than 200 words into chunks of 200 words or fewer
import pandas as pd

def process_text(text, max_words_per_chunk=200):
    """Split ``text`` into consecutive chunks of at most ``max_words_per_chunk`` words.

    Args:
        text: String to split; words are delimited by whitespace (``str.split()``).
        max_words_per_chunk: Upper bound on the number of words per chunk.
            Defaults to 200, the value that used to be hard-coded.

    Returns:
        A list of chunks in original order, each chunk being a list of word
        strings. The list is empty when ``text`` contains no words.

    Raises:
        ValueError: If ``max_words_per_chunk`` is smaller than 1.
    """
    if max_words_per_chunk < 1:
        raise ValueError("max_words_per_chunk must be a positive integer")

    words = text.split()
    return [
        words[start:start + max_words_per_chunk]
        for start in range(0, len(words), max_words_per_chunk)
    ]

def process_csv(input_csv_path, output_csv_path):
    """Rewrite a CSV so that no entry of its first column exceeds 200 words.

    Reads ``input_csv_path``, splits every first-column entry longer than 200
    words with ``process_text`` and writes the result to ``output_csv_path``
    without the index column. Each over-long row is replaced, at its original
    position, by one row per chunk (values of any other columns are repeated);
    rows within the limit and missing values are written unchanged.

    Args:
        input_csv_path: CSV file to read; its first column holds the text.
        output_csv_path: Destination CSV file.
    """
    max_words = 200  # same limit as process_text's default chunk size

    df = pd.read_csv(input_csv_path)
    text_column = df.columns[0]  # the first column is the relevant one

    def _split_if_too_long(value):
        """Return the list of row values that should replace ``value``."""
        if pd.isna(value):
            return [value]
        text = str(value)
        if len(text.split()) <= max_words:
            return [value]
        # process_text returns lists of words; join them back into strings so
        # the CSV holds text rather than the repr of a Python list.
        return [" ".join(chunk) for chunk in process_text(text)]

    # Replace each over-long entry by its chunks instead of appending the chunks
    # after the untouched rows, which would leave the long entries in the output.
    df[text_column] = df[text_column].map(_split_if_too_long)
    final_df = df.explode(text_column, ignore_index=True)

    final_df.to_csv(output_csv_path, index=False)

# Chunk the Step 1 sentence file and write the result next to it
# (swap in your own input/output paths as needed)
process_csv(
    '/content/drive/MyDrive/Colab Notebooks/shell_2020.csv',
    '/content/drive/MyDrive/Colab Notebooks/shell_2020_output.csv',
)

Step 3


In [None]:
## Step 3 : Generate E, S and G score for the report

import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import torch

# ESG scoring checkpoint on the Hugging Face Hub: load its tokenizer and
# sequence-classification model, then wrap both in a text-classification pipeline
model_name = "yiyanghkust/finbert-esg"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)
esg_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Function to get E, S, G scores for a sentence
def get_esg_scores(sentence):
    """Classify one sentence with the ESG pipeline.

    Args:
        sentence: Text to classify. Non-string values (e.g. the float NaN that
            pandas produces for an empty CSV cell) and blank strings are skipped.

    Returns:
        A dict mapping each label returned by the pipeline to its score, or
        ``None`` when the sentence was skipped or the pipeline raised
        ``RuntimeError``.
    """
    # Skip inputs the tokenizer cannot handle instead of letting them raise an
    # exception that the RuntimeError handler below would not catch.
    if not isinstance(sentence, str) or not sentence.strip():
        return None
    try:
        # Truncate to 512 tokens (assumed to be this BERT model's maximum input
        # length — confirm); longer inputs otherwise fail inside the model and
        # the row is lost.
        scores = esg_pipeline(sentence, truncation=True, max_length=512)
        return {score['label']: score['score'] for score in scores}
    except RuntimeError as e:
        # Include the reason so skipped rows can be diagnosed.
        print(f"Error for sentence. Skipping row. ({e})")
        return None

# Function to calculate average scores from a DataFrame
def calculate_average_scores(df):
    """Average the per-label score columns of ``df``.

    For each of 'Environmental', 'Social' and 'Governance', the non-missing
    values of the ``'<label>_Score'`` column are summed and divided by their count.

    Args:
        df: DataFrame that may contain the three ``'<label>_Score'`` columns.

    Returns:
        A dict mapping each of the three labels to its average score; the
        average is 0 when the column is absent or has no non-missing value.
    """
    average_scores = {}
    for label in ['Environmental', 'Social', 'Governance']:
        column_name = f'{label}_Score'
        # The column is absent when no row was ever assigned this label; treat
        # that like an all-blank column instead of raising KeyError.
        if column_name not in df.columns:
            average_scores[label] = 0
            continue
        non_blank_entries = df[column_name].count()
        sum_scores = df[column_name].sum(skipna=True)
        average_scores[label] = sum_scores / non_blank_entries if non_blank_entries > 0 else 0
    return average_scores

# Function to process CSV and calculate average scores
def process_csv_and_calculate_average(csv_path):
    """Score every entry of the CSV's 'Sentences' column and average the results.

    Each sentence is passed to ``get_esg_scores``; every returned label/score
    pair is stored in that row's ``'<label>_Score'`` column, and rows for which
    ``None`` comes back are left untouched. The averages are then computed by
    ``calculate_average_scores`` on the resulting frame and returned.
    """
    frame = pd.read_csv(csv_path)

    for row_index, sentence in frame['Sentences'].items():
        label_scores = get_esg_scores(sentence)
        if label_scores is None:
            continue
        for label, score in label_scores.items():
            frame.at[row_index, f'{label}_Score'] = score

    return calculate_average_scores(frame)

# Score the chunked sentence file from Step 2 (point csv_path at your own file if needed)
csv_path = '/content/drive/MyDrive/Colab Notebooks/shell_2020_output.csv'
average_scores = process_csv_and_calculate_average(csv_path)

# Report the three averages
environmental_average = average_scores['Environmental']
social_average = average_scores['Social']
governance_average = average_scores['Governance']
print("Average E Score:", environmental_average)
print("Average S Score:", social_average)
print("Average G Score:", governance_average)


Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sentence. Skipping row.
Error for sent

Step 4: Identify the main topics and trends using topic modelling — here a pre-trained BERTopic model, as an alternative to techniques like LDA (Latent Dirichlet Allocation).

In [None]:
from bertopic import BERTopic
import pandas as pd

# Load the pre-trained BERTopic model once; BERTopic and pd are already
# imported at the top of this cell, so no second import or load is needed.
topic_model = BERTopic.load("MaartenGr/BERTopic_Wikipedia")

# Table of the model's topics. The get_topic_info helper defined below reads
# its 'Topic' and 'Name' columns through the module-level name df2.
topic_info = topic_model.get_topic_info()
df2 = pd.DataFrame(topic_info)

def get_topic_info(topic_model, sentence):
    """Return ``(topic_number, confidence_level, topic_name)`` for one sentence.

    The topic number and confidence level are the first entries of the two
    sequences returned by ``topic_model.transform``. The name is looked up in
    the module-level ``df2`` table by matching its 'Topic' column.
    """
    prediction = topic_model.transform(sentence)
    topic_number = prediction[0][0]
    confidence_level = prediction[1][0]

    # First 'Name' entry among the df2 rows whose 'Topic' equals the prediction
    matching_rows = df2[df2['Topic'] == topic_number]
    topic_name = matching_rows['Name'].iloc[0]

    return topic_number, confidence_level, topic_name

# Input sentences and output location for the topic assignments
# NOTE(review): this input is not the file written in Step 2 — confirm it is the intended one.
input_csv_file = '/content/drive/MyDrive/2024/CAS/Truncated1.csv'  # Replace with your input CSV file
output_csv_file = '/content/drive/MyDrive/2024/CAS/outputLDA.csv'  # Specify the output CSV file

df = pd.read_csv(input_csv_file)

# pd.read_csv has already consumed the header line, so index 0 is the first
# sentence and is processed like every other row.
for index, row in df.iterrows():
    sentence = row['Sentences']
    # Empty CSV cells are read as float NaN; leave their topic columns blank.
    if not isinstance(sentence, str):
        continue
    # Predict the topic and store it in the new columns of this row
    topic_number, confidence_level, topic_name = get_topic_info(topic_model, sentence)
    df.at[index, 'Topic Number'] = topic_number
    df.at[index, 'Confidence Level'] = confidence_level
    df.at[index, 'Topic Name'] = topic_name

# Save the DataFrame, including the topic columns, to a new CSV file
df.to_csv(output_csv_file, index=False)

Step 5.1 : Calculate the frequencies of "Topic Name" and sort them

In [None]:
# Next, calculate the frequency of each "Topic Name" and sort them
import pandas as pd

# Read the topic assignments written in Step 4
input_csv_file = "/content/drive/MyDrive/2024/CAS/outputLDA.csv"  # Replace with your input CSV file
df = pd.read_csv(input_csv_file)

# Frequency of each unique entry in the "Topic Name" column (most frequent first)
topic_name_counts = df['Topic Name'].value_counts()

# Keep the 2 most frequent topics
top_2_topics = topic_name_counts.head(2)

# Print each of them with its frequency and the "Confidence Level" of the first
# row that carries this topic name
print("Top 2 Topic Name Frequencies with Corresponding Confidence Levels:")
for topic_name, frequency in top_2_topics.items():
    confidence_level = df[df['Topic Name'] == topic_name]['Confidence Level'].iloc[0]
    print(f"Topic Name: {topic_name}, Frequency: {frequency}, Confidence Level: {confidence_level}")

Step 5.2: Store the most relevant "Topic names" in a single Python list

In [None]:
all_words = []

# Collect the keywords of the most frequent topics selected in Step 5.1
# (top_2_topics). Topic names are split on "_" and the numeric parts dropped.
for topic_name in top_2_topics.index:
    for word in topic_name.split("_"):
        # lstrip("-") so that a negative topic id such as "-1" is recognised as
        # a number too ("-1".isdigit() is False); empty fragments are skipped.
        if not word or word.lstrip("-").isdigit():
            continue
        # Keep each keyword once: all_words is later used as the set of
        # classification labels and as dictionary keys.
        if word not in all_words:
            all_words.append(word)

# Print the merged list of all words
print("Merged List of All Words:")
print(all_words)

Step 6: Thematic Analysis

In [None]:
import pandas as pd
from transformers import pipeline

# Load the zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Input sentences and output location for the thematic scores
input_csv_file = '/content/drive/MyDrive/2024/CAS/Truncated2021.csv'  # Replace with your input CSV file
output_csv_file = '/content/drive/MyDrive/2024/CAS/Thematic2021.csv'  # Specify the output CSV file

# Candidate labels: the keyword list built in Step 5.2 (all_words)

# One dict per sentence: the sentence itself plus one score per keyword
results = []

df = pd.read_csv(input_csv_file)
for count, (index, row) in enumerate(df.iterrows(), start=1):
    sentence = row['Sentences']

    # Empty CSV cells are read as float NaN and cannot be classified; skip them.
    if not isinstance(sentence, str):
        continue

    # The pipeline returns its labels sorted by likelihood, not in the order of
    # all_words, so pair every score with the label reported alongside it.
    prediction = classifier(sentence, all_words)
    score_by_label = dict(zip(prediction['labels'], prediction['scores']))

    # Store the scores in all_words order so every output row has the same columns
    result = {'Sentence': sentence}
    for label in all_words:
        result[label] = score_by_label[label]
    results.append(result)

    # Print the scores for each keyword
    print ("SENTENCE NO. ", count)
    print(f"Sentence: {sentence}")
    for label in all_words:
        print(f"{label}: {score_by_label[label]}")

# Convert the list of results to a DataFrame and save it to a new CSV file
output_df = pd.DataFrame(results)
output_df.to_csv(output_csv_file, index=False)