In [1]:
pip install transformers torch

Note: you may need to restart the kernel to use updated packages.


In [9]:
!pip install rouge-score

  pid, fd = os.forkpty()


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=e4f0d41b90fc1a3bb4e707758dd0e2616a330231110d046319ae4ab5623acb58
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [6]:
from tqdm import tqdm
from transformers import pipeline
import os
import pandas as pd
import torch

# Pick the device index handed to the pipeline: 0 when CUDA is available,
# otherwise -1 (CPU), and report the choice.
if torch.cuda.is_available():
    device = 0
    print("Using GPU")
else:
    device = -1
    print("Using CPU")

# Summarization pipeline backed by the facebook/bart-large-cnn checkpoint,
# placed on the device chosen above.
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=device,
)

# Root folder of the transcript dataset; the collection step walks its
# sub-folders (one per company).
dataset_path = r'/kaggle/input/earning-call-transcripts/cleaned_ECTs_dataset'

# Nested mapping filled by the processing loop:
# company -> transcript file name -> summary text.
summaries = dict()

# Break a long text into word-count-limited pieces
def split_text(text, max_chunk_size=500):
    """Split ``text`` into consecutive chunks of at most ``max_chunk_size`` words.

    Words are whatever ``str.split()`` yields (whitespace-separated), so the
    original spacing and line breaks are not preserved: each chunk is re-joined
    with single spaces.

    Returns a list of strings; the list is empty when ``text`` has no words.
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), max_chunk_size):
        chunks.append(" ".join(tokens[start:start + max_chunk_size]))
    return chunks

# Number of transcript files grouped into one iteration of the processing
# loop; adjust as needed for GPU memory.
batch_size = 8
# Work list for the processing loop: the collection step appends
# (company, transcript_file, file_path) tuples to it.
files_to_process = list()

Using GPU


In [7]:
# Collect all transcript files as (company, transcript_file, file_path) tuples.
# The list is rebuilt from scratch here (it is first created in an earlier
# cell) so that re-running this cell does not append every file a second time.
files_to_process = []
# os.listdir() returns entries in arbitrary order; sorting makes the
# processing order, and therefore the saved CSV, the same on every run.
for company in sorted(os.listdir(dataset_path)):
    company_path = os.path.join(dataset_path, company)
    if not os.path.isdir(company_path):
        continue
    for transcript_file in sorted(os.listdir(company_path)):
        file_path = os.path.join(company_path, transcript_file)
        # Only regular files can be opened and read by the processing loop;
        # skip anything else (e.g. nested folders).
        if os.path.isfile(file_path):
            files_to_process.append((company, transcript_file, file_path))

# Process the collected transcripts, `batch_size` files per iteration.
# Every transcript is split into word chunks, each chunk is summarized, and the
# chunk summaries of one file are joined (in order) into that file's summary.
with tqdm(total=len(files_to_process), desc="Processing Transcripts") as pbar:
    for i in range(0, len(files_to_process), batch_size):
        batch = files_to_process[i:i + batch_size]
        batch_texts = []   # every chunk of every file in this batch
        batch_meta = []    # (company, transcript_file) owner of each chunk, same order

        for company, transcript_file, file_path in batch:
            with open(file_path, 'r', encoding='utf-8') as file:
                transcript_content = file.read()
            chunks = split_text(transcript_content)
            batch_texts.extend(chunks)
            batch_meta.extend([(company, transcript_file)] * len(chunks))

        # A batch made only of empty files has nothing to summarize; skip the
        # model call instead of passing it an empty list.
        if batch_texts:
            # truncation=True cuts any chunk that tokenizes to more than the
            # model's maximum input length instead of failing inside the model.
            # NOTE(review): short trailing chunks are still forced to produce
            # at least min_length tokens (see the max_length warnings in the
            # cell output) - consider scaling the limits to the chunk size.
            batch_summaries = summarizer(
                batch_texts,
                max_length=150,
                min_length=50,
                do_sample=False,
                truncation=True,
            )

            # Group chunk summaries by owning file, preserving chunk order.
            parts_by_file = {}
            for owner, summary in zip(batch_meta, batch_summaries):
                parts_by_file.setdefault(owner, []).append(summary['summary_text'])

            # All chunks of a file are in the same batch, so the file's summary
            # is complete here. Assigning (rather than appending to an earlier
            # value) avoids the leading space and keeps a re-run of this cell
            # from doubling the stored text.
            for (company, transcript_file), parts in parts_by_file.items():
                summaries.setdefault(company, {})[transcript_file] = " ".join(parts)

        # Update the progress bar by the number of files handled
        pbar.update(len(batch))

# Flatten the nested {company: {transcript_file: summary}} mapping into one
# row per transcript and save it as CSV.
# Rows are built directly from the mapping instead of pivoting into a
# company x file-name table and stacking it back: that round trip relies on
# stack() dropping the NaN cells of missing combinations (not guaranteed by
# newer pandas implementations) and breaks when `summaries` is empty.
summary_records = [
    (company, transcript_file, summary_text)
    for company, per_file in summaries.items()
    for transcript_file, summary_text in per_file.items()
]
# "Quarter" holds the transcript file name; the evaluation cell reads the
# column under this name, so it is kept as is.
summaries_df = pd.DataFrame(summary_records, columns=["Company", "Quarter", "Summary"])
summaries_df.to_csv('/kaggle/working/ECT_summaries.csv', index=False)
print("Summaries saved to ECT_summaries.csv")

Processing Transcripts:   0%|          | 0/1185 [00:00<?, ?it/s]Your max_length is set to 150, but your input_length is only 72. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)
Your max_length is set to 150, but your input_length is only 129. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=64)
Your max_length is set to 150, but your input_length is only 61. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)
Your max_length is set to 150, but your input_length is only 50. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasin

Summaries saved to ECT_summaries.csv


In [10]:
import pandas as pd
import os
from rouge_score import rouge_scorer
from tqdm import tqdm

# Per-transcript score records are appended here by the scoring loop
rouge_scores = list()

# Root folder holding the original transcripts
dataset_path = r'/kaggle/input/earning-call-transcripts/cleaned_ECTs_dataset'

# Summaries written by the summarization step (the scoring loop reads the
# Company, Quarter and Summary columns)
generated_summaries_df = pd.read_csv('/kaggle/working/ECT_summaries.csv')

# Scorer computing ROUGE-1 and ROUGE-L with stemming enabled
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

# Helper that loads the source transcript belonging to one summary row
def read_original_transcript(company, quarter):
    """Return the full text of the file ``<dataset_path>/<company>/<quarter>``.

    ``quarter`` is used as the file name inside the company's folder. The file
    is read as UTF-8; ``open`` raises ``FileNotFoundError`` when it is missing.
    """
    transcript_path = os.path.join(dataset_path, company, quarter)
    with open(transcript_path, 'r', encoding='utf-8') as handle:
        text = handle.read()
    return text

# Score every generated summary against its original transcript.
# Exactly one record is appended per CSV row; the scores stay None whenever
# they cannot be computed.
for _, row in tqdm(generated_summaries_df.iterrows(), total=len(generated_summaries_df)):
    company = row['Company']
    quarter = row['Quarter']
    generated_summary = row['Summary']

    record = {
        'Company': company,
        'Quarter': quarter,
        'ROUGE-1': None,
        'ROUGE-L': None
    }

    if not isinstance(generated_summary, str):
        # An empty Summary cell is read back from the CSV as NaN (a float),
        # which is not text the scorer can tokenize; report it instead of
        # crashing the whole evaluation run.
        print(f"Summary for {company} {quarter} is missing.")
    else:
        try:
            # Read the corresponding original transcript
            original_transcript = read_original_transcript(company, quarter)
        except FileNotFoundError:
            print(f"Transcript for {company} {quarter} not found.")
        else:
            # score(target, prediction): the transcript is the reference text
            scores = scorer.score(original_transcript, generated_summary)
            record['ROUGE-1'] = scores['rouge1'].fmeasure
            record['ROUGE-L'] = scores['rougeL'].fmeasure

    rouge_scores.append(record)

# Tabulate the per-transcript records collected by the scoring loop
rouge_scores_df = pd.DataFrame(rouge_scores)

# Mean F1 per metric across all transcripts (Series.mean skips missing
# values by default, i.e. rows that could not be scored)
overall_rouge_1, overall_rouge_l = (
    rouge_scores_df[metric].mean() for metric in ('ROUGE-1', 'ROUGE-L')
)

# Report the aggregate scores
print(f"Overall ROUGE-1 F1 Score: {overall_rouge_1:.4f}")
print(f"Overall ROUGE-L F1 Score: {overall_rouge_l:.4f}")

# Persist the detailed per-transcript table
rouge_scores_df.to_csv('/kaggle/working/rouge_scores.csv', index=False)
print("Detailed ROUGE scores saved to rouge_scores.csv")

100%|██████████| 1185/1185 [2:52:12<00:00,  8.72s/it]  

Overall ROUGE-1 F1 Score: 0.2090
Overall ROUGE-L F1 Score: 0.1994
Detailed ROUGE scores saved to rouge_scores.csv



