In [1]:
pip install transformers torch

Note: you may need to restart the kernel to use updated packages.


In [2]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=45737f1b96e987702fd36e7a89d6dd3ca2c5132baefce3c5e3643cc9ac05b966
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [3]:
from tqdm import tqdm
from transformers import pipeline
import os
import pandas as pd
import torch

# Use CUDA device 0 when torch can see a GPU; otherwise pass -1 for CPU
if torch.cuda.is_available():
    device = 0
    print("Using GPU")
else:
    device = -1
    print("Using CPU")

# Build the summarization pipeline on whichever device was selected above
summarizer = pipeline(
    "summarization",
    model="allenai/led-base-16384",
    device=device,
)

# Root folder that is scanned for per-company transcript directories
dataset_path = r'/kaggle/input/cleaned_ECTs_dataset'

# Nested mapping filled in by the summarization cell:
# company -> transcript file name -> summary text
summaries = dict()

# Function to split text into chunks
def split_text(text, max_chunk_size=500):
    """Split ``text`` into consecutive chunks of at most ``max_chunk_size`` words.

    Words are whatever ``str.split()`` yields (runs of non-whitespace
    characters), so the original spacing and line breaks are not preserved:
    each chunk is re-joined with single spaces.

    Args:
        text: The text to split.
        max_chunk_size: Maximum number of words per chunk; must be >= 1.

    Returns:
        A list of chunk strings in their original order. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``max_chunk_size`` is less than 1.
    """
    if max_chunk_size < 1:
        # A negative step makes range() empty, which would silently drop the
        # whole text; a zero step fails with an unrelated-looking range() error.
        raise ValueError(
            f"max_chunk_size must be a positive integer, got {max_chunk_size}"
        )
    words = text.split()
    return [" ".join(words[i:i + max_chunk_size]) for i in range(0, len(words), max_chunk_size)]

# Batch settings and work list for the summarization run. Nothing is processed
# in this cell; both names are assigned again at the start of the next cell
# before they are first read.
batch_size = 8  # Number of transcript files taken per loop iteration in the next cell
files_to_process = []

Using GPU


config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/648M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Device set to use cuda:0


In [4]:
# Collect every transcript as (company, file name, full path).
# Both directory listings are sorted so the processing order, and therefore the
# row order of the output CSV, is the same on every run.
files_to_process = []
for company in sorted(os.listdir(dataset_path)):
    company_path = os.path.join(dataset_path, company)
    if os.path.isdir(company_path):
        for transcript_file in sorted(os.listdir(company_path)):
            # Only include .txt files (adjust if your files have a different extension)
            if transcript_file.endswith(".txt"):
                file_path = os.path.join(company_path, transcript_file)
                files_to_process.append((company, transcript_file, file_path))
print("Number of transcript files:", len(files_to_process))

# Start from an empty result so that re-running this cell does not append new
# summaries onto text left over from a previous run.
summaries = {}

# Number of transcript files whose chunks are passed to the summarizer together
batch_size = 8
with tqdm(total=len(files_to_process), desc="Processing Transcripts") as pbar:
    for i in range(0, len(files_to_process), batch_size):
        batch = files_to_process[i:i + batch_size]
        batch_texts = []
        batch_meta = []  # batch_meta[k] identifies the file that batch_texts[k] came from

        for company, transcript_file, file_path in batch:
            with open(file_path, 'r', encoding='utf-8') as file:
                transcript_content = file.read()
            # Split the transcript into manageable chunks
            chunks = split_text(transcript_content, max_chunk_size=500)
            if not chunks:
                # Nothing to summarize; say so instead of dropping the file silently
                tqdm.write(f"No text found in {file_path}; no summary generated.")
                continue
            batch_texts.extend(chunks)
            batch_meta.extend([(company, transcript_file)] * len(chunks))

        if batch_texts:
            # Generate one summary per chunk; results come back in input order
            batch_summaries = summarizer(batch_texts, max_length=150, min_length=50, do_sample=False)
            # Concatenate the chunk summaries of each file, separated by single spaces
            for (company, transcript_file), summary in zip(batch_meta, batch_summaries):
                company_summaries = summaries.setdefault(company, {})
                summary_text = summary['summary_text']
                if transcript_file in company_summaries:
                    company_summaries[transcript_file] += " " + summary_text
                else:
                    # First chunk of this file: no leading separator
                    company_summaries[transcript_file] = summary_text

        pbar.update(len(batch))

# Flatten the nested dict into one row per (company, transcript).
# Building the rows directly avoids the wide company-by-file frame plus stack(),
# which depends on stack() discarding the NaN cells of files a company does not have
# and fails when no summaries were produced at all.
summary_rows = [
    (company, transcript_file, summary_text)
    for company, company_summaries in summaries.items()
    for transcript_file, summary_text in company_summaries.items()
]
summaries_df = pd.DataFrame(summary_rows, columns=["Company", "Transcript", "Summary"])
output_csv = '/kaggle/working/ECT_summaries.csv'
summaries_df.to_csv(output_csv, index=False)
print("Summaries saved to", output_csv)

Number of transcript files: 1185


Processing Transcripts:   0%|          | 0/1185 [00:00<?, ?it/s]Input ids are automatically padded from 595 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 579 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 623 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 644 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 606 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 578 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 574 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 589 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 614 to 1024 to be a multiple of `config.attention_window`: 1024


Summaries saved to /kaggle/working/ECT_summaries.csv


In [13]:
import pandas as pd
import os
from rouge_score import rouge_scorer
from tqdm import tqdm

# Load generated summaries CSV
# (the scoring loop below reads its Company, Transcript and Summary columns)
generated_summaries_df = pd.read_csv('/kaggle/working/ECT_summaries.csv')

# Define dataset path to original transcripts
# (same value as the dataset_path assigned in the summarizer setup cell)
dataset_path = r'/kaggle/input/cleaned_ECTs_dataset'

# Initialize ROUGE scorer for ROUGE-1 and ROUGE-L with stemming enabled
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

# Placeholder for results: the scoring loop appends one dict per summary row
rouge_scores = []

# Function to read original transcript text
def read_original_transcript(company, quarter):
    """Return the full text of one transcript file.

    The file is looked up at ``<dataset_path>/<company>/<quarter>`` and read
    as UTF-8. ``quarter`` is the transcript's file name inside the company
    folder.
    """
    transcript_path = os.path.join(dataset_path, company, quarter)
    with open(transcript_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

# Score every generated summary against the transcript it was produced from.
# Rows that cannot be scored are kept with empty scores so the output CSV
# still lists every summary.
for _, row in tqdm(generated_summaries_df.iterrows(), total=len(generated_summaries_df)):
    company = row['Company']
    quarter = row['Transcript']
    generated_summary = row['Summary']

    rouge_1 = None
    rouge_l = None
    if not isinstance(generated_summary, str):
        # An empty Summary cell is read back from the CSV as NaN (a float),
        # which is not valid text for the scorer.
        print(f"Summary for {company} {quarter} is empty; not scored.")
    else:
        try:
            # Read the corresponding original transcript
            original_transcript = read_original_transcript(company, quarter)
        except FileNotFoundError:
            print(f"Transcript for {company} {quarter} not found.")
        else:
            # Calculate ROUGE scores (transcript is the reference, summary the prediction)
            scores = scorer.score(original_transcript, generated_summary)
            rouge_1 = scores['rouge1'].fmeasure
            rouge_l = scores['rougeL'].fmeasure

    rouge_scores.append({
        'Company': company,
        'Transcript': quarter,
        'ROUGE-1': rouge_1,
        'ROUGE-L': rouge_l
    })

# Convert ROUGE scores to a DataFrame.
# Explicit columns keep the frame well-formed when nothing was scored, and the
# float cast turns None into NaN so mean() always works on numeric columns.
rouge_scores_df = pd.DataFrame(
    rouge_scores,
    columns=['Company', 'Transcript', 'ROUGE-1', 'ROUGE-L'],
).astype({'ROUGE-1': float, 'ROUGE-L': float})

# Calculate the overall average ROUGE-1 and ROUGE-L scores (unscored rows are skipped by mean())
overall_rouge_1 = rouge_scores_df['ROUGE-1'].mean()
overall_rouge_l = rouge_scores_df['ROUGE-L'].mean()

# Print overall ROUGE scores, and how many rows the averages are based on
scored_count = int(rouge_scores_df['ROUGE-1'].notna().sum())
print(f"Scored {scored_count} of {len(rouge_scores_df)} summaries")
print(f"Overall ROUGE-1 F1 Score: {overall_rouge_1:.4f}")
print(f"Overall ROUGE-L F1 Score: {overall_rouge_l:.4f}")

# Save detailed results to CSV
rouge_scores_df.to_csv('/kaggle/working/rouge_scores.csv', index=False)
print("Detailed ROUGE scores saved to rouge_scores.csv")

100%|██████████| 1185/1185 [4:06:13<00:00, 12.47s/it]   

Overall ROUGE-1 F1 Score: 0.3936
Overall ROUGE-L F1 Score: 0.3873
Detailed ROUGE scores saved to rouge_scores.csv



