In [None]:
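# NASA Bioscience Summary Regenerator - regenerates summaries that were cut off mid-sentence
# Note: input is still truncated to 1024 tokens and output is capped at 512 tokens
# Run this in Google Colab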

!pip install transformers torch pandas tqdm sentencepiece accelerate

import pandas as pd
import json
from transformers import pipeline, BartForConditionalGeneration, BartTokenizer
from tqdm import tqdm
import torch

# Load your files (upload these to Colab first)
from google.colab import files
# Opens the Colab upload dialog; the two files read below are expected
# to be among the uploaded files.
uploaded = files.upload()

# Load data
print("📁 Loading data...")
df = pd.read_csv('pmc_sections.csv')
# JSON is UTF-8 by specification, so pin the encoding instead of relying on
# the platform default (which differs outside of Colab, e.g. on Windows).
with open('publications.json', 'r', encoding='utf-8') as f:
    publications_data = json.load(f)

print(f"Loaded {len(df)} publications from CSV")
print(f"Loaded {len(publications_data)} publications from JSON")

# Tokenizer and model are loaded as separate objects (rather than through a
# pipeline) so that generation parameters can be set explicitly later on.
print("🔄 Loading BART model with custom generation...")
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# `device` is 0 when the model lives on the GPU and -1 when it stays on CPU.
device = -1
if torch.cuda.is_available():
    device = 0
    model = model.to('cuda')
    print("✅ Using GPU acceleration")

def _sentence_bounded_excerpt(text_str, window=400, hard_limit=350):
    """Return a leading excerpt of *text_str* cut at the last '.' or ';' in the window."""
    head = text_str[:window]
    for separator in ('.', ';'):
        if separator in head:
            # Cut at the last separator so the excerpt ends on a clause boundary.
            return head.rsplit(separator, 1)[0] + '.'
    # No natural break in the window: take a fixed chunk and mark it as cut.
    return text_str[:hard_limit] + '...'


def smart_summarize_no_truncation(text, section_type, *, max_input_tokens=1024,
                                  max_summary_tokens=512, min_summary_tokens=80):
    """Summarize *text* with the module-level BART ``model`` / ``tokenizer``.

    Despite the name, two bounds still apply: the input is truncated to
    ``max_input_tokens`` tokens and the generated summary is capped at
    ``max_summary_tokens`` tokens. The cap is generous so that beam search
    normally terminates on its own before reaching it.

    Args:
        text: Section text. ``None``, NaN (a missing CSV cell) and other
            falsy values yield an empty string.
        section_type: Name of the section being summarized. Currently unused;
            kept so existing callers keep working.
        max_input_tokens: Token limit applied to the input when encoding.
        max_summary_tokens: Upper bound on the generated summary length.
        min_summary_tokens: Lower bound on the generated summary length.

    Returns:
        The text itself when it is shorter than 150 characters (after
        stripping), otherwise the generated summary. If summarization raises,
        a leading excerpt of the text cut at a sentence boundary is returned.
    """
    import math  # function-scope import: this block needs nothing new at file level

    # Missing CSV cells arrive from pandas as float NaN, which is truthy and
    # would otherwise be passed through as the literal string "nan".
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return ""

    text_str = str(text).strip()
    if not text or len(text_str) < 150:
        # Too short to be worth summarizing: hand the text back unchanged.
        return str(text) if text else ""

    try:
        # Only the input is truncated here; the output bound is set below.
        inputs = tokenizer.encode(
            text_str,
            return_tensors="pt",
            max_length=max_input_tokens,
            truncation=True,
        )

        # Follow the model to whatever device it was placed on.
        inputs = inputs.to(model.device)

        summary_ids = model.generate(
            inputs,
            max_length=max_summary_tokens,
            min_length=min_summary_tokens,
            length_penalty=2.0,    # favor longer, more complete summaries
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=3,
            do_sample=False,
        )

        return tokenizer.decode(summary_ids[0], skip_special_tokens=True).strip()

    except Exception as e:
        # Deliberate best-effort: one failing section must not abort the
        # whole batch, so report it and fall back to an extractive excerpt.
        print(f"Summarization error: {e}")
        return _sentence_bounded_excerpt(text_str)

def is_complete_summary(summary):
    """Heuristically decide whether *summary* ends naturally.

    Args:
        summary: Candidate summary. Anything that is not a string (``None``,
            NaN, numbers) is treated as incomplete.

    Returns:
        ``True`` when the summary is at least 50 characters long and either
        ends with sentence-final punctuation (optionally followed by closing
        quotes/brackets) or ends with one of a few phrases treated as
        complete. ``False`` otherwise, including when it ends with an
        ellipsis.
    """
    if not isinstance(summary, str):
        return False

    # Trailing whitespace/newlines must not hide the final punctuation.
    text = summary.strip()
    if len(text) < 50:
        return False

    # Look past closing quotes/brackets so that `... done."` counts as ended.
    core = text.rstrip('"\'”’)]')

    # An ellipsis marks a cut-off text (the summarizer's own fallback appends
    # '...'), so it must not pass just because it ends with a period.
    if core.endswith(('...', '…')):
        return False

    if core.endswith(('.', '!', '?')):
        return True

    # Legacy heuristic kept for compatibility: accept a few closing phrases
    # even when the final punctuation is missing.
    last_few_words = ' '.join(text.split()[-3:]).lower()
    complete_indicators = ['conclusion', 'summary', 'results show', 'study demonstrates', 'research indicates']
    return any(indicator in last_few_words for indicator in complete_indicators)

# Index the CSV rows by PMC_ID so each publication's section texts can be
# looked up directly while walking the JSON data.
print("🔗 Creating publication mapping...")
section_columns = {
    'abstract': 'Abstract',
    'introduction': 'Introduction',
    'results': 'Results',
    'conclusion': 'Conclusion',
}
pmc_mapping = {
    record['PMC_ID']: {
        key: record.get(column, '') for key, column in section_columns.items()
    }
    for _, record in df.iterrows()
}

print(f"Created mapping for {len(pmc_mapping)} publications")

# Walk every publication and re-summarize only those sections whose stored
# summary is missing or does not look finished.
print("🔄 Regenerating summaries with NO TRUNCATION...")
sections = ['abstract', 'introduction', 'results', 'conclusion']
total_updated = 0
incomplete_count = 0

for pmc_id, pub_data in tqdm(publications_data.items(), desc="Processing"):
    if pmc_id not in pmc_mapping:
        continue

    source_sections = pmc_mapping[pmc_id]
    for section in sections:
        original_text = source_sections.get(section, '')

        # Skip sections with no usable source text.
        if not original_text or len(str(original_text).strip()) <= 200:
            continue

        summary_key = f'{section}_summary'
        old_summary = pub_data.get(summary_key, '')

        # A stored summary that already ends naturally is left alone.
        if old_summary and is_complete_summary(old_summary):
            continue

        new_summary = smart_summarize_no_truncation(original_text, section)

        # Discard empty or very short results rather than overwrite with them.
        if not new_summary or len(new_summary) <= 60:
            continue

        publications_data[pmc_id][summary_key] = new_summary
        total_updated += 1

        if not is_complete_summary(new_summary):
            incomplete_count += 1
            print(f"⚠️  Still incomplete: {pmc_id} - {section}")

print(f"✅ Updated {total_updated} summaries")
print(f"⚠️  {incomplete_count} summaries may still need attention")

# Write the updated publication data to a new JSON file
print("💾 Saving non-truncated data...")
output_filename = 'publications_no_truncation.json'
with open(output_filename, 'w') as out_file:
    out_file.write(json.dumps(publications_data, indent=2))

print(f"✅ Saved {output_filename}")

# Trigger a browser download of the result from the Colab runtime
files.download(output_filename)

print("\n🎯 No-truncation processing completed!")
print("📝 Summaries should now end naturally without mid-sentence cuts")



Saving pmc_sections.csv to pmc_sections.csv
Saving publications.json to publications.json
📁 Loading data...
Loaded 571 publications from CSV
Loaded 571 publications from JSON
🔄 Loading BART model with custom generation...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

✅ Using GPU acceleration
🔗 Creating publication mapping...
Created mapping for 571 publications
🔄 Regenerating summaries with NO TRUNCATION...


Processing: 100%|██████████| 571/571 [00:05<00:00, 105.98it/s]

⚠️  Still incomplete: PMC10774393 - results
✅ Updated 2 summaries
⚠️  1 summaries may still need attention
💾 Saving non-truncated data...





✅ Saved publications_no_truncation.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


🎯 No-truncation processing completed!
📝 Summaries should now end naturally without mid-sentence cuts
