In [None]:
# Install the notebook's dependencies.
# PyMuPDF is the package that provides `import fitz`. The PyPI package that is
# literally named "fitz" is an unrelated project that ships its own top-level
# `fitz` module, so installing it next to PyMuPDF can break `import fitz`.
# It is therefore intentionally not installed here.
get_ipython().system('pip install pymupdf')
get_ipython().system('pip install torch')
get_ipython().system('pip install git+https://github.com/huggingface/transformers accelerate')
get_ipython().system('pip install qwen-vl-utils')
get_ipython().system('pip install langdetect')

In [None]:
# Required libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import requests
import fitz  # PyMuPDF
from langdetect import detect
import re
import pandas as pd
import time  # ⏰ Import the time module

# Wall-clock reference for the whole run (reported at the end of the notebook).
overall_start = time.time()

# Step 1: Load the tokenizer and the causal language model.
model_start = time.time()
model_name = "Qwen/Qwen2.5-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Half precision weights; `device_map="auto"` lets the loader place the model.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

# Pick the device that input tensors are moved to later on, and report it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("GPU is available:", torch.cuda.get_device_name(0))
else:
    print("No GPU found, using CPU.")

print("✅ Model loaded successfully.")

# Step 2: Download the PDF from URL
pdf_url = "https://arxiv.org/pdf/2211.02001.pdf"
pdf_path = "sample_paper.pdf"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(pdf_url, timeout=60)
# Fail loudly on HTTP errors (403/404/5xx) instead of silently saving an
# error page under a ".pdf" name and crashing later when it is parsed.
response.raise_for_status()
with open(pdf_path, 'wb') as f:
    f.write(response.content)
print("✅ PDF downloaded successfully.")

# Step 3: Extract text from multiple pages and split into chunks
def extract_text_from_pdf(pdf_path, max_pages=1, chunk_size=800, overlap=300):
    """Read the leading pages of a PDF and split their text into overlapping chunks.

    The text of the first ``max_pages`` pages is concatenated and then cut
    with a stride of ``chunk_size`` characters. Every chunk except the last
    one is extended by up to ``overlap`` characters of the following text so
    that neighbouring chunks share context, including across page boundaries.

    Args:
        pdf_path: Path of the PDF file, handed to ``fitz.open``.
        max_pages: Maximum number of leading pages to read.
        chunk_size: Distance in characters between the starts of two
            consecutive chunks. Must be positive.
        overlap: Number of extra characters appended to each non-final chunk.
            Must not be negative. Defaults to the previously hard-coded 300.

    Returns:
        list[str]: The chunks in document order. The last chunk is the
        remaining tail (at most ``chunk_size`` characters). An empty list is
        returned when the selected pages contain no text.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is negative.
    """
    # A non-positive stride would never consume the text and loop forever.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0:
        raise ValueError("overlap must not be negative")

    doc = fitz.open(pdf_path)
    try:
        page_texts = []
        for i, page in enumerate(doc):
            if i >= max_pages:
                break
            page_texts.append(page.get_text())
    finally:
        # Always release the document handle, even if text extraction fails.
        doc.close()

    # Chunk the concatenated text so the overlap is not cut short at page ends.
    text = "".join(page_texts)
    text_chunks = []
    start = 0
    while len(text) - start > chunk_size:
        text_chunks.append(text[start:start + chunk_size + overlap])
        start += chunk_size

    # Skip the tail when there is nothing left (e.g. a PDF without a text layer).
    tail = text[start:]
    if tail:
        text_chunks.append(tail)
    return text_chunks

# Split the downloaded PDF into text segments (function defaults: first page only).
text_chunks = extract_text_from_pdf(pdf_path)
print("✅ Text extracted and split into segments.")
print("Number of segments:", len(text_chunks))

# Step 4: Define the question
question = "How much CO2 is emitted during the training of language models according to the article?"

# Step 5: Iterate over text chunks and generate answers
# One decoded string per segment is collected here, in segment order.
all_responses = []

for i, chunk in enumerate(text_chunks):
    print(f"\n🔍 Analyzing Segment {i+1}/{len(text_chunks)}...")

    # Prompt layout: the segment as context, then the question, then the
    # instruction to answer briefly with numbers or 'No data available'.
    input_text = (
    f"Context: {chunk}\n"
    f"Question: {question}\n"
    "Please provide a short, precise answer focusing on numerical data only. "
    "If no relevant information is found, reply with 'No data available'.\n"
)
    inputs = tokenizer(input_text, return_tensors="pt")
    # Move every input tensor to `device` (the cuda/cpu device chosen during model setup).
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # Step 6: Generate an answer for the current chunk
    # no_grad: inference only, no gradients are needed.
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=32,  # Shorter answers for conciseness
            num_beams=5,    # Beam search for better quality
            temperature=0.3,
            no_repeat_ngram_size=2,
            early_stopping=True
        )
    
    # Decode the first returned sequence in full and add it to the list.
    # NOTE(review): the sequence is not sliced past the prompt length; for
    # decoder-only models `generate` presumably returns prompt + new tokens,
    # so `response` would contain the context and question as well as the
    # answer - confirm whether the downstream filtering relies on that.
    response = tokenizer.decode(generated_ids[0], skip_special_tokens=True).strip()
    all_responses.append(response)
    
def extract_key_sentences(responses):
    """Pick emission-related sentences that contain a number from English responses.

    Each response is language-checked with ``langdetect``; English responses
    are split on ``'. '`` and a fragment is kept when it contains a standalone
    number and at least one emission-related keyword. Kept fragments have
    their whitespace collapsed.

    Responses for which the language cannot be detected (for example empty
    strings or strings without any letters) are skipped instead of aborting
    the whole run with ``LangDetectException``.

    Args:
        responses: Iterable of response strings.

    Returns:
        list[str]: The cleaned fragments, in the order they were found.
    """
    # Local import: only this function needs the exception type.
    from langdetect import LangDetectException

    number_pattern = re.compile(r'\b\d+(\.\d+)?\b')
    keywords = ("co2", "carbon dioxide", "emission", "tonnes", "kg", "co2eq",
                "carbon footprint", "bloom", "training")

    key_sentences = []
    for response in responses:
        try:
            language = detect(response)
        except LangDetectException:
            # langdetect raises when the text offers no usable features.
            continue
        if language != 'en':
            continue

        for sentence in response.split('. '):
            if not number_pattern.search(sentence):
                continue
            lowered = sentence.lower()
            if not any(keyword in lowered for keyword in keywords):
                continue
            # Prompt-echo cleanup kept from the original. Note that fragments
            # produced by split('. ') contain no '. ', so it cannot match here.
            sentence = re.sub(r'(Question|Output Format|Please provide).*?\. ', '', sentence, flags=re.IGNORECASE)
            sentence = re.sub(r'\s+', ' ', sentence).strip()
            key_sentences.append(sentence)
    return key_sentences

# Reduce the raw model outputs to the sentences selected by extract_key_sentences.
filtered_responses = extract_key_sentences(all_responses)

# Step 7: Summarize the answers

def summarize_responses(responses):
    """Ask the model for a roughly 100-word summary of the given sentences.

    The sentences are joined with spaces and embedded in a summarization
    prompt. Only the tokens generated after the prompt are decoded, so the
    returned text does not repeat the prompt or the input sentences.

    Args:
        responses: Iterable of sentence strings to summarize.

    Returns:
        str: The whitespace-normalized summary, or an empty string when there
        is nothing to summarize (the model is not called in that case).
    """
    summary_text = " ".join(responses)
    # Without input the model would be asked to summarize nothing.
    if not summary_text.strip():
        return ""

    summary_prompt = (
        "Summarize the following text in about 100 words, focusing on CO2 emissions and numerical data:\n"
        f"{summary_text}\n"
    )
    summary_inputs = tokenizer(summary_prompt, return_tensors="pt")
    # Put the inputs on the model's device, as the per-segment loop does.
    summary_inputs = {key: value.to(model.device) for key, value in summary_inputs.items()}
    prompt_length = summary_inputs["input_ids"].shape[1]

    with torch.no_grad():
        summary_ids = model.generate(
            **summary_inputs,
            max_new_tokens=150,
            num_beams=5,
            temperature=0.3,
            no_repeat_ngram_size=2,
            early_stopping=True
        )

    # The returned sequence starts with the prompt tokens; keep only what follows.
    new_token_ids = summary_ids[0][prompt_length:]
    summary = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
    # Remove instruction-like fragments the model may have echoed, then tidy whitespace.
    summary = re.sub(r'(Question|Output Format|Please provide).*?\. ', '', summary, flags=re.IGNORECASE)
    summary = re.sub(r'\s+', ' ', summary).strip()
    return summary

# Summarize the filtered sentences with the model and print the result.
summary = summarize_responses(filtered_responses)

print("\n✅ 100-Word Summary of Main Facts:")
print(summary)

# Step 8: Display Top 10 Answers

def get_top_10_responses(responses):
    """Return up to ten distinct responses, those with the most numbers first.

    Duplicates are dropped keeping the first occurrence. The remaining
    responses are ordered by how many standalone numbers they contain
    (descending); responses with equal counts keep their original order.

    Args:
        responses: Iterable of response strings.

    Returns:
        list[str]: At most ten responses.
    """
    number_pattern = re.compile(r'\b\d+(\.\d+)?\b')

    def number_count(text):
        return sum(1 for _ in number_pattern.finditer(text))

    seen = set()
    distinct = []
    for item in responses:
        if item in seen:
            continue
        seen.add(item)
        distinct.append(item)

    # list.sort is stable, so ties stay in first-seen order.
    distinct.sort(key=number_count, reverse=True)
    return distinct[:10]

# Rank the filtered sentences and print the selection as a bullet list.
top_10_answers = get_top_10_responses(filtered_responses)
print("\n✅ Clean Top 10 Relevant Sentences:")
for sentence in top_10_answers:
    print("-", sentence)

# End timing the entire script
overall_end = time.time()

# Calculate and display total execution time
# (seconds elapsed since overall_start was taken, reported in minutes).
total_duration = overall_end - overall_start
print(f"\n⏱️ Total execution time: {total_duration / 60:.2f} minutes")

In [None]:
# Model names to search for in the filtered responses; this list is passed to
# extract_model_emissions below. Extend or trim it as needed.
model_names = [
    "BLOOM",
    "GPT-3",
    "T5",
    "BERT",
    "XLNet",
    "RoBERTa",
    "GPT-2",
    "Transformer-XL",
    "Albert",
    "Megatron"
]

# Enhanced function to extract model names and ALL CO2 emissions from the responses
def extract_model_emissions(responses, model_names):
    """Collect (model, emission in tonnes) pairs mentioned in the responses.

    Each response is split into sentences. Inside a sentence, every mention
    of a known model name (case-insensitive, whole name only) opens a span
    that runs until the next model mention or the end of the sentence. All
    quantities of the form ``<number> tonne(s)`` / ``<number> t`` found in
    that span are attributed to the model, so several emissions for the same
    model are captured.

    Compared with a single combined regex this avoids false positives such
    as the "2" in "CO2 training" or "384 thousand", does not match a name
    embedded in a longer word (e.g. "BERT" in "DistilBERT"), and reads
    thousands separators ("1,234 tonnes") as one number.

    Args:
        responses: Iterable of response strings.
        model_names: Iterable of model names to look for.

    Returns:
        list[dict]: One dict per emission found, with the keys
        ``"Model Name"`` (spelled as in ``model_names``), ``"CO2 Emission"``
        (float) and ``"Context"`` (the sentence the value came from).
    """
    # Map lower-cased names to the caller's spelling; ignore empty names,
    # which would otherwise match everywhere.
    canonical = {}
    for name in model_names:
        if name:
            canonical.setdefault(name.lower(), name)
    if not canonical:
        return []

    # Longest names first so a longer name wins over a name that prefixes it.
    alternation = "|".join(
        re.escape(name) for name in sorted(canonical.values(), key=len, reverse=True)
    )
    model_pattern = re.compile(r"(?<!\w)(?:" + alternation + r")(?!\w)", re.IGNORECASE)

    # The number must not continue a word or another number ("CO2", "v1.5"),
    # and the unit must end at a word boundary (so "t" is not "thousand").
    emission_pattern = re.compile(
        r"(?<![\w.,])(?P<emission>\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)\s?(?:tonnes?|t)\b",
        re.IGNORECASE,
    )

    model_emissions = []
    for response in responses:
        # Split after sentence-ending punctuation that is followed by spaces.
        for sentence in re.split(r'(?<=[.!?]) +', response):
            mentions = list(model_pattern.finditer(sentence))
            if not mentions:
                continue
            print(f"\n🔎 Analyzing Sentence: {sentence}")

            found_any = False
            for index, mention in enumerate(mentions):
                # The span owned by this mention ends where the next one starts.
                if index + 1 < len(mentions):
                    span_end = mentions[index + 1].start()
                else:
                    span_end = len(sentence)
                span = sentence[mention.end():span_end]

                matched_text = mention.group(0)
                model = canonical.get(matched_text.lower(), matched_text)

                for hit in emission_pattern.finditer(span):
                    emission = float(hit.group("emission").replace(",", ""))
                    model_emissions.append({
                        "Model Name": model,
                        "CO2 Emission": emission,
                        "Context": sentence
                    })
                    found_any = True
                    print(f"✅ Model Found: {model}")
                    print(f"📏 Emission Found: {emission} for Model: {model}")

            if not found_any:
                print("⚠️ Model mentioned but no emissions found.")

    return model_emissions

# Extract model emissions from the filtered responses
model_emissions_list = extract_model_emissions(filtered_responses, model_names)

# Create DataFrame from the extracted data.
# The columns are named explicitly so that the frame keeps its schema even
# when no emission was found (an empty record list would otherwise produce a
# frame without columns, and later column lookups would raise KeyError).
df_emissions = pd.DataFrame(
    model_emissions_list,
    columns=["Model Name", "CO2 Emission", "Context"],
)

# Display the DataFrame
print("\n📊 CO2 Emissions by Model with Context:")
print(df_emissions)