In [None]:
!pip install -q evaluate rouge-score

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from evaluate import load as load_metric
import pandas as pd
import nltk

# Download NLTK's "punkt" tokenizer data package.
# NOTE(review): no visible cell in this notebook calls nltk directly; presumably the
# rouge-score backend relies on it — confirm before removing this download.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:

# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("✅ Device:", device)

✅ Device: cuda


In [None]:
# Load dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

# Strip every row once, keep only passages longer than 100 characters,
# and retain the first 100 of those as the evaluation corpus.
stripped_rows = (row['text'].strip() for row in dataset)
texts = [passage for passage in stripped_rows if len(passage) > 100][:100]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [None]:
# Use ROUGE instead of BLEU
# The metric object is created once here (load_metric is evaluate.load, see the
# imports cell) and reused by the evaluation loop for every model.
rouge = load_metric("rouge")

def generate_text(model, tokenizer, prompt, max_length=50):
    """Sample one continuation of ``prompt`` from ``model`` and return it as text.

    Args:
        model: Causal language model exposing ``generate`` and a ``device``
            attribute (as Hugging Face ``PreTrainedModel`` instances do).
        tokenizer: Tokenizer that belongs to ``model``. It is not modified.
        prompt: Text to be continued.
        max_length: Forwarded to ``model.generate`` as ``max_length``. Per the
            transformers documentation this bounds the total sequence length,
            i.e. prompt tokens plus newly generated tokens.

    Returns:
        The decoded first returned sequence with special tokens removed. In the
        runs recorded in this notebook the text starts with the prompt itself.

    Raises:
        Whatever ``tokenizer`` or ``model.generate`` raises; nothing is caught here.
    """
    # Place the inputs on the model's own device instead of depending on a
    # notebook-level ``device`` global, so the function works wherever the model lives.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Fall back to EOS for padding using a local value only: the caller's
    # tokenizer is left untouched (no hidden side effect on a shared object).
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            do_sample=True,  # stochastic: seed torch beforehand for repeatable text
            top_k=50,
            top_p=0.95,
            pad_token_id=pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def calculate_perplexity(model, tokenizer, texts, max_length=1024):
    """Return the perplexity of ``model`` on a single truncated window of ``texts``.

    The passages are joined with blank lines and tokenized as one sequence that
    is truncated to ``max_length`` tokens, so only the beginning of the joined
    text is scored. Because the cut-off is in tokens, models with different
    tokenizers may be scored on different amounts of text.

    Args:
        model: Causal language model; called as ``model(input_ids, labels=input_ids)``
            and expected to expose ``.loss`` on the result and a ``device`` attribute.
        tokenizer: Tokenizer that belongs to ``model``.
        texts: Iterable of strings to evaluate.
        max_length: Maximum number of tokens kept after truncation (default 1024,
            the value previously hard-coded).

    Returns:
        ``exp(loss)`` as a Python float.

    Raises:
        ValueError: If tokenization yields fewer than two tokens, in which case
            there is no next-token target to score.
    """
    encodings = tokenizer(
        "\n\n".join(texts),
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )
    # Follow the model's own device rather than a notebook-level global.
    input_ids = encodings.input_ids.to(model.device)

    # Fail loudly on (near-)empty input instead of returning NaN or an opaque error.
    if input_ids.size(-1) < 2:
        raise ValueError("calculate_perplexity needs at least two tokens of text")

    with torch.no_grad():
        loss = model(input_ids, labels=input_ids).loss
    return torch.exp(loss).item()


Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Model list: display name -> Hugging Face Hub checkpoint id.
# The evaluation loop iterates models.items(), so the order here is the order
# in which checkpoints are downloaded and evaluated.
models = {
    "GPT-2": "gpt2",
    "TinyLLaMA": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # chat checkpoint, per its id
    "GPT-Neo": "EleutherAI/gpt-neo-125M",
    "Falcon": "tiiuae/falcon-rw-1b",  # its download log shows custom modeling code being fetched
    "TinyGPT": "sshleifer/tiny-gpt2"  # very small checkpoint (2.51M weight file in the download log)
}


In [None]:
# Evaluate every checkpoint in `models`: sample one continuation per prompt,
# compute perplexity on the first 20 corpus passages, and score ROUGE-L of the
# samples against three fixed corpus passages. One result row is appended per model.

# Fixed seed so the sampled generations (do_sample=True) are repeatable across runs.
SEED = 42

results = []
prompts = [
    "Once upon a time in a faraway land,",
    "The future of artificial intelligence is",
    "In the middle of the dark forest,"
]

# ROUGE references are the same for every model, so build them once.
# NOTE(review): these are corpus passages unrelated to the prompts, so ROUGE-L here
# only reflects incidental word overlap — confirm this is the intended comparison.
references = [passage.strip() for passage in texts[10:13]]

for name, model_id in models.items():
    print(f"\n🚀 Loading: {name}")

    # Drop the previous model before loading the next one so two checkpoints
    # never have to sit on the device at the same time.
    model = tokenizer = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    try:
        # NOTE(review): trust_remote_code=True executes Python shipped in the model
        # repository (the Falcon download log warns about exactly this). Consider
        # enabling it only for checkpoints that need it and pinning a revision.
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).to(device)
        model.eval()

        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id

        # Generate samples (re-seeded per model so each one sees the same RNG stream).
        torch.manual_seed(SEED)
        generations = []
        generation_failed = False
        for prompt in prompts:
            try:
                generations.append(generate_text(model, tokenizer, prompt, max_length=50))
            except Exception as e:
                # Keep going, but surface the cause instead of hiding it.
                generation_failed = True
                print(f"⚠️ Generation failed for {name} on prompt {prompt!r}: {e}")
                generations.append("ERROR")

        # Perplexity
        try:
            perplexity = calculate_perplexity(model, tokenizer, texts[:20])
        except Exception as e:
            print(f"⚠️ Perplexity failed for {name}: {e}")
            perplexity = "Error"

        # ROUGE Score (compared to 3 references). Scoring "ERROR" placeholders
        # would report a misleading 0.0, so a failed generation marks the metric as an error.
        if generation_failed:
            rouge_score = "Error"
        else:
            try:
                rouge_score = rouge.compute(predictions=generations, references=references)['rougeL']
            except Exception as e:
                print(f"⚠️ ROUGE failed for {name}: {e}")
                rouge_score = "Error"

        results.append({
            "Model": name,
            "Perplexity": perplexity,
            "ROUGE-L": rouge_score,
            "Sample 1": generations[0],
            "Sample 2": generations[1],
            "Sample 3": generations[2],
        })

    except Exception as e:
        print(f"❌ Error loading model {name}: {e}")
        results.append({
            "Model": name,
            "Perplexity": "Load Failed",
            "ROUGE-L": "Load Failed",
            "Sample 1": "N/A",
            "Sample 2": "N/A",
            "Sample 3": "N/A",
        })


🚀 Loading: GPT-2


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.



🚀 Loading: TinyLLaMA


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]


🚀 Loading: GPT-Neo


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]


🚀 Loading: Falcon


tokenizer_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_falcon.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-rw-1b:
- configuration_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_falcon.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-rw-1b:
- modeling_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/2.62G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]


🚀 Loading: TinyGPT


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.51M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.51M [00:00<?, ?B/s]

In [None]:
# Display results
# Never truncate cell text: the generated samples are long and should be shown in full.
pd.set_option("display.max_colwidth", None)
# One row per evaluated model, built from the records collected by the evaluation loop.
df = pd.DataFrame(data=results)


In [None]:
# Metrics-only view of the results table.
summary_columns = ["Model", "Perplexity", "ROUGE-L"]
print("\n📊 Model Evaluation Summary:")
display(df[summary_columns])



📊 Model Evaluation Summary:


Unnamed: 0,Model,Perplexity,ROUGE-L
0,GPT-2,15.941955,0.119892
1,TinyLLaMA,5.272264,0.099534
2,GPT-Neo,14.862522,0.124668
3,Falcon,7.643232,0.0
4,TinyGPT,50329.824219,0.05119


In [None]:
# Text-only view: the three sampled continuations for each model.
sample_columns = ["Model", "Sample 1", "Sample 2", "Sample 3"]
print("\n📝 Sample Generations:")
display(df[sample_columns])


📝 Sample Generations:


Unnamed: 0,Model,Sample 1,Sample 2,Sample 3
0,GPT-2,"Once upon a time in a faraway land, there is a great, vast white lake called the Red Lake. The Red is a lake in the center of the entire western hemisphere; it's about to flood out. The lake was named, at","The future of artificial intelligence is a multi-faceted, complex topic of debate and debate, and AI has one very clear target on its mind: to change that world for the better.\n\n\nI have a lot of questions about a lot","In the middle of the dark forest, a huge wolf, not too far from where the villagers were standing.\n\nThis is the wolf who has come and attacked my village. What a stupid dog!""\n\nBut, the wolf didn't kill"
1,TinyLLaMA,"Once upon a time in a faraway land, there lived an enchanted woman known as Elsa. Her heart was pure gold and her magic…\nElizabeth: I've always been a bit different. But it wasn'","The future of artificial intelligence is not as dystopian as you might think, with opportunities for AI to enhance healthcare, safety, and education. Artificial Intelligence in Healthcare Healthcare organizations are exploring the","In the middle of the dark forest, a solitary figure sat, staring at the trees around him. The air was thick with the scent of pine and earth, and the rustling of leaves was the only sound that broke the"
2,GPT-Neo,"Once upon a time in a faraway land, an enemy who stood at the gate of a sacred place (like a great warrior) stood there, a man of about twelve, perhaps even twelve years old, as he, with an arm in front","The future of artificial intelligence is set to take shape soon.\n\nThe 2018 AI-to-human race will come closer to a real-world conclusion than it has been for a decade now, but its implications still remain unclear.\n\n�","In the middle of the dark forest, on a gentle hillside, and just one hour ago\nin the presence of a very dark figure in the form of a woman, lay a\nfellow, and it was impossible to make out what she"
3,Falcon,ERROR,ERROR,ERROR
4,TinyGPT,"Once upon a time in a faraway land, subst scalp TA Brew intermittent scalp conservation ONE directly confirhibit Observ hauledSceneRocket Rh pawn intermittentikenScenetingreement Moneyoho directly reviewing pawn dispatchJD pawnohomediately scalp Participation Danielpress vendors antibiotic Participation antibiotic",The future of artificial intelligence is DanielScene reviewing Hancockiken Money directly ONEJD conservationJD antibiotic Money Brewoho Money hauled Prob stairs Money circumcised Motorola004 substditpressdit Brewimura Habithibitoho Daniel antibioticimuraoother Brew Money ONEoother substatisf scalpSher,"In the middle of the dark forest, incarcer Tre incarcer bravery lined representationsozyg courtyardobl LatePros Televisionobl membership prayingozyg brutality incarcer prayingobl grandchildren braveryProsMini Pocket grandchildren Televisionived deflectGy Tre Boone skilletacious rubbing equate boilsived448 boils mutual brutality"
