In [None]:
!pip install -q transformers accelerate datasets transformers


In [None]:
from google.colab import drive

# Mount Google Drive so later cells can copy results under this directory.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub. No token is passed in code, so the
# credential never appears in the notebook source.
login()

# 1- Generate pairs

In [None]:

# Import required modules
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import pandas as pd
from tqdm import tqdm

# Hub identifier of the chat-tuned TinyLlama checkpoint used for generation
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Load the weights in float16 and let device_map="auto" decide device placement
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Bundle model + tokenizer into the text-generation pipeline that the
# generation function calls as `generator`
generator = pipeline(task="text-generation", tokenizer=tokenizer, model=model)

# Generation function with a tqdm progress bar; samples from even-numbered
# iterations are additionally written to a .txt file
def generate_title_abstract_pairs(n=100, txt_path="/content/selected_pairs4.txt"):
    """Sample title/abstract pairs from the module-level ``generator`` pipeline.

    The same chat prompt is sent to ``generator`` ``n`` times with sampling
    enabled. A response is kept only if it contains both a ``Title:`` and an
    ``Abstract:`` marker and both extracted fields are non-empty.

    Args:
        n: Number of generation attempts. The returned frame can have fewer
            than ``n`` rows because unparsable responses are skipped.
        txt_path: Text file that is (re)created on every call. Pairs parsed
            on even-numbered iterations (``i % 2 == 0``) are appended to it,
            each followed by a line of 40 dashes.

    Returns:
        pandas.DataFrame with the columns ``title`` and ``abstract``, one row
        per successfully parsed response (possibly empty).
    """
    messages = [
        {"role": "system", "content": "You are a helpful academic assistant."},
        {
            "role": "user",
            "content": (
                "Generate a computer science research paper title and an abstract with at least 7 sentences.\n"
                "Format:\nTitle: ...\nAbstract: ..."
            ),
        },
    ]
    # Let the tokenizer render the prompt with the model's own chat template
    # instead of hand-writing turn delimiters, which are model specific.
    prompt = generator.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    results = []
    with open(txt_path, "w", encoding="utf-8") as txt_file:
        for i in tqdm(range(n), desc="Generating title–abstract pairs"):
            output = generator(
                prompt,
                max_new_tokens=512,
                do_sample=True,
                temperature=0.8,
                top_p=0.95
            )[0]["generated_text"]

            # The pipeline output is split on the assistant marker so that
            # only the text after the last marker (the reply) is parsed.
            response = output.split("<|assistant|>")[-1].strip()

            if "Title:" not in response or "Abstract:" not in response:
                continue

            # Both markers are present, so the [1] lookups below cannot fail.
            title = response.split("Title:")[1].split("Abstract:")[0].strip()
            abstract = response.split("Abstract:")[1].strip()
            if not title or not abstract:
                # A record with a missing half is not a usable pair.
                continue
            results.append({"title": title, "abstract": abstract})

            # Selection is by iteration index, not by count of parsed pairs.
            if i % 2 == 0:
                try:
                    txt_file.write(f"Title: {title}\n")
                    txt_file.write(f"Abstract: {abstract}\n")
                    txt_file.write("-" * 40 + "\n")
                except OSError as exc:
                    # Best effort: the pair is already in `results`; report
                    # the failed write instead of hiding it.
                    tqdm.write(f"Could not write sample {i} to {txt_path}: {exc}")
    # Explicit columns keep the schema stable even when nothing was parsed.
    return pd.DataFrame(results, columns=["title", "abstract"])

# Run 1000 generation attempts; the .txt path is defined once so the file
# that is written and the path that is reported cannot drift apart
txt_path = "/content/selected_pairs4.txt"
df = generate_title_abstract_pairs(n=1000, txt_path=txt_path)

# Save every parsed pair as CSV
csv_path = "/content/tinyllama_cs_pairs4.csv"
df.to_csv(csv_path, index=False)
print(f"\nCSV saved to: {csv_path}")
print(f" .txt with selected pairs saved to: {txt_path}")

# Show preview
df.head()


Copy the generated CSV to Google Drive

In [10]:
!cp /content/tinyllama_cs_pairs3.csv /content/drive/MyDrive/

In [None]:
import shutil

# Where the CSV should land on the mounted Drive, and where it was written locally
destination_path = '/content/drive/MyDrive/tinyllama_cs_pairs4.csv'
source_path = '/content/tinyllama_cs_pairs4.csv'

# Last expression of the cell: the value returned by shutil.copy is displayed
shutil.copy(src=source_path, dst=destination_path)