In [1]:
# ============================
# 0. Install dependencies
# ============================
!pip install -q transformers accelerate pandas tqdm



In [2]:

# ============================
# 1. Imports & setup
# ============================
import torch
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from google.colab import files

# ----------------------------
# CONFIGURATION
# ----------------------------

# Model: you can switch to "google/flan-t5-base" if you get OOM errors
MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Name of the column in your CSV that contains the abstract text
ABSTRACT_COLUMN = "abstract"   # change if needed
TITLE_COLUMN = "title"         # set to None if you don't have titles

# Whether to include the title in the text sent to the model
INCLUDE_TITLE = True

# Max number of rows to process (set None to process all)
MAX_ROWS = None     # e.g., 200 for testing

# Device setup: use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


Using device: cpu


In [3]:

# ============================
# 1. Imports & setup
# ============================
import torch
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from google.colab import files

# ----------------------------
# CONFIGURATION
# ----------------------------

# A light, instruction-tuned causal model
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

ABSTRACT_COLUMN = "abstract"   # change if needed
TITLE_COLUMN = "title"         # set to None if you don't have titles
INCLUDE_TITLE = True

MAX_ROWS = None   # e.g., 50 for testing

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# ============================
# 2. Load model & tokenizer
# ============================
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Set a pad token if missing (common for causal models)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto" if device == "cuda" else None,
)
model.eval()
print("Model loaded.")


Using device: cpu
Loading model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Model loaded.


In [4]:
# ============================
# 3. Prompt template (simple one-line format)
# ============================

PROMPT_TEMPLATE = """You are a systematic review assistant.
Classify the following paper into one of these labels:

1 = Exclude
2 = Include ‚Äî no psychometrics reported
3 = Include ‚Äî with psychometrics reported

Definitions:
- Psychometrics include at least one of: Cronbach‚Äôs alpha, internal consistency, reliability, test‚Äìretest, ICC, factor analysis (EFA/CFA), validity (construct, convergent, discriminant), Rasch, IRT, measurement invariance.
- Reporting only means/SD or group comparisons is NOT psychometrics.
- If you are unsure whether psychometrics are reported, choose 2.

Return EXACTLY ONE LINE in this format:
<Label number> | <Confidence: High or Medium or Low> | <One short reason>

Example:
3 | High | Reports Cronbach alpha for the main scale.

Now classify this paper:

TEXT:
{paper_text}
"""

print("You can edit PROMPT_TEMPLATE above if needed.\n")

You can edit PROMPT_TEMPLATE above if needed.



In [5]:
# ============================
# 4. Helper: call the model
# ============================

def classify_text_with_model(paper_text: str, max_new_tokens: int = 64) -> str:
    prompt = PROMPT_TEMPLATE.format(paper_text=paper_text)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            top_p=None,
            temperature=0.0,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # We only care about the part AFTER the prompt
    # So we take the full text and just grab the last line
    full = generated.strip()
    lines = [l.strip() for l in full.splitlines() if l.strip()]
    return lines[-1]  # last non-empty line


def parse_model_output(raw_output: str):
    """
    Expect format:  3 | High | Reports Cronbach alpha.
    """
    try:
        text = raw_output.strip()
        # If model only returned "3", handle gracefully
        if text in ["1", "2", "3"]:
            return int(text), "Low", "Model returned only a numeric label."

        parts = text.split("|")
        if len(parts) < 3:
            return None, None, f"Could not split into 3 parts: {raw_output}"

        label_str = parts[0].strip()
        conf = parts[1].strip()
        reason = parts[2].strip()

        if label_str not in ["1", "2", "3"]:
            return None, None, f"Invalid label: {label_str}"

        if conf not in ["High", "Medium", "Low"]:
            conf = "Low"

        return int(label_str), conf, reason

    except Exception as e:
        return None, None, f"Exception during parsing: {e} | Raw: {raw_output}"

In [6]:

# ============================
# 5. Upload the CSV
# ============================
print("üìÅ Please upload your CSV file (with abstracts)...")
uploaded = files.upload()

if len(uploaded) == 0:
    raise RuntimeError("No file uploaded.")

csv_filename = list(uploaded.keys())[0]
print("Loaded file:", csv_filename)

df = pd.read_csv(csv_filename, encoding_errors="ignore")

if MAX_ROWS is not None:
    df = df.head(MAX_ROWS).copy()

print("Dataframe shape:", df.shape)
print("Columns:", df.columns.tolist())

if ABSTRACT_COLUMN not in df.columns:
    raise ValueError(f"Column '{ABSTRACT_COLUMN}' not found in CSV. Available columns: {df.columns.tolist()}")

# Create output columns
df["llm_label"] = None
df["llm_confidence"] = None
df["llm_reason"] = None
df["llm_raw_output"] = None
df["llm_error"] = None


üìÅ Please upload your CSV file (with abstracts)...


Saving psychometrics_papers.csv to psychometrics_papers.csv
Loaded file: psychometrics_papers.csv
Dataframe shape: (10, 4)
Columns: ['title', 'author', 'abstract', 'doi']


In [7]:

# ============================
# 6. Run classification
# ============================

for idx, row in tqdm(df.iterrows(), total=len(df)):
    # Build paper text (optionally include title)
    parts = []
    if INCLUDE_TITLE and TITLE_COLUMN and TITLE_COLUMN in df.columns:
        title = str(row.get(TITLE_COLUMN, "")).strip()
        if title:
            parts.append(title)
    abstract = str(row.get(ABSTRACT_COLUMN, "")).strip()
    if abstract:
        parts.append(abstract)

    paper_text = "\n\n".join(parts).strip()

    if not paper_text:
        df.at[idx, "llm_error"] = "Empty text"
        continue

    try:
        raw_output = classify_text_with_model(paper_text)
        df.at[idx, "llm_raw_output"] = raw_output

        label, confidence, reason_or_error = parse_model_output(raw_output)

        if label is None:
            df.at[idx, "llm_error"] = reason_or_error
        else:
            df.at[idx, "llm_label"] = label
            df.at[idx, "llm_confidence"] = confidence
            df.at[idx, "llm_reason"] = reason_or_error

    except Exception as e:
        df.at[idx, "llm_error"] = f"Model error: {e}"


  0%|          | 0/10 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [8]:

# ============================
# 7. Save & download result
# ============================
output_name = "papers_with_llm_classification.csv"
df.to_csv(output_name, index=False)
print(f"\n‚úÖ Done! Saved results to {output_name}")

files.download(output_name)



‚úÖ Done! Saved results to papers_with_llm_classification.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>