# Repo Clone and Env Setup

## Clone the fast-detect-gpt repo

In [None]:
# Fetch the Fast-DetectGPT sources; git creates ./fast-detect-gpt in the current working directory.
!git clone https://github.com/baoguangsheng/fast-detect-gpt.git

Cloning into 'fast-detect-gpt'...
remote: Enumerating objects: 762, done.[K
remote: Counting objects: 100% (264/264), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 762 (delta 240), reused 209 (delta 209), pack-reused 498 (from 1)[K
Receiving objects: 100% (762/762), 226.69 MiB | 17.01 MiB/s, done.
Resolving deltas: 100% (574/574), done.
Updating files: 100% (503/503), done.


In [None]:
# Work from the repo root so later cells can use repo-relative paths (e.g. scripts/local_infer.py).
%cd fast-detect-gpt

/content/fast-detect-gpt


## Install dependencies

### Install dependent packages

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# scipy is listed explicitly because scripts/local_infer.py imports scipy.stats.
%pip install torch numpy scipy transformers datasets matplotlib tqdm openai nltk



In [None]:
# Install the Rust toolchain non-interactively (-y) via rustup.
# --proto '=https' --tlsv1.2 stops curl from following a downgrade to plain HTTP or old TLS
# before the downloaded installer is piped into sh.
!curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y

[1minfo:[0m downloading installer
[0m[1minfo: [0mprofile set to 'default'
[0m[1minfo: [0mdefault host triple is x86_64-unknown-linux-gnu
[0m[1minfo: [0msyncing channel updates for 'stable-x86_64-unknown-linux-gnu'
[0m[1minfo: [0mlatest update on 2025-09-18, rust version 1.90.0 (1159e78c4 2025-09-14)
[0m[1minfo: [0mdownloading component 'cargo'
[0m[1minfo: [0mdownloading component 'clippy'
[0m[1minfo: [0mdownloading component 'rust-docs'
[0m[1minfo: [0mdownloading component 'rust-std'
[0m[1minfo: [0mdownloading component 'rustc'
[0m[1minfo: [0mdownloading component 'rustfmt'
[0m[1minfo: [0minstalling component 'cargo'
[0m[1minfo: [0minstalling component 'clippy'
[0m[1minfo: [0minstalling component 'rust-docs'
 20.5 MiB /  20.5 MiB (100 %)   8.0 MiB/s in  2s
[0m[1minfo: [0minstalling component 'rust-std'
 27.8 MiB /  27.8 MiB (100 %)  10.3 MiB/s in  4s
[0m[1minfo: [0minstalling component 'rustc'
 78.7 MiB /  78.7 MiB (100 %)  11.1 MiB/s in  

### Modify local_infer.py for Colab

In [None]:
%%writefile scripts/local_infer.py
# Copyright (c) Guangsheng Bao.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import random
import numpy as np
import torch
import os
import glob
import argparse
import json
from model import load_tokenizer, load_model
from fast_detect_gpt import get_sampling_discrepancy_analytic
from scipy.stats import norm

# Considering balanced classification that p(D0) equals to p(D1), we have
# p(D1|x) = p(x|D1) / (p(x|D1) + p(x|D0))
def compute_prob_norm(x, mu0, sigma0, mu1, sigma1):
    """Posterior probability that ``x`` belongs to class D1 under two Gaussian models.

    With balanced classes (p(D0) == p(D1)) the posterior is

        p(D1|x) = p(x|D1) / (p(x|D1) + p(x|D0))
                = sigmoid(log p(x|D1) - log p(x|D0)).

    The log-space form is used because, for values of ``x`` far from both
    means, both densities underflow to 0.0 and the direct ratio evaluates to
    0/0 = nan. The sigmoid of the log-density difference stays finite there.

    Args:
        x: value (scalar or array-like) to classify.
        mu0, sigma0: mean and standard deviation of the D0 Gaussian.
        mu1, sigma1: mean and standard deviation of the D1 Gaussian.

    Returns:
        The probability p(D1|x), as returned by ``scipy.special.expit``
        (a NumPy scalar for scalar input). A nan input yields nan.
    """
    # Local import so the block does not depend on a new file-level import.
    from scipy.special import expit

    log_pdf0 = norm.logpdf(x, loc=mu0, scale=sigma0)
    log_pdf1 = norm.logpdf(x, loc=mu1, scale=sigma1)
    # expit(a - b) == exp(a) / (exp(a) + exp(b)), without ever forming exp(a) or exp(b).
    prob = expit(log_pdf1 - log_pdf0)
    return prob

class FastDetectGPT:
    """Wrapper that scores a text with a scoring model (and optional sampling model).

    The detector loads the tokenizer/model pair(s) named in ``args``, computes a
    criterion value for a text via ``get_sampling_discrepancy_analytic``, and
    maps that value to a probability using a pair of Gaussian parameters
    selected by the ``<sampling_model_name>_<scoring_model_name>`` key.
    """

    def __init__(self, args):
        """Load models and pick the Gaussian parameters for the model pair.

        Args:
            args: namespace providing ``scoring_model_name``,
                ``sampling_model_name``, ``device`` and ``cache_dir``.
        """
        self.args = args
        self.criterion_fn = get_sampling_discrepancy_analytic
        self.scoring_tokenizer = load_tokenizer(args.scoring_model_name, args.cache_dir)
        self.scoring_model = load_model(args.scoring_model_name, args.device, args.cache_dir)
        self.scoring_model.eval()
        # A separate sampling model is loaded only when it differs from the
        # scoring model; otherwise compute_crit reuses the scoring logits.
        if args.sampling_model_name != args.scoring_model_name:
            self.sampling_tokenizer = load_tokenizer(args.sampling_model_name, args.cache_dir)
            self.sampling_model = load_model(args.sampling_model_name, args.device, args.cache_dir)
            self.sampling_model.eval()

        # Gaussian parameters (mu0/sigma0 and mu1/sigma1) per model pair,
        # consumed by compute_prob.
        distrib_params = {
            'gpt-j-6B_gpt-neo-2.7B': {'mu0': 0.2713, 'sigma0': 0.9366, 'mu1': 2.2334, 'sigma1': 1.8731},
            'gpt-neo-2.7B_gpt-neo-2.7B': {'mu0': -0.2489, 'sigma0': 0.9968, 'mu1': 1.8983, 'sigma1': 1.9935},
            'falcon-7b_falcon-7b-instruct': {'mu0': -0.0707, 'sigma0': 0.9520, 'mu1': 2.9306, 'sigma1': 1.9039},
        }
        key = f'{args.sampling_model_name}_{args.scoring_model_name}'

        # Unknown model pairs do not raise KeyError: warn and reuse the
        # gpt-neo-2.7B parameters. NOTE(review): those parameters may not be
        # appropriate for other model pairs, so treat the probability with care.
        if key not in distrib_params:
            print(f"Warning: Key '{key}' not in distrib_params. Using 'gpt-neo-2.7B_gpt-neo-2.7B' as fallback.")
            key = 'gpt-neo-2.7B_gpt-neo-2.7B'

        self.classifier = distrib_params[key]

    # compute conditional probability curvature
    def compute_crit(self, text):
        """Compute the criterion value for ``text``.

        Args:
            text: input string to score.

        Returns:
            ``(crit, ntoken)`` where ``crit`` is whatever ``criterion_fn``
            returns and ``ntoken`` is the number of label positions (token
            count minus one). Returns ``(nan, 0)`` when the text yields no
            label positions (empty or single-token input).

        Raises:
            AssertionError: if the sampling tokenizer does not produce the
                same label tokens as the scoring tokenizer.
        """
        tokenized = self.scoring_tokenizer(text, truncation=True, return_tensors="pt", padding=True, return_token_type_ids=False).to(self.args.device)
        # Labels are the inputs shifted left by one; logits are trimmed to match below.
        labels = tokenized.input_ids[:, 1:]
        if labels.size(1) == 0:  # Handle empty or single-token text
            return float('nan'), 0
        with torch.no_grad():
            logits_score = self.scoring_model(**tokenized).logits[:, :-1]
            if self.args.sampling_model_name == self.args.scoring_model_name:
                logits_ref = logits_score
            else:
                ref_tokenized = self.sampling_tokenizer(text, truncation=True, return_tensors="pt", padding=True, return_token_type_ids=False).to(self.args.device)
                ref_labels = ref_tokenized.input_ids[:, 1:]
                # Compare shapes first: an elementwise == on tensors of
                # different lengths raises a broadcasting RuntimeError instead
                # of reporting the tokenizer mismatch.
                assert ref_labels.shape == labels.shape and torch.all(ref_labels == labels), "Tokenizer is mismatch."
                logits_ref = self.sampling_model(**ref_tokenized).logits[:, :-1]
            crit = self.criterion_fn(logits_ref, logits_score, labels)
        return crit, labels.size(1)

    # compute probability
    def compute_prob(self, text):
        """Map the criterion value of ``text`` to a probability.

        Args:
            text: input string to score.

        Returns:
            ``(prob, crit, ntoken)``. When ``crit`` is nan, ``prob`` is nan too
            and the Gaussian model is not evaluated.
        """
        crit, ntoken = self.compute_crit(text)
        if np.isnan(crit):
            return float('nan'), crit, ntoken
        mu0 = self.classifier['mu0']
        sigma0 = self.classifier['sigma0']
        mu1 = self.classifier['mu1']
        sigma1 = self.classifier['sigma1']
        prob = compute_prob_norm(crit, mu0, sigma0, mu1, sigma1)
        return prob, crit, ntoken

# --- NEW VERSION with Command-Line Text Input ---
if __name__ == '__main__':
    # Command-line interface: --text is mandatory, the remaining options are
    # plain string flags with defaults.
    cli = argparse.ArgumentParser()
    cli.add_argument('--text', type=str, required=True, help='Text to be analyzed.')
    string_options = (
        ('--sampling_model_name', "gpt-neo-2.7B"),
        ('--scoring_model_name', "gpt-neo-2.7B"),
        ('--device', "cuda"),
        ('--cache_dir', "../cache"),
    )
    for flag, default_value in string_options:
        cli.add_argument(flag, type=str, default=default_value)
    args = cli.parse_args()

    # Step 1: build the detector (loads tokenizer and model).
    print("Initializing detector...")
    detector = FastDetectGPT(args)
    print("Detector initialized.")

    # Step 2: score the text supplied on the command line; only the first
    # 100 characters are echoed back.
    print(f"\nAnalyzing text: '{args.text.strip()[:100]}...'")
    prob, crit, ntokens = detector.compute_prob(args.text)

    # Step 3: report. A nan criterion means the text could not be scored.
    print(f'\n--- Result ---')
    if not np.isnan(crit):
        print(f'Fast-DetectGPT criterion is {crit:.4f}')
        print(f'Probability of being machine-generated: {prob * 100:.0f}%')
    else:
        print(f'Could not analyze text. It might be too short or invalid.')

Overwriting scripts/local_infer.py


# Run Fast-DetectGPT on Google Colab

In [None]:
# Run the rewritten script on a short sample. Both model names are identical, so the script
# reuses the scoring model's logits as the reference instead of loading a second model.
!python scripts/local_infer.py --sampling_model_name "gpt-neo-2.7B" --scoring_model_name "gpt-neo-2.7B" --text "I went to the store this morning to buy some groceries. I wasn't sure what to make for dinner, so I just grabbed some chicken, vegetables, and pasta."

Initializing detector...
tokenizer_config.json: 100% 200/200 [00:00<00:00, 1.22MB/s]
config.json: 1.46kB [00:00, 7.48MB/s]
vocab.json: 798kB [00:00, 87.2MB/s]
merges.txt: 456kB [00:00, 113MB/s]
special_tokens_map.json: 100% 90.0/90.0 [00:00<00:00, 226kB/s]
Loading model EleutherAI/gpt-neo-2.7B...
`torch_dtype` is deprecated! Use `dtype` instead!
2025-10-20 19:20:34.321626: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-20 19:20:34.338547: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760988034.359661    1194 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one h