# Install sparse_dot_topn for fast sparse vector matching

https://github.com/ing-bank/sparse_dot_topn

In [None]:
!pip install /kaggle/input/sparse-dot-topn-033/sparse_dot_topn-0.3.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

# Define TextMatcher

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sparse_dot_topn import awesome_cossim_topn


class TextMatcher:
    """Find, for each input text, its most similar texts in a ground-truth set.

    Texts are vectorised as binary word uni-/bi-gram indicators (no IDF
    weighting) and compared with ``awesome_cossim_topn``, which keeps at most
    ``topk`` ground-truth candidates per input text.
    """

    def __init__(self, ground_truth, col, topk=5, lower_bound=-1):
        """Store the matching configuration.

        Args:
            ground_truth: DataFrame holding the reference texts in column ``col``.
            col: Name of the text column, used for both the ground truth and
                the frames passed to :meth:`match`.
            topk: Maximum number of ground-truth matches kept per text.
            lower_bound: Similarity threshold forwarded to
                ``awesome_cossim_topn``.
        """
        self.ground_truth = ground_truth
        # Tokens are words of 2+ characters or one of the punctuation marks . , !
        # min_df=2 drops n-grams that occur in fewer than two fitted documents.
        self.vec = TfidfVectorizer(ngram_range=(1, 2), analyzer="word", token_pattern=r"(?u)(\b\w\w+\b|[\.,!])",
                                   use_idf=False, min_df=2, binary=True)
        self.topk = topk
        self.lower_bound = lower_bound
        self.col = col

    def get_matches_df(self, sparse_matrix, texts):
        """Turn a (texts x ground truth) similarity matrix into a long table.

        Args:
            sparse_matrix: SciPy sparse matrix whose entry ``(i, j)`` is the
                score between ``texts[i]`` and ground-truth row ``j``.
            texts: Series of the matched texts; its name must equal
                ``self.col`` because the result is merged on that column.

        Returns:
            DataFrame with columns ``self.col``, ``'Ground Truth'`` and
            ``'match_score'``: one row per non-zero matrix entry, left-joined
            onto ``texts`` so that texts without any match keep a NaN row.
        """
        # Read rows, columns and scores from the same COO view so they always
        # stay aligned. Indexing ``sparse_matrix.data`` positionally next to
        # ``nonzero()`` goes out of step as soon as the matrix stores an
        # explicit zero, because ``nonzero()`` skips such entries.
        coo = sparse_matrix.tocoo()
        keep = coo.data != 0
        text_indices = coo.row[keep]
        gt_indices = coo.col[keep]
        match_score = coo.data[keep].astype(np.float64)

        # Vectorised lookups replace the per-element Python loop.
        left_side = np.asarray(texts.values, dtype=object)[text_indices]
        right_side = np.asarray(self.ground_truth[self.col].values, dtype=object)[gt_indices]

        res_df = pd.DataFrame({self.col: left_side,
                               'Ground Truth': right_side,
                               'match_score': match_score})

        res_df = pd.DataFrame(texts).merge(res_df, on=self.col, how="left")
        return res_df

    def match(self, texts_to_match, n_threads=16):
        """Match every text in ``texts_to_match`` against the ground truth.

        The vectoriser vocabulary is fitted on ``texts_to_match`` and then
        applied to the ground truth.

        Args:
            texts_to_match: DataFrame with the texts in column ``self.col``.
            n_threads: Number of threads passed to ``awesome_cossim_topn``.

        Returns:
            The long match table produced by :meth:`get_matches_df`.
        """
        print(f"Matching {texts_to_match.shape[0]} texts to {self.ground_truth.shape[0]} texts...")

        X = self.vec.fit_transform(texts_to_match[self.col])
        X_gt = self.vec.transform(self.ground_truth[self.col])

        sparse_sim = awesome_cossim_topn(X, X_gt.T, self.topk, self.lower_bound, use_threads=True, n_jobs=n_threads)

        return self.get_matches_df(sparse_sim, texts_to_match[self.col])

# Load data

In [None]:
# Essays to be scored; the submission cell reads the `text` column of this frame.
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
# Sample submission; the submission cell writes it out unchanged when `test` has 5 rows or fewer.
sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')

# Get kth similarity score

For each essay, the 30th-best match score among likely student essays and the 30th-best match score among likely LLM essays are calculated.

In [None]:
# Number of closest ground-truth essays kept per essay when matching.
TOPK = 30


def get_match_score(df, gt_filter_col):
    """Attach to each essay its lowest score among its kept matches.

    The ground truth is the subset of ``df`` whose boolean column
    ``gt_filter_col`` is true. Every essay in ``df`` is matched against it
    with at most ``TOPK`` matches kept, and the minimum kept score per
    distinct text is joined back onto ``df`` as ``match_score``.
    """
    ground_truth = df[df[gt_filter_col]].reset_index(drop=True)
    matcher = TextMatcher(ground_truth, "text", topk=TOPK)
    matches = matcher.match(df, n_threads=4)

    # The smallest of the kept scores is the TOPK-th best when TOPK matches exist.
    kth_scores = matches.groupby("text")["match_score"].min().reset_index()
    return kth_scores.merge(df, on="text")

def find_matchscore(df):
    """Compute per-essay match scores against likely students and likely LLMs.

    Essays are processed one ``prompt_id`` at a time, so an essay is only
    compared with essays written for the same prompt.

    Returns a DataFrame with columns ``id``, ``match_score_student`` and
    ``match_score_llm``.
    """
    prompt_ids = df["prompt_id"].unique()

    def scores_against(flag_col, score_name):
        # Score each prompt's essays against the subset flagged by `flag_col`.
        per_prompt = []
        for prompt_id in prompt_ids:
            scored = get_match_score(df[df["prompt_id"] == prompt_id], flag_col)
            per_prompt.append(scored.reset_index(drop=True)[["id", "match_score"]])
        return pd.concat(per_prompt).rename(columns={"match_score": score_name})

    student_scores = scores_against("likely_student", "match_score_student")
    llm_scores = scores_against("likely_llm", "match_score_llm")

    return student_scores.merge(llm_scores, on="id")

# Make submission

The negated ratio of the student match score to the smoothed LLM match score determines the ranking of essays.

In [None]:
import sys
import gc
import transformers
import datasets
import pandas as pd
import numpy as np
from datasets import Dataset
import os
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from transformers import AutoTokenizer
if len(test.text.values) <= 5:
    # With 5 essays or fewer, skip inference and write the sample submission as-is.
    # NOTE(review): presumably this covers the placeholder test set - confirm.
    sub.to_csv('submission.csv', index=False)
else:
    model_checkpoint = "/kaggle/input/detect-llm-models/distilroberta-finetuned_v5/checkpoint-9028"
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    def preprocess_function(examples):
        """Tokenize a batch of essays, padded and truncated to at most 512 tokens."""
        return tokenizer(examples['text'], max_length=512, padding=True, truncation=True)

    num_labels = 2
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Move the model to the GPU when one is available.
    model.to(device)
    trainer = Trainer(
        model,
        tokenizer=tokenizer,
    )

    # Reload the essays before adding columns to the frame.
    # NOTE(review): presumably so that re-running this cell starts from the raw columns - confirm.
    test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
    test_ds = Dataset.from_pandas(test)
    test_ds_enc = test_ds.map(preprocess_function, batched=True)
    test_preds = trainer.predict(test_ds_enc)
    logits = test_preds.predictions

    # Softmax over the two logits. Subtracting the row maximum first keeps
    # np.exp from overflowing to inf (and the ratio from becoming NaN) on
    # large logits; the result is mathematically unchanged.
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    # Keep the probability of class index 0.
    # NOTE(review): the thresholds below treat this as P(LLM-generated) - confirm the label order of the checkpoint.
    probs = (exp_shifted / np.sum(exp_shifted, axis=-1, keepdims=True))[:, 0]

    test["probs"] = probs
    # Confident predictions on either side serve as ground truth for text matching.
    test["likely_student"] = test["probs"] < 0.1
    test["likely_llm"] = test["probs"] > 0.9
    sub = find_matchscore(test)

    # SMOOTH keeps the denominator away from zero when the LLM match score is tiny.
    SMOOTH = 0.15
    sub["generated"] = -sub["match_score_student"] / (sub["match_score_llm"] + SMOOTH)
    sub.to_csv("submission.csv", index=False, columns=["id", "generated"])