**Welcome to the AI-Generated Text Detection Notebook**


### Inspiration and Credits
This notebook is inspired by the work of s3nh, available at [this Kaggle project]( https://www.kaggle.com/code/hubert101/0-960-phrases-are-keys). I extend my gratitude to s3nh for sharing their insights and code.

lgb para : https://www.kaggle.com/code/zulqarnainali/explained-llm-model

## Importing Libraries


In [1]:
!pip install lightgbm
!pip install catboost




In [2]:
import sys
import gc

import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
import numpy as np
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier



## Loading Data

In [3]:
# data_path= '/kaggle/input/llm-detect-ai-generated-text/'
org_train = pd.read_csv(r"C:\Users\Admin\Desktop\ai_classifier\Final_submission\LLM-Detect-AI-Generated-Text\dataset\train_essays.csv")
test = pd.read_csv(r"C:\Users\Admin\Desktop\ai_classifier\Final_submission\LLM-Detect-AI-Generated-Text\dataset\test_essays.csv")
sub = pd.read_csv(r"C:\Users\Admin\Desktop\ai_classifier\Final_submission\LLM-Detect-AI-Generated-Text\dataset\submission.csv")
train = pd.read_csv(r"C:\Users\Admin\Desktop\ai_classifier\Final_submission\LLM-Detect-AI-Generated-Text\dataset\train_v2_drcat_02.csv",sep=',')

In [4]:
org_train
test

Unnamed: 0,id,prompt_id,text
0,0000aaaa,2,Aaa bbb ccc.
1,1111bbbb,3,Bbb ccc ddd.
2,2222cccc,4,CCC ddd eee.


## Removing Duplicate Rows

In [5]:
train = train.drop_duplicates(subset=['text'])
train.reset_index(drop=True, inplace=True)

## Configuration Parameters

In [6]:
LOWERCASE = False
VOCAB_SIZE = 30522

## Byte-Pair Encoding Tokenizer Training

In [7]:
# Creating Byte-Pair Encoding tokenizer
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
raw_tokenizer.normalizer = normalizers.Sequence([normalizers.NFC()] + [normalizers.Lowercase()] if LOWERCASE else [])
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)
dataset = Dataset.from_pandas(test[['text']])
def train_corp_iter(): 
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["text"]
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
tokenized_texts_test = []

for text in tqdm(test['text'].tolist()):
    tokenized_texts_test.append(tokenizer.tokenize(text))

tokenized_texts_train = []

for text in tqdm(train['text'].tolist()):
    tokenized_texts_train.append(tokenizer.tokenize(text))






  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/44868 [00:00<?, ?it/s]

## TF-IDF Vectorization

In [8]:
def dummy(text):
    return text
vectorizer = TfidfVectorizer(ngram_range=(3, 5), lowercase=False, sublinear_tf=True, analyzer = 'word',
    tokenizer = dummy,
    preprocessor = dummy,
    token_pattern = None, strip_accents='unicode')

vectorizer.fit(tokenized_texts_test)

# Getting vocab
vocab = vectorizer.vocabulary_

print(vocab)

vectorizer = TfidfVectorizer(ngram_range=(3, 5), lowercase=False, sublinear_tf=True, vocabulary=vocab,
                            analyzer = 'word',
                            tokenizer = dummy,
                            preprocessor = dummy,
                            token_pattern = None, strip_accents='unicode'
                            )

tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

del vectorizer
gc.collect()

{'ĠAaa Ġbbb Ġccc': 0, 'Ġbbb Ġccc .': 6, 'ĠAaa Ġbbb Ġccc .': 1, 'ĠBbb Ġccc Ġddd': 2, 'Ġccc Ġddd .': 7, 'ĠBbb Ġccc Ġddd .': 3, 'ĠCCC Ġddd Ġeee': 4, 'Ġddd Ġeee .': 8, 'ĠCCC Ġddd Ġeee .': 5}


23

In [9]:
len(tokenized_texts_train)

44868

In [10]:
y_train = train['label'].values
len(y_train)

44868

## Model Training and Prediction

In [11]:
clf = MultinomialNB(alpha=0.02)
#     clf2 = MultinomialNB(alpha=0.01)
sgd_model = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber") 
p6={'n_iter': 2500,
    'verbose': -1,
    'objective': 'cross_entropy',
    'metric': 'auc',
    'boosting_type':'dart',
    'learning_rate': 0.00581909898961407,
    
    'colsample_bytree': 0.78,
    'colsample_bynode': 0.8, 
    'lambda_l1': 4.562963348932286, 
    'lambda_l2': 2.97485, 
    'min_data_in_leaf': 115, 
    'max_depth': 23, 
    'max_bin': 898}

lgb=LGBMClassifier(**p6)
# cat=CatBoostClassifier(iterations=2000,
#                        verbose=0,
#                        l2_leaf_reg=6.6591278779517808,
#                        learning_rate=0.005599066836106983,
#                        subsample = 0.4,
#                        allow_const_label=True,
#                        loss_function = 'CrossEntropy')

weights = [0.03,0.37,0.6]

ensemble = VotingClassifier(estimators=[('mnb',clf),
                                        ('sgd', sgd_model),
                                        ('lgb',lgb)
#                                         ('cat', cat)
                                       ],
                            weights=weights, voting='soft', n_jobs=-1)
ensemble.fit(tf_train, y_train)
gc.collect()
# tf_test = vectorizer.transform(tokenized_texts_test)
final_preds = ensemble.predict_proba(tf_test)[:,1]
sub['generated'] = final_preds
sub.to_csv('submission.csv', index=False)
sub

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Unnamed: 0,id,generated
0,0000aaaa,0.391031
1,1111bbbb,0.391031
2,2222cccc,0.391031
