In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn warnings that can point at
# real problems (e.g. chained assignment, convergence); consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
import gc
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from datasets import Dataset
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

In [None]:
# Directory holding the competition CSV files. The default is the original
# author's local folder; set the LLM_DETECT_DATA_DIR environment variable to
# point the notebook at another location without editing this cell.
DATA_DIR = Path(os.environ.get(
    "LLM_DETECT_DATA_DIR",
    r"C:\Users\Admin\Desktop\ai_classifier\Final_submission\LLM-Detect-AI-Generated-Text\dataset",
))

# test  : essays to score
# sub   : submission frame; its 'generated' column is filled in at the end
# train : labelled essays used to fit the models
test = pd.read_csv(DATA_DIR / "test_essays.csv")
sub = pd.read_csv(DATA_DIR / "submission.csv")
train = pd.read_csv(DATA_DIR / "train_v2_drcat_02.csv", sep=',')

In [None]:
# Remove repeated essays, renumber the rows from 0, and add a binary `contains`
# column: 1 when the essay text includes both the substring 'between' and the
# substring 'thing' (so 'things', 'nothing', ... also match), else 0. The original
# author observed that human writers use these words more often.
train = (
    train
    .drop_duplicates(subset=['text'])
    .reset_index(drop=True)
    .assign(
        contains=lambda frame: frame['text'].apply(
            lambda essay: int('between' in essay and 'thing' in essay)
        )
    )
)
train.head()

In [None]:
# Essays written for these prompts are dropped; the original author found that
# excluding them scored better on the leaderboard.
excluded_prompt_name_list = ['Distance learning','Grades for extracurricular activities','Summer projects']

# Take an explicit copy of the filtered rows before adding a column, so the
# assignment never targets a view of the unfiltered frame (chained assignment).
keep_mask = ~train['prompt_name'].isin(excluded_prompt_name_list)
train = train.loc[keep_mask].copy()

# Recompute the flag and de-duplicate again so this cell stays self-contained
# and can be re-run on its own, then renumber the remaining rows from 0.
train['contains'] = train['text'].apply(lambda x: 1 if 'between' in x and 'thing' in x else 0)
train = train.drop_duplicates(subset=['text']).reset_index(drop=True)

In [None]:
# Preview the first few test essays instead of displaying the whole array.
test['text'].head()

In [None]:
# Tokenizer configuration.
# LOWERCASE  : consulted when the BPE tokenizer's normalizer is built; it is False here.
# VOCAB_SIZE : value passed as `vocab_size` to the BPE trainer.
LOWERCASE = False
VOCAB_SIZE = 14_000_000

In [None]:
# Creating Byte-Pair Encoding tokenizer
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))


# Normalization: Unicode NFC is always applied; lowercasing only when LOWERCASE
# is set. The steps are built explicitly because the previous one-liner
# `[NFC()] + [Lowercase()] if LOWERCASE else []` parses as
# `([NFC()] + [Lowercase()]) if LOWERCASE else []`, which silently dropped NFC
# whenever LOWERCASE was False.
normalizer_steps = [normalizers.NFC()]
if LOWERCASE:
    normalizer_steps.append(normalizers.Lowercase())
raw_tokenizer.normalizer = normalizers.Sequence(normalizer_steps)


# Byte-level pre-tokenization
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# Adding special tokens and creating trainer instance
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(
    vocab_size=VOCAB_SIZE,
    special_tokens=special_tokens
)


# Creating huggingface dataset object from the test essays' text column
dataset = Dataset.from_pandas(test[['text']])

def train_corp_iter(batch_size=1000):
    """
    Yield the "text" column of the module-level `dataset` in consecutive batches.

    Args:
        batch_size: Maximum number of rows per batch. Defaults to 1000; the
            final batch may be shorter.

    Yields:
        The value of `dataset[start : start + batch_size]["text"]` for each
        successive window over `dataset`.

    Raises:
        ValueError: If `batch_size` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    for start in range(0, len(dataset), batch_size):
        yield dataset[start : start + batch_size]["text"]

# Learn the BPE vocabulary from the batches produced by train_corp_iter().
# REMEMBER: that corpus is built from the *test* essays, not the training set.
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

# Expose the trained tokenizer through the transformers fast-tokenizer wrapper,
# registering each special token under its role.
special_token_roles = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
}
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    **special_token_roles,
)



# Split every test essay into BPE tokens with the freshly trained tokenizer
tokenized_texts_test = [
    tokenizer.tokenize(essay) for essay in tqdm(test['text'].tolist())
]


# Same treatment for the training essays
tokenized_texts_train = [
    tokenizer.tokenize(essay) for essay in tqdm(train['text'].tolist())
]

In [None]:
# Show the token lists of the second and third test essays, separated by a
# blank line. Slicing (instead of indexing [1] and [2]) keeps this cell from
# raising IndexError when the test set has fewer than three rows.
print(*tokenized_texts_test[1:3], sep="\n\n")

In [None]:
def dummy(text):
    """
    Identity function: hand back the argument untouched.

    Passed to TfidfVectorizer as both `tokenizer` and `preprocessor`, because
    the essays reaching the vectorizer are already lists of tokens.
    """
    return text



# Settings shared by both vectorizers below. The essays are already lists of
# BPE tokens, so `dummy` is supplied as tokenizer and preprocessor and no
# token_pattern is used; features are 3- to 5-grams of tokens.
tfidf_kwargs = dict(
    ngram_range   = (3, 5),
    lowercase     = False,
    sublinear_tf  = True,
    analyzer      = 'word',
    tokenizer     = dummy,
    preprocessor  = dummy,
    token_pattern = None,
    strip_accents = 'unicode',
)

# Pass 1: fit on the test essays only, to collect the n-grams that occur there.
vectorizer = TfidfVectorizer(**tfidf_kwargs)
vectorizer.fit(tokenized_texts_test)

# Getting vocab (report its size only; the full mapping is far too large to print)
vocab = vectorizer.vocabulary_
print(f"n-grams in test vocabulary: {len(vocab):,}")


# Pass 2: refit on the train essays with the vocabulary fixed to the test
# n-grams, so train and test share the same feature columns.
# min_df / max_df are deliberately not passed: scikit-learn ignores both when
# `vocabulary` is supplied, and the integer `min_df=0` used previously may be
# rejected by the parameter validation of newer scikit-learn releases.
vectorizer = TfidfVectorizer(vocabulary=vocab, **tfidf_kwargs)

tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

# The fitted vectorizer is no longer needed; release its memory.
del vectorizer
gc.collect()

In [None]:
# Training targets: the 'label' column of `train` as an array, in row order.
y_train_label = train['label'].values

In [None]:
# Display the TF-IDF matrix built from the training essays.
tf_train

In [None]:
# Shape of the training TF-IDF matrix.
tf_train.shape

In [None]:
# Shape of the test TF-IDF matrix.
tf_test.shape

In [None]:
# Branch on the size of the test file: with at most 5 test rows the loaded
# `sub` frame is written back unchanged and no model is trained; otherwise the
# ensemble below is built and fitted.
if len(test.text.values) <= 5:
    sub.to_csv('submission.csv', index=False)
else:
    # Multinomial naive Bayes on the sparse TF-IDF features, with a small
    # additive-smoothing term (alpha).
    clf = MultinomialNB(alpha=0.0225)
    
    # Linear model trained with SGD using the "modified_huber" loss.
    sgd_model = SGDClassifier(
        max_iter     = 9000, 
        tol          = 1e-4, 
        random_state = 6743,
        loss         = "modified_huber"
    ) 
    
    # LightGBM hyper-parameters; 'n_iter' is one of LightGBM's aliases for the
    # number of boosting iterations.
    p={
        'verbose'          : -1,
        'n_iter'           : 3000,
        'colsample_bytree' : 0.7800,
        'colsample_bynode' : 0.8000, 
        'random_state'     : 6743,
        'metric'           : 'auc',
        'objective'        : 'cross_entropy',
        'learning_rate'    : 0.00581909898961407, 
      }

    # LightGBM gradient-boosted trees configured from `p`.
    lgb=LGBMClassifier(**p)
    
    # CatBoost gradient-boosted trees trained with the CrossEntropy loss.
    cat = CatBoostClassifier(
        iterations        = 3000,
        verbose           = 0,
        subsample         = 0.35,
        random_seed       = 6543,
        allow_const_label = True,
        loss_function     = 'CrossEntropy',
        learning_rate     = 0.005599066836106983,
    )
    
    
    # Soft-voting ensemble: the four models' predicted class probabilities are
    # combined using the relative weights below (listed in estimator order).
    ensemble = VotingClassifier(
        estimators = [('mnb', clf),
                      ('sgd', sgd_model),
                      ('lgb', lgb), 
                      ('cat', cat)],
        weights    = [0.1, 0.31, 0.28, 0.67], 
        voting     = 'soft', 
        n_jobs     = -1
    )
    
    # Fit every member on the training TF-IDF matrix, then free garbage.
    ensemble.fit(tf_train, y_train_label)
    gc.collect()
    
    # Ensemble probability of the positive class (column 1 of predict_proba) for
    # every test essay. Computed once: the previous loop re-ran predict_proba on
    # the whole test matrix for each essay that reached its `else` branch.
    final_preds = ensemble.predict_proba(tf_test)[:, 1]

    # Per-essay overrides, in the priority order of the original if/elif chain:
    #   text contains two consecutive single quotes ('')  -> 0
    #   text contains a curly apostrophe (’)              -> 1
    #   text contains "duh"                               -> 1
    #   otherwise                                         -> ensemble probability
    # NOTE: the previous code wrote `sub['generated']==0` / `==1`, which are
    # comparisons whose result was discarded, so these rules never took effect.
    # They are real assignments now, which changes the submitted values for
    # matching essays; delete the overrides to reproduce earlier submissions.
    # Assumes `sub` has one row per test essay, in the same order as `test`.
    essay_text = test['text']
    has_quote_pair = essay_text.str.contains("''", regex=False, na=False).to_numpy()
    has_curly_apostrophe = essay_text.str.contains("’", regex=False, na=False).to_numpy()
    has_duh = essay_text.str.contains("duh", regex=False, na=False).to_numpy()

    # np.select picks the first matching condition per row, mirroring if/elif.
    sub['generated'] = np.select(
        [has_quote_pair, has_curly_apostrophe | has_duh],
        [0.0, 1.0],
        default=final_preds,
    )
    sub.to_csv('submission.csv', index=False)
    sub.head()