# Credit
Forked from: https://www.kaggle.com/code/rsuhara/ai-generated-text-detection-quick-baseline

Inspired by:<br>
https://www.kaggle.com/code/yekenot/llm-detect-by-regression

https://www.kaggle.com/code/xiaocao123/ai-generated-text-detection-add-new-data

For the training data we use the "RDizzl3 seven" dataset (the v2 file, `train_essays_RDizzl3_seven_v2.csv`), which can be found in the "LLM: 7 prompt training dataset": https://www.kaggle.com/datasets/carlmcbrideellis/llm-7-prompt-training-dataset

The following dataset is also added to the training data:

https://www.kaggle.com/datasets/thedrcat/daigt-proper-train-dataset

# Importing libraries

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score

# Switch for the training cell: True re-runs the voting-weight grid search,
# False fits the ensemble once with the `weights` given below.
isGridSearch = False

# Soft-voting weights handed to the VotingClassifier, in (lr, sgd) order.
# All weights were calculated by re-running the grid search.
# Previously tried values:
#   [0.10526315789473684, 0.8947368421052632]
#   [0.05, 0.95]
weights = [0, 1]



# Load datasets

In [2]:
# Essays to score; their ids and predictions end up in the submission file.
test = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")

# Base training essays.
train = pd.read_csv('/kaggle/input/llm-7-prompt-training-dataset/train_essays_RDizzl3_seven_v2.csv')

# Two extra corpora that are merged into `train` in later cells.
# (comma is already read_csv's default separator, so it is not spelled out)
external_df = pd.read_csv('/kaggle/input/daigt-external-dataset/daigt_external_dataset.csv')
addtrain1 = pd.read_csv('/kaggle/input/daigt-proper-train-dataset/train_drcat_04.csv')

## Add more data

In [3]:
# Keep only the label == 1 rows of the extra corpus (no sub-sampling; a
# .sample(3000) was tried and switched off), repeat them three times, keep the
# two modelling columns and strip newline characters out of the essay text.
positive_rows = addtrain1[addtrain1["label"] == 1]
add_train = (
    pd.concat([positive_rows] * 3)[["text", "label"]]
    .assign(text=lambda frame: frame["text"].str.replace('\n', ''))
)
add_train.head(5)

Unnamed: 0,text,label
0,"In recent years, technology has had a profoun...",1
4,I strongly believe that meditation and mindful...,1
9,One way school administrators can attempt to c...,1
11,While summer is meant as a break from the regu...,1
12,The use of Facial Action Coding System (FACS) ...,1


In [4]:
# Row count of the tripled extra training data.
add_train.shape[0]

43242

# Preprocess and merge datasets

In [5]:
# Reduce the external corpus to a two-column frame: the `source_text` column
# becomes `text` (with newline characters removed) and every row is given
# label 1. The former rename of a 'generated' column was dropped: only
# `source_text` is kept, so that rename could never affect the result.
# NOTE(review): assumes every `source_text` row belongs to the positive class - confirm against the dataset card.
external_df = (
    external_df[["source_text"]]
    .rename(columns={"source_text": "text"})
    .assign(
        text=lambda frame: frame["text"].str.replace('\n', ''),
        label=1,
    )
)
# Repeat the external rows three times, mirroring how `add_train` was built.
external_df = pd.concat([external_df] * 3)

# Final training frame: base essays, then external rows, then the extra label-1 rows.
# Row indices are not reset, so index values repeat; later cells slice by position only.
train = pd.concat([train, external_df, add_train])
train.head(5)

Unnamed: 0,text,label
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0


In [6]:
# Class balance of the merged training frame.
train.value_counts(subset="label")

label
1    53509
0    14247
Name: count, dtype: int64

# Preprocessing function

In [7]:
# Optional text clean-up, currently disabled and kept for reference only:
# def preprocess_text(text):
# #     text = text.lower()
#     text = re.sub(r'http\S+', '', text)  # Remove URLs
#     text = re.sub(r'\b\w{1,2}\b', '', text)  # Remove short words
#     return text

# external_df['text'] = external_df['text'].apply(preprocess_text)
# train['text'] = train['text'].apply(preprocess_text)
# test['text'] = test['text'].apply(preprocess_text)

# Stack the raw essay text of train and test (train rows first) into one Series
# so a single vectorizer can be fitted on both.
df = pd.concat([frame['text'] for frame in (train, test)])
df.head(10)

0    Cars. Cars have been around since they became ...
1    Transportation is a large necessity in most co...
2    "America's love affair with it's vehicles seem...
3    How often do you ride in a car? Do you drive a...
4    Cars are a wonderful thing. They are perhaps o...
5    The electrol college system is an unfair syste...
6    Dear state senator, It is the utmost respect t...
7    Fellow citizens, cars have become a major role...
8    "It's official: The electoral college is unfai...
9    The Electoral College has been kept for centur...
Name: text, dtype: object

# Feature extraction

In [8]:
# TF-IDF over word uni-, bi- and tri-grams with sublinear term-frequency scaling.
# The vocabulary is learned from the combined train + test text in `df`.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    ngram_range=(1, 3),
)
X = vectorizer.fit_transform(df)

# Model initialization

In [9]:
# Fixed seed so that repeated runs of the notebook train identical models;
# both estimators below shuffle the data internally when no seed is given.
SEED = 42

# Logistic regression using the liblinear solver.
lr_model = LogisticRegression(solver="liblinear", random_state=SEED)

# Linear classifier trained with SGD. The "modified_huber" loss is one of the
# SGD losses that exposes predict_proba, which soft voting relies on.
sgd_model = SGDClassifier(
    max_iter=1000,
    tol=1e-3,
    loss="modified_huber",
    random_state=SEED,
)

# Candidates that were tried and left out of the ensemble:
# rf_model = RandomForestClassifier(n_estimators=100)
# nb_model = MultinomialNB()

# Create the ensemble model

In [10]:
# Members of the soft-voting ensemble; the order must line up with `weights`.
# The random-forest ('rf', rf_model) and naive-Bayes ('nb', nb_model) members are switched off.
base_estimators = [
    ('lr', lr_model),
    ('sgd', sgd_model),
]

# Soft voting: the members' predicted probabilities are averaged using `weights`.
ensemble = VotingClassifier(
    estimators=base_estimators,
    voting='soft',
    weights=weights,
)

# Fit the ensemble (optionally grid-searching the voting weights)

In [11]:
# X was built from the train text followed by the test text, so the first
# n_train rows are the training essays and the remaining rows are the test essays.
n_train = train.shape[0]

if not isGridSearch:
    # Single fit with the voting weights chosen in the configuration cell.
    ensemble.fit(X[:n_train], train.label)
    preds_test = ensemble.predict_proba(X[n_train:])[:, 1]
else:
    # Candidate (lr, sgd) weight pairs that each sum to 1. The grid is built
    # without rebinding `weights`, so the configured value stays intact and the
    # ensemble cell can still be re-run afterwards.
    weight_combinations = [(w, 1 - w) for w in np.linspace(0, 1, 20)]

    # Define the parameter grid
    param_grid = {'weights': weight_combinations}

    # Score with ROC AUC on continuous outputs. The built-in "roc_auc" scorer
    # feeds predict_proba / decision_function values to the metric, whereas
    # make_scorer(roc_auc_score) with its defaults scores the hard labels
    # returned by predict(), which collapses the ROC curve to a single point.
    grid_search = GridSearchCV(estimator=ensemble,
                               param_grid=param_grid,
                               scoring="roc_auc",
                               cv=5)

    # Fit the grid search to the training rows
    grid_search.fit(X[:n_train], train.label)

    # Report the best weights, then predict with the grid search's refitted best estimator
    best_weights = grid_search.best_params_['weights']
    print(f"Best Weights: {best_weights}")
    preds_test = grid_search.predict_proba(X[n_train:])[:, 1]

# Predictions

In [12]:
# Pair each test essay id with its predicted probability for class 1 and
# write the result, without the index column, to submission.csv.
submission = pd.DataFrame({'id': test["id"], 'generated': preds_test})
submission.to_csv('submission.csv', index=False)