## Imports

In [6]:
import numpy as np
import pandas as pd
import regex as re
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

## Submission Flag

In [7]:
# Switch between local development (False) and a Kaggle run (True).
# The dataset paths and the submission path in this notebook branch on this flag.
is_submission = False

## Read Datasets

In [8]:
# Resolve dataset locations: local copies under ./data while developing,
# Kaggle input mounts when producing a submission.
# The local paths use forward slashes because those work on Windows, Linux and
# macOS alike; a backslash is an ordinary filename character on POSIX systems,
# so "data\file.csv" would not be found there.
if is_submission:
    train_path1 = "/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv"
    train_path2 = "/kaggle/input/llm-detect-ai-generated-text/train_essays.csv"
    test_path = "/kaggle/input/llm-detect-ai-generated-text/test_essays.csv"
else:
    train_path1 = "data/train_v2_drcat_02.csv"
    train_path2 = "data/train_essays.csv"
    test_path = "data/test_essays.csv"

# The first training file names its target column `label`; rename it to
# `generated` so both training frames share the same column name.
# rename() returns a new frame, so no in-place mutation is needed.
train_data1 = pd.read_csv(train_path1).rename(columns={'label': 'generated'})
train_data2 = pd.read_csv(train_path2)
test_data = pd.read_csv(test_path)

In [9]:
# Preview the first training frame (read from train_path1) with all of its columns.
train_data1

Unnamed: 0,text,generated,prompt_name,source,RDizzl3_seven
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False
...,...,...,...,...,...
44863,"Dear Senator,\n\nI am writing to you today to ...",1,Does the electoral college work?,kingki19_palm,True
44864,"Dear Senator,\n\nI am writing to you today to ...",1,Does the electoral college work?,kingki19_palm,True
44865,"Dear Senator,\n\nI am writing to you today to ...",1,Does the electoral college work?,kingki19_palm,True
44866,"Dear Senator,\n\nI am writing to you today to ...",1,Does the electoral college work?,kingki19_palm,True


In [14]:
# Show only the essay text and its target column.
train_data1.loc[:, ['text', 'generated']]

Unnamed: 0,text,generated
0,Phones\n\nModern humans today are always on th...,0
1,This essay will explain if drivers should or s...,0
2,Driving while the use of cellular devices\n\nT...,0
3,Phones & Driving\n\nDrivers should not be able...,0
4,Cell Phone Operation While Driving\n\nThe abil...,0
...,...,...
44863,"Dear Senator,\n\nI am writing to you today to ...",1
44864,"Dear Senator,\n\nI am writing to you today to ...",1
44865,"Dear Senator,\n\nI am writing to you today to ...",1
44866,"Dear Senator,\n\nI am writing to you today to ...",1


In [18]:
# Stack both labelled sources into a single training frame.
# ignore_index=True gives the result a fresh 0..n-1 index instead of repeating
# each source frame's own row labels.
train = pd.concat(
    [train_data1[['text', 'generated']], train_data2[['text', 'generated']]],
    ignore_index=True,
)

# Replace line breaks with a space instead of deleting them: deleting glues the
# last word before a break to the first word after it
# ("Phones\n\nModern" -> "PhonesModern"), which corrupts word-level n-grams.
# regex=False makes this a literal replacement regardless of pandas version.
train['text'] = train['text'].str.replace('\n', ' ', regex=False)
test_data['text'] = test_data['text'].str.replace('\n', ' ', regex=False)

# Class balance before any resampling.
train['generated'].value_counts()

generated
0    28746
1    17500
Name: count, dtype: int64

In [31]:
# Balance the two classes by randomly dropping rows of the majority class.
# random_state is fixed so the same rows are kept on every run.
rus = RandomUnderSampler(random_state=42)

# The sampler expects a 2-D feature array, so the text column is reshaped into a
# single column. Labels are passed as a plain 1-D array (the conventional target
# shape) rather than as a column vector.
train_text, train_label = rus.fit_resample(
    train['text'].to_numpy().reshape(-1, 1),
    train['generated'].to_numpy(),
)
print('0: ', np.count_nonzero(train_label == 0))
print('1: ', np.count_nonzero(train_label == 1))

# Flatten back to one value per row and rebuild a two-column frame.
data = {'text': train_text.reshape(-1), 'generated': train_label.reshape(-1)}
train_data = pd.DataFrame(data)

0:  17500
1:  17500


## TF-IDF Features

In [32]:
def _word_tokens(text):
    """Return every maximal run of word characters found in `text`."""
    return re.findall(r'[^\W]+', text)


# TF-IDF over n-grams of 3 to 5 tokens. token_pattern=None because a custom
# tokenizer is supplied; accents are stripped with the 'unicode' method.
vectorizer = TfidfVectorizer(
    tokenizer=_word_tokens,
    token_pattern=None,
    strip_accents='unicode',
    ngram_range=(3, 5),
)

# Fit the vocabulary and weights on the training text only, then apply the
# already-fitted vectorizer to the test text.
vectorized_train_data = vectorizer.fit_transform(train_data["text"])
vectorized_test_data = vectorizer.transform(test_data["text"])

## Create Model

In [33]:
# Seed shared by the stochastic estimators so that repeated runs of the notebook
# train the same model. SGD shuffles the data each epoch, and early_stopping=True
# additionally holds out a random validation split, so without a seed every run
# produces different weights.
SEED = 42

lr_model = LogisticRegression()
clf_model = MultinomialNB(alpha=0.02)

# Three SGD variants that differ in iteration budget, tolerance, class weighting
# and early stopping. loss="modified_huber" is one of the SGD losses that
# provides predict_proba, which soft voting requires.
sgd_model1 = SGDClassifier(max_iter=8000, tol=1e-3, loss="modified_huber",
                           random_state=SEED)
sgd_model2 = SGDClassifier(max_iter=10000, tol=5e-4, loss="modified_huber",
                           class_weight="balanced", random_state=SEED)
sgd_model3 = SGDClassifier(max_iter=15000, tol=3e-4, loss="modified_huber",
                           early_stopping=True, random_state=SEED)

# Soft voting: the members' class probabilities are combined using the weights
# below (all equal here).
ensemble = VotingClassifier(
    estimators=[
        ('lr', lr_model),
        ('mnb', clf_model),
        ('sgd1', sgd_model1),
        ('sgd2', sgd_model2),
        ('sgd3', sgd_model3),
    ],
    weights=[0.20, 0.20, 0.20, 0.20, 0.20],
    voting='soft',
)

## Fit Model

In [39]:
# Number of rows in the training frame that will be fitted.
train_data.shape[0]

35000

In [40]:
# Fit every member of the ensemble on the TF-IDF training matrix.
# The matrix was built from train_data["text"], so it already has exactly
# train_data.shape[0] rows; slicing it to that length changed nothing and only
# forced a full copy of a large sparse matrix.
ensemble.fit(vectorized_train_data, train_data["generated"])

KeyboardInterrupt: 

## Predict Test Set

In [36]:
# Score each test essay with the ensemble's probability for the positive class.
# The ensemble uses soft voting, so predict_proba is available; column 1 is the
# class labelled 1 because classes_ is kept in sorted order ([0, 1]).
# Continuous scores preserve the ranking between essays, whereas the hard 0/1
# labels from predict() collapse all essays into two tied groups.
# NOTE(review): the competition referenced by the Kaggle paths is understood to
# be scored on ROC AUC, where this matters - confirm against the competition rules.
predictions = ensemble.predict_proba(vectorized_test_data)[:, 1]

## Create Submission

In [38]:
# Two-column submission: the test essay id and its predicted value.
submission = pd.DataFrame({'id': test_data["id"], 'generated': predictions})

# The local path uses a forward slash so it resolves on Windows, Linux and macOS
# (a backslash is an ordinary filename character on POSIX systems).
submission_path = "/kaggle/working/submission.csv" if is_submission else "data/submission.csv"
submission.to_csv(submission_path, index=False)