In [None]:
!pip install scikit-learn pandas numpy nltk tensorflow spacy tqdm

In [None]:
import csv
import json
import re
import string

import joblib
import nltk
import numpy as np
import pandas as pd
import spacy
import tensorflow as tf
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import f1_score, matthews_corrcoef
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Conv1D, Dense, Dropout, Flatten, MaxPooling1D
from tensorflow.keras.models import Sequential
from tqdm import tqdm

In [None]:
def train_dev_jsonl_to_csv(jsonl_file, csv_file):
    """Convert a labelled JSONL file into a CSV with columns id, text, label.

    Every non-blank line of ``jsonl_file`` is parsed as one JSON object and
    only its ``id``, ``text`` and ``label`` values are written; any other keys
    are dropped.

    Args:
        jsonl_file: Path of the UTF-8 JSONL file to read.
        csv_file: Path of the CSV file to create (overwritten if it exists).

    Raises:
        FileNotFoundError: If ``jsonl_file`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the three columns.
    """
    csv_columns = ['id', 'text', 'label']

    # Open the input first so a missing/unreadable source does not truncate
    # an existing output file.
    with open(jsonl_file, 'r', encoding='utf-8') as source, \
            open(csv_file, 'w', newline='', encoding='utf-8') as target:
        writer = csv.DictWriter(target, fieldnames=csv_columns)
        writer.writeheader()

        for line in source:
            # Blank lines (e.g. a stray trailing newline) are not records.
            if not line.strip():
                continue
            record = json.loads(line)
            writer.writerow({key: record[key] for key in csv_columns})

def test_jsonl_to_csv(jsonl_file, csv_file):
    csv_columns = ['id', 'text']
    
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()
        
        with open(jsonl_file, 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                writer.writerow(data)

In [None]:
# Input (JSONL) and output (CSV) file names for each split
en_train_jsonl_file = 'en_train.jsonl'
en_train_csv_file = 'en_train.csv'

en_dev_jsonl_file = 'en_dev.jsonl'
en_dev_csv_file = 'en_dev.csv'

en_test_jsonl_file = 'en_devtest_text_id_only.jsonl'
en_test_csv_file = 'en_devtest.csv'

# Convert train, dev, then test (same order as before); the test split has no
# labels, so it goes through the id/text-only converter.
for convert, jsonl_path, csv_path in (
    (train_dev_jsonl_to_csv, en_train_jsonl_file, en_train_csv_file),
    (train_dev_jsonl_to_csv, en_dev_jsonl_file, en_dev_csv_file),
    (test_jsonl_to_csv, en_test_jsonl_file, en_test_csv_file),
):
    convert(jsonl_path, csv_path)
    print(f"Data successfully written to {csv_path}")

# add datasets
# Read back from the same variables the converters wrote to, so the paths
# cannot drift apart if a file name is changed above.
train = pd.read_csv(en_train_csv_file)
dev = pd.read_csv(en_dev_csv_file)

In [None]:
# Store the label column of both splits as float
for frame in (train, dev):
    frame['label'] = frame['label'].astype(float)

In [None]:
train

In [None]:
dev

In [None]:
class TextPreprocessor:
    """Add cleaned, lemmatized, stemmed and POS-tag text columns to a DataFrame.

    The DataFrame handed to the constructor is modified in place by
    ``process_data``; ``get_processed_data`` returns that same object.
    """

    def __init__(self, data):
        """Keep a reference to ``data`` and load the spaCy pipeline and stemmer.

        Args:
            data: DataFrame with a ``text`` column (read by ``process_data``).
        """
        # Fix: this method was spelled `_init_`, which Python never invokes,
        # so `TextPreprocessor(df)` raised TypeError and no attribute was set.
        self.data = data
        self.nlp = spacy.load('en_core_web_sm')
        self.stemmer = SnowballStemmer('english')

    def clean_text(self, text):
        """Clean the input text by removing URLs, mentions, hashtags, numbers, punctuations, etc.

        Args:
            text: Raw text. ``None``/NaN (how pandas represents an empty CSV
                field) is treated as empty; other non-string values are
                converted with ``str``.

        Returns:
            The lower-cased, whitespace-normalised text, or ``"no text"`` when
            nothing is left after cleaning.
        """
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return "no text"
        text = str(text)

        text = re.sub(r"@\S+", "", text)             # @mentions
        text = re.sub(r"http[s]?\://\S+", "", text)  # URLs
        text = re.sub(r"#\S+", "", text)             # hashtags
        text = re.sub(r"[0-9]", "", text)            # digits
        text = re.sub(r"[\[\]()]", "", text)         # brackets / parentheses
        # Newlines are deliberately NOT deleted here: removing them outright
        # fused the last word of one line with the first word of the next.
        # The whitespace collapse below turns them into single spaces.
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = re.sub(r'[^\w\s]', '', text)          # remaining non-ASCII punctuation/symbols
        text = text.lower()
        text = re.sub(r"\s+", " ", text).strip()
        return text if text else "no text"

    def lemmatize_sentence(self, sentence):
        """Apply lemmatization to a sentence using SpaCy.

        Returns the token lemmas joined by single spaces.
        """
        doc = self.nlp(sentence)
        return " ".join(token.lemma_ for token in doc)

    def stem_sentence(self, sentence):
        """Apply stemming to a sentence using NLTK's Snowball Stemmer.

        The sentence is split on whitespace and each word is stemmed
        independently; the stems are joined by single spaces.
        """
        return " ".join(self.stemmer.stem(word) for word in sentence.split())

    def pos_tagging(self, sentence):
        """Perform POS tagging on a sentence using SpaCy.

        Returns the coarse POS tag (``token.pos_``) of every token, joined by
        single spaces.
        """
        doc = self.nlp(sentence)
        return " ".join(token.pos_ for token in doc)

    def process_data(self):
        """Process the entire DataFrame, applying cleaning, lemmatization, stemming, and POS tagging.

        Adds the columns ``clean_text``, ``lemmatized_text``, ``stemmed_text``,
        ``pos`` and ``combined_text`` (the other four joined by spaces) to
        ``self.data`` in place.
        """
        # Clean the text
        self.data['clean_text'] = self.data['text'].apply(self.clean_text)

        # Lemmas and POS tags come from the same spaCy parse, so run the
        # pipeline once per sentence instead of once per derived column.
        lemmatized_text, pos_tags = [], []
        for sentence in tqdm(self.data['clean_text'], desc='Lemmatizing / POS tagging'):
            doc = self.nlp(sentence)
            lemmatized_text.append(" ".join(token.lemma_ for token in doc))
            pos_tags.append(" ".join(token.pos_ for token in doc))
        self.data['lemmatized_text'] = lemmatized_text

        # Stem the text
        self.data['stemmed_text'] = [
            self.stem_sentence(sentence)
            for sentence in tqdm(self.data['clean_text'], desc='Stemming')
        ]

        self.data['pos'] = pos_tags

        # Fix: this previously read from an undefined global `data` (NameError)
        # instead of the instance's own DataFrame.
        self.data['combined_text'] = (
            self.data['clean_text'] + ' ' + self.data['lemmatized_text'] + ' '
            + self.data['stemmed_text'] + ' ' + self.data['pos']
        )

    def get_processed_data(self):
        """Return the processed DataFrame with clean_text, lemmatized_text, stemmed_text, and pos columns."""
        return self.data

In [None]:
# One preprocessor per split; each one adds its derived text columns to the
# DataFrame it was given.
train_preprocessor, dev_preprocessor = TextPreprocessor(train), TextPreprocessor(dev)

for preprocessor in (train_preprocessor, dev_preprocessor):
    preprocessor.process_data()

In [None]:
# Fetch the processed frames, then (re)build the single text field used for
# vectorization: cleaned + lemmatized + stemmed + POS tags, space separated.
train_data = train_preprocessor.get_processed_data()
dev_data = dev_preprocessor.get_processed_data()

for frame in (train_data, dev_data):
    frame['combined_text'] = frame['clean_text'] + ' ' + frame['lemmatized_text'] + ' ' + frame['stemmed_text'] + ' ' + frame['pos']

In [None]:
train_data

In [None]:
dev_data

In [None]:
# Feature/label splits consumed by every model cell below.
# NOTE(review): no cell in this notebook defines train_x / valid_x / test_x or
# their labels, so "Restart & Run All" stopped here with a NameError. They are
# rebuilt as a stratified 80/20 hold-out of the training set (test) plus the dev
# set (validation) -- confirm this matches the split behind the reported results.
train_x, test_x, train_y, test_y = train_test_split(
    train_data['combined_text'],
    train_data['label'],
    test_size=0.2,
    random_state=42,
    stratify=train_data['label'],
)
valid_x, valid_y = dev_data['combined_text'], dev_data['label']

# TF-IDF Vectorization using all combined text features
# The vocabulary is fitted on the training texts only and reused for the others.
tfidf = TfidfVectorizer(max_features=5000)
train_x_tfidf = tfidf.fit_transform(train_x).toarray()
valid_x_tfidf = tfidf.transform(valid_x).toarray()
test_x_tfidf = tfidf.transform(test_x).toarray()

# Convert labels to numpy arrays (the K-fold cells index them positionally)
train_y = np.array(train_y)
valid_y = np.array(valid_y)
test_y = np.array(test_y)

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((train_x_tfidf, train_y)).batch(64).prefetch(tf.data.AUTOTUNE)
valid_dataset = tf.data.Dataset.from_tensor_slices((valid_x_tfidf, valid_y)).batch(64).prefetch(tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices((test_x_tfidf, test_y)).batch(64).prefetch(tf.data.AUTOTUNE)

In [None]:
def build_model(input_shape=None):
    """Build and compile the CNN model.

    Two Conv1D + MaxPooling1D stages, then Flatten, a 128-unit dense layer,
    dropout of 0.5 and a single sigmoid output; compiled with Adam and binary
    cross-entropy.

    Args:
        input_shape: ``(steps, channels)`` of one sample. Defaults to
            ``(train_x_tfidf.shape[1], 1)``, i.e. one step per TF-IDF feature
            with a single channel.

    Returns:
        The compiled Keras model.
    """
    if input_shape is None:
        # Previously read from `train_x_tfidf_reshaped`, which no cell defines;
        # the feature count is the same on the un-reshaped matrix.
        input_shape = (train_x_tfidf.shape[1], 1)

    model = Sequential([
        Conv1D(128, 3, activation='relu', input_shape=input_shape),
        MaxPooling1D(2),
        Conv1D(64, 3, activation='relu'),
        MaxPooling1D(2),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Conv1D consumes (samples, steps, channels). The *_reshaped arrays used below
# were never defined anywhere in the notebook (NameError on a fresh kernel), so
# derive them here by adding a trailing channel axis to the dense TF-IDF matrices.
train_x_tfidf_reshaped = train_x_tfidf[..., np.newaxis]
valid_x_tfidf_reshaped = valid_x_tfidf[..., np.newaxis]
test_x_tfidf_reshaped = test_x_tfidf[..., np.newaxis]

# K-Fold Cross-Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Store overall predictions and true labels for validation and test sets.
# Validation rows are out-of-fold; the test set is predicted once per fold and
# all five sets of predictions are pooled.
overall_valid_true, overall_valid_pred = [], []
overall_test_true, overall_test_pred = [], []

fold = 1

for train_index, val_index in kf.split(train_x_tfidf_reshaped):
    print(f"\n\n--- Fold {fold} ---")

    # Split data for the current fold
    X_train, X_val = train_x_tfidf_reshaped[train_index], train_x_tfidf_reshaped[val_index]
    y_train, y_val = train_y[train_index], train_y[val_index]

    # Build and train a fresh model; the dev split is only monitored during fitting
    model_cnn_kfold = build_model()
    model_cnn_kfold.fit(X_train, y_train, validation_data=(valid_x_tfidf_reshaped, valid_y), epochs=5, verbose=1)

    # Validation predictions (sigmoid output thresholded at 0.5)
    val_pred_y = (model_cnn_kfold.predict(X_val) > 0.5).astype("int32").flatten()
    overall_valid_true.extend(y_val)
    overall_valid_pred.extend(val_pred_y)

    # Test predictions
    test_pred_y = (model_cnn_kfold.predict(test_x_tfidf_reshaped) > 0.5).astype("int32").flatten()
    overall_test_true.extend(test_y)
    overall_test_pred.extend(test_pred_y)

    fold += 1

# Compute overall validation metrics
overall_valid_accuracy = accuracy_score(overall_valid_true, overall_valid_pred)
overall_valid_f1 = f1_score(overall_valid_true, overall_valid_pred)
overall_valid_mcc = matthews_corrcoef(overall_valid_true, overall_valid_pred)
overall_valid_cmd = confusion_matrix(overall_valid_true, overall_valid_pred)
overall_valid_report = classification_report(overall_valid_true, overall_valid_pred)

# Compute overall test metrics
overall_test_accuracy = accuracy_score(overall_test_true, overall_test_pred)
overall_test_f1 = f1_score(overall_test_true, overall_test_pred)
overall_test_mcc = matthews_corrcoef(overall_test_true, overall_test_pred)
overall_test_cmd = confusion_matrix(overall_test_true, overall_test_pred)
overall_test_report = classification_report(overall_test_true, overall_test_pred)

# Display overall validation metrics
print("\n\n--- Overall Validation Metrics ---")
print(f"Accuracy: {overall_valid_accuracy}")
print(f"F1 Score: {overall_valid_f1}")
print(f"MCC: {overall_valid_mcc}")
print("Confusion Matrix:\n", overall_valid_cmd)
print("Classification Report:\n", overall_valid_report)

# Display overall test metrics
print("\n\n--- Overall Test Metrics ---")
print(f"Accuracy: {overall_test_accuracy}")
print(f"F1 Score: {overall_test_f1}")
print(f"MCC: {overall_test_mcc}")
print("Confusion Matrix:\n", overall_test_cmd)
print("Classification Report:\n", overall_test_report)

# Plot confusion matrices
ConfusionMatrixDisplay(confusion_matrix=overall_valid_cmd, display_labels=['Human', 'Generated']).plot()
ConfusionMatrixDisplay(confusion_matrix=overall_test_cmd, display_labels=['Human', 'Generated']).plot()

# Only the model trained on the last fold is persisted
model_cnn_kfold.save('model_CNN_Kfold.h5')

In [None]:
def build_lstm_model():
    """Return a compiled dense binary classifier over the TF-IDF features.

    Despite the function name, the network contains no recurrent layer: it is
    two ReLU dense layers (128 and 64 units), each followed by dropout of 0.5,
    and a single sigmoid output. The input width is read from the global
    ``train_x_tfidf``. Compiled with Adam and binary cross-entropy.
    """
    stack = [
        Dense(128, activation='relu', input_shape=(train_x_tfidf.shape[1],)),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(stack)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Shuffled 5-fold split; the fixed seed keeps fold membership reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Labels and predictions pooled over all folds: out-of-fold rows for
# validation, the whole test set once per fold for test
overall_valid_true, overall_valid_pred = [], []
overall_test_true, overall_test_pred = [], []

fold = 1

for train_index, val_index in kf.split(train_x_tfidf):
    print(f"\n\n--- Fold {fold} ---")

    # Rows of the TF-IDF matrix (and their labels) belonging to this fold
    X_train = train_x_tfidf[train_index]
    X_val = train_x_tfidf[val_index]
    y_train = train_y[train_index]
    y_val = train_y[val_index]

    # Fresh network per fold; the dev split is only monitored during fitting
    model = build_lstm_model()
    model.fit(X_train, y_train, validation_data=(valid_x_tfidf, valid_y), epochs=5, verbose=1)

    # Held-out fold: threshold the sigmoid output at 0.5
    val_pred_y = (model.predict(X_val) > 0.5).astype("int32").flatten()
    overall_valid_true.extend(y_val)
    overall_valid_pred.extend(val_pred_y)

    # Test set: same thresholding
    test_pred_y = (model.predict(test_x_tfidf) > 0.5).astype("int32").flatten()
    overall_test_true.extend(test_y)
    overall_test_pred.extend(test_pred_y)

    fold += 1

# Pooled validation metrics
(overall_valid_accuracy, overall_valid_f1, overall_valid_mcc,
 overall_valid_cmd, overall_valid_report) = [
    metric(overall_valid_true, overall_valid_pred)
    for metric in (accuracy_score, f1_score, matthews_corrcoef,
                   confusion_matrix, classification_report)
]

# Pooled test metrics
(overall_test_accuracy, overall_test_f1, overall_test_mcc,
 overall_test_cmd, overall_test_report) = [
    metric(overall_test_true, overall_test_pred)
    for metric in (accuracy_score, f1_score, matthews_corrcoef,
                   confusion_matrix, classification_report)
]

# Validation summary
print("\n\n--- Overall Validation Metrics ---")
print(f"Accuracy: {overall_valid_accuracy}")
print(f"F1 Score: {overall_valid_f1}")
print(f"MCC: {overall_valid_mcc}")
print("Confusion Matrix:\n", overall_valid_cmd)
print("Classification Report:\n", overall_valid_report)

# Test summary
print("\n\n--- Overall Test Metrics ---")
print(f"Accuracy: {overall_test_accuracy}")
print(f"F1 Score: {overall_test_f1}")
print(f"MCC: {overall_test_mcc}")
print("Confusion Matrix:\n", overall_test_cmd)
print("Classification Report:\n", overall_test_report)

# Confusion-matrix plots (validation first, then test)
ConfusionMatrixDisplay(confusion_matrix=overall_valid_cmd, display_labels=['Human', 'Generated']).plot()
ConfusionMatrixDisplay(confusion_matrix=overall_test_cmd, display_labels=['Human', 'Generated']).plot()

In [None]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, f1_score, matthews_corrcoef, 
    confusion_matrix, classification_report, ConfusionMatrixDisplay
)

def build_rnn_model():
    """Return a compiled dense binary classifier over the TF-IDF features.

    Despite the function name, the network contains no recurrent layer: it is
    a stack of ReLU dense layers of width 512, 256, 128, 64 and 32 followed by
    a single sigmoid output. The input width is read from the global
    ``train_x_tfidf``. Compiled with binary cross-entropy and Adam at a
    learning rate of 1e-4.
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=(train_x_tfidf.shape[1],)))

    # Hidden layers, widest first
    for units in (512, 256, 128, 64, 32):
        model.add(tf.keras.layers.Dense(units, activation='relu'))

    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(1e-4),
        metrics=['accuracy']
    )
    return model

# Shuffled 5-fold split; the fixed seed keeps fold membership reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Labels and predictions pooled over all folds: out-of-fold rows for
# validation, the whole test set once per fold for test
overall_valid_true, overall_valid_pred = [], []
overall_test_true, overall_test_pred = [], []

fold = 1

for train_index, val_index in kf.split(train_x_tfidf):
    print(f"\n\n--- Fold {fold} ---")

    # Rows of the TF-IDF matrix (and their labels) belonging to this fold
    X_train = train_x_tfidf[train_index]
    X_val = train_x_tfidf[val_index]
    y_train = train_y[train_index]
    y_val = train_y[val_index]

    # Fresh network per fold; training stops early on the dev-split loss and
    # the best weights seen are restored
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    model = build_rnn_model()
    model.fit(
        X_train,
        y_train,
        epochs=5,
        validation_data=(valid_x_tfidf, valid_y),
        callbacks=[early_stopping]
    )

    # Held-out fold: threshold the sigmoid output at 0.5
    val_pred_prob = model.predict(X_val)
    val_pred_y = (val_pred_prob > 0.5).astype(int).flatten()
    overall_valid_true.extend(y_val)
    overall_valid_pred.extend(val_pred_y)

    # Test set: same thresholding
    test_pred_prob = model.predict(test_x_tfidf)
    test_pred_y = (test_pred_prob > 0.5).astype(int).flatten()
    overall_test_true.extend(test_y)
    overall_test_pred.extend(test_pred_y)

    fold += 1

# Pooled validation metrics
(overall_valid_accuracy, overall_valid_f1, overall_valid_mcc,
 overall_valid_cmd, overall_valid_report) = [
    metric(overall_valid_true, overall_valid_pred)
    for metric in (accuracy_score, f1_score, matthews_corrcoef,
                   confusion_matrix, classification_report)
]

# Pooled test metrics
(overall_test_accuracy, overall_test_f1, overall_test_mcc,
 overall_test_cmd, overall_test_report) = [
    metric(overall_test_true, overall_test_pred)
    for metric in (accuracy_score, f1_score, matthews_corrcoef,
                   confusion_matrix, classification_report)
]

# Validation summary
print("\n\n--- Overall Validation Metrics ---")
print(f"Accuracy: {overall_valid_accuracy}")
print(f"F1 Score: {overall_valid_f1}")
print(f"MCC: {overall_valid_mcc}")
print("Confusion Matrix:\n", overall_valid_cmd)
print("Classification Report:\n", overall_valid_report)

# Test summary
print("\n\n--- Overall Test Metrics ---")
print(f"Accuracy: {overall_test_accuracy}")
print(f"F1 Score: {overall_test_f1}")
print(f"MCC: {overall_test_mcc}")
print("Confusion Matrix:\n", overall_test_cmd)
print("Classification Report:\n", overall_test_report)

# Confusion-matrix plots (validation first, then test)
ConfusionMatrixDisplay(confusion_matrix=overall_valid_cmd, display_labels=['Human', 'Generated']).plot()
ConfusionMatrixDisplay(confusion_matrix=overall_test_cmd, display_labels=['Human', 'Generated']).plot()

In [None]:
import numpy as np
from sklearn import svm
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, f1_score, matthews_corrcoef, 
    confusion_matrix, classification_report, ConfusionMatrixDisplay
)

# Shuffled 5-fold split; the fixed seed keeps fold membership reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Labels and predictions pooled over all folds: out-of-fold rows for
# validation, the whole test set once per fold for test
overall_valid_true, overall_valid_pred = [], []
overall_test_true, overall_test_pred = [], []

fold = 1

for train_index, val_index in kf.split(train_x_tfidf):
    print(f"\n\n--- Fold {fold} ---")

    # Rows of the TF-IDF matrix (and their labels) belonging to this fold
    X_train = train_x_tfidf[train_index]
    X_val = train_x_tfidf[val_index]
    y_train = train_y[train_index]
    y_val = train_y[val_index]

    # Fresh linear-kernel SVM per fold
    svm_model = svm.SVC(kernel='linear', probability=True)
    svm_model.fit(X_train, y_train)

    # Held-out fold predictions
    val_pred_y = svm_model.predict(X_val)
    overall_valid_true.extend(y_val)
    overall_valid_pred.extend(val_pred_y)

    # Test-set predictions
    test_pred_y = svm_model.predict(test_x_tfidf)
    overall_test_true.extend(test_y)
    overall_test_pred.extend(test_pred_y)

    fold += 1

# Pooled validation metrics
(overall_valid_accuracy, overall_valid_f1, overall_valid_mcc,
 overall_valid_cmd, overall_valid_report) = [
    metric(overall_valid_true, overall_valid_pred)
    for metric in (accuracy_score, f1_score, matthews_corrcoef,
                   confusion_matrix, classification_report)
]

# Pooled test metrics
(overall_test_accuracy, overall_test_f1, overall_test_mcc,
 overall_test_cmd, overall_test_report) = [
    metric(overall_test_true, overall_test_pred)
    for metric in (accuracy_score, f1_score, matthews_corrcoef,
                   confusion_matrix, classification_report)
]

# Validation summary
print("\n\n--- Overall Validation Metrics ---")
print(f"Accuracy: {overall_valid_accuracy}")
print(f"F1 Score: {overall_valid_f1}")
print(f"MCC: {overall_valid_mcc}")
print("Confusion Matrix:\n", overall_valid_cmd)
print("Classification Report:\n", overall_valid_report)

# Test summary
print("\n\n--- Overall Test Metrics ---")
print(f"Accuracy: {overall_test_accuracy}")
print(f"F1 Score: {overall_test_f1}")
print(f"MCC: {overall_test_mcc}")
print("Confusion Matrix:\n", overall_test_cmd)
print("Classification Report:\n", overall_test_report)

# Confusion-matrix plots (validation first, then test)
ConfusionMatrixDisplay(confusion_matrix=overall_valid_cmd, display_labels=['Human', 'Generated']).plot()
ConfusionMatrixDisplay(confusion_matrix=overall_test_cmd, display_labels=['Human', 'Generated']).plot()

# Persist the estimator fitted on the last fold
joblib.dump(svm_model, 'svm_model_Kfold.joblib')

In [None]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, f1_score, matthews_corrcoef, 
    confusion_matrix, classification_report, ConfusionMatrixDisplay
)

# Shuffled 5-fold split; the fixed seed keeps fold membership reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Labels and predictions pooled over all folds: out-of-fold rows for
# validation, the whole test set once per fold for test
overall_valid_true, overall_valid_pred = [], []
overall_test_true, overall_test_pred = [], []

fold = 1

for train_index, val_index in kf.split(train_x_tfidf):
    print(f"\n\n--- Fold {fold} ---")

    # Rows of the TF-IDF matrix (and their labels) belonging to this fold
    X_train = train_x_tfidf[train_index]
    X_val = train_x_tfidf[val_index]
    y_train = train_y[train_index]
    y_val = train_y[val_index]

    # Fresh 5-nearest-neighbours classifier per fold
    knn_model = KNeighborsClassifier(n_neighbors=5)
    knn_model.fit(X_train, y_train)

    # Held-out fold predictions
    val_pred_y = knn_model.predict(X_val)
    overall_valid_true.extend(y_val)
    overall_valid_pred.extend(val_pred_y)

    # Test-set predictions
    test_pred_y = knn_model.predict(test_x_tfidf)
    overall_test_true.extend(test_y)
    overall_test_pred.extend(test_pred_y)

    fold += 1

# Pooled validation metrics
(overall_valid_accuracy, overall_valid_f1, overall_valid_mcc,
 overall_valid_cmd, overall_valid_report) = [
    metric(overall_valid_true, overall_valid_pred)
    for metric in (accuracy_score, f1_score, matthews_corrcoef,
                   confusion_matrix, classification_report)
]

# Pooled test metrics
(overall_test_accuracy, overall_test_f1, overall_test_mcc,
 overall_test_cmd, overall_test_report) = [
    metric(overall_test_true, overall_test_pred)
    for metric in (accuracy_score, f1_score, matthews_corrcoef,
                   confusion_matrix, classification_report)
]

# Validation summary
print("\n\n--- Overall Validation Metrics ---")
print(f"Accuracy: {overall_valid_accuracy}")
print(f"F1 Score: {overall_valid_f1}")
print(f"MCC: {overall_valid_mcc}")
print("Confusion Matrix:\n", overall_valid_cmd)
print("Classification Report:\n", overall_valid_report)

# Test summary
print("\n\n--- Overall Test Metrics ---")
print(f"Accuracy: {overall_test_accuracy}")
print(f"F1 Score: {overall_test_f1}")
print(f"MCC: {overall_test_mcc}")
print("Confusion Matrix:\n", overall_test_cmd)
print("Classification Report:\n", overall_test_report)

# Confusion-matrix plots (validation first, then test)
ConfusionMatrixDisplay(confusion_matrix=overall_valid_cmd, display_labels=['Human', 'Generated']).plot()
ConfusionMatrixDisplay(confusion_matrix=overall_test_cmd, display_labels=['Human', 'Generated']).plot()

# Persist the estimator fitted on the last fold
joblib.dump(knn_model, 'knn_model_Kfold.joblib')

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, f1_score, matthews_corrcoef, 
    confusion_matrix, classification_report, ConfusionMatrixDisplay
)

# Shuffled 5-fold split; the fixed seed keeps fold membership reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Labels and predictions pooled over all folds: out-of-fold rows for
# validation, the whole test set once per fold for test
overall_valid_true, overall_valid_pred = [], []
overall_test_true, overall_test_pred = [], []

fold = 1

for train_index, val_index in kf.split(train_x_tfidf):
    print(f"\n\n--- Fold {fold} ---")

    # Rows of the TF-IDF matrix (and their labels) belonging to this fold
    X_train = train_x_tfidf[train_index]
    X_val = train_x_tfidf[val_index]
    y_train = train_y[train_index]
    y_val = train_y[val_index]

    # Fresh, seeded decision tree per fold
    dt_model = DecisionTreeClassifier(random_state=42)
    dt_model.fit(X_train, y_train)

    # Held-out fold predictions
    val_pred_y = dt_model.predict(X_val)
    overall_valid_true.extend(y_val)
    overall_valid_pred.extend(val_pred_y)

    # Test-set predictions
    test_pred_y = dt_model.predict(test_x_tfidf)
    overall_test_true.extend(test_y)
    overall_test_pred.extend(test_pred_y)

    fold += 1

# Pooled validation metrics
(overall_valid_accuracy, overall_valid_f1, overall_valid_mcc,
 overall_valid_cmd, overall_valid_report) = [
    metric(overall_valid_true, overall_valid_pred)
    for metric in (accuracy_score, f1_score, matthews_corrcoef,
                   confusion_matrix, classification_report)
]

# Pooled test metrics
(overall_test_accuracy, overall_test_f1, overall_test_mcc,
 overall_test_cmd, overall_test_report) = [
    metric(overall_test_true, overall_test_pred)
    for metric in (accuracy_score, f1_score, matthews_corrcoef,
                   confusion_matrix, classification_report)
]

# Validation summary
print("\n\n--- Overall Validation Metrics ---")
print(f"Accuracy: {overall_valid_accuracy}")
print(f"F1 Score: {overall_valid_f1}")
print(f"MCC: {overall_valid_mcc}")
print("Confusion Matrix:\n", overall_valid_cmd)
print("Classification Report:\n", overall_valid_report)

# Test summary
print("\n\n--- Overall Test Metrics ---")
print(f"Accuracy: {overall_test_accuracy}")
print(f"F1 Score: {overall_test_f1}")
print(f"MCC: {overall_test_mcc}")
print("Confusion Matrix:\n", overall_test_cmd)
print("Classification Report:\n", overall_test_report)

# Confusion-matrix plots (validation first, then test)
ConfusionMatrixDisplay(confusion_matrix=overall_valid_cmd, display_labels=['Human', 'Generated']).plot()
ConfusionMatrixDisplay(confusion_matrix=overall_test_cmd, display_labels=['Human', 'Generated']).plot()

# Persist the estimator fitted on the last fold
joblib.dump(dt_model, 'dt_model_Kfold.joblib')

In [None]:
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, f1_score, matthews_corrcoef, 
    confusion_matrix, classification_report, ConfusionMatrixDisplay
)

# Shuffled 5-fold split; the fixed seed keeps fold membership reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Labels and predictions pooled over all folds: out-of-fold rows for
# validation, the whole test set once per fold for test
overall_valid_true, overall_valid_pred = [], []
overall_test_true, overall_test_pred = [], []

fold = 1

for train_index, val_index in kf.split(train_x_tfidf):
    print(f"\n\n--- Fold {fold} ---")

    # Rows of the TF-IDF matrix (and their labels) belonging to this fold
    X_train = train_x_tfidf[train_index]
    X_val = train_x_tfidf[val_index]
    y_train = train_y[train_index]
    y_val = train_y[val_index]

    # Fresh AdaBoost ensemble per fold: 100 boosted depth-1 trees (stumps)
    dt_stump = DecisionTreeClassifier(max_depth=1, random_state=42)
    ada_model = AdaBoostClassifier(estimator=dt_stump, n_estimators=100, algorithm='SAMME', random_state=42)
    ada_model.fit(X_train, y_train)

    # Held-out fold predictions
    val_pred_y = ada_model.predict(X_val)
    overall_valid_true.extend(y_val)
    overall_valid_pred.extend(val_pred_y)

    # Test-set predictions
    test_pred_y = ada_model.predict(test_x_tfidf)
    overall_test_true.extend(test_y)
    overall_test_pred.extend(test_pred_y)

    fold += 1

# Pooled validation metrics
(overall_valid_accuracy, overall_valid_f1, overall_valid_mcc,
 overall_valid_cmd, overall_valid_report) = [
    metric(overall_valid_true, overall_valid_pred)
    for metric in (accuracy_score, f1_score, matthews_corrcoef,
                   confusion_matrix, classification_report)
]

# Pooled test metrics
(overall_test_accuracy, overall_test_f1, overall_test_mcc,
 overall_test_cmd, overall_test_report) = [
    metric(overall_test_true, overall_test_pred)
    for metric in (accuracy_score, f1_score, matthews_corrcoef,
                   confusion_matrix, classification_report)
]

# Validation summary
print("\n\n--- Overall Validation Metrics ---")
print(f"Accuracy: {overall_valid_accuracy}")
print(f"F1 Score: {overall_valid_f1}")
print(f"MCC: {overall_valid_mcc}")
print("Confusion Matrix:\n", overall_valid_cmd)
print("Classification Report:\n", overall_valid_report)

# Test summary
print("\n\n--- Overall Test Metrics ---")
print(f"Accuracy: {overall_test_accuracy}")
print(f"F1 Score: {overall_test_f1}")
print(f"MCC: {overall_test_mcc}")
print("Confusion Matrix:\n", overall_test_cmd)
print("Classification Report:\n", overall_test_report)

# Confusion-matrix plots (validation first, then test)
ConfusionMatrixDisplay(confusion_matrix=overall_valid_cmd, display_labels=['Human', 'Generated']).plot()
ConfusionMatrixDisplay(confusion_matrix=overall_test_cmd, display_labels=['Human', 'Generated']).plot()

# Persist the estimator fitted on the last fold
joblib.dump(ada_model, 'ada_model_Kfold.joblib')

In [None]:
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, f1_score, matthews_corrcoef, 
    confusion_matrix, classification_report, ConfusionMatrixDisplay
)

# Shuffled 5-fold split; the fixed seed keeps fold membership reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Labels and predictions pooled over all folds: out-of-fold rows for
# validation, the whole test set once per fold for test
overall_valid_true, overall_valid_pred = [], []
overall_test_true, overall_test_pred = [], []

fold = 1

for train_index, val_index in kf.split(train_x_tfidf):
    print(f"\n\n--- Fold {fold} ---")

    # Rows of the TF-IDF matrix (and their labels) belonging to this fold
    X_train = train_x_tfidf[train_index]
    X_val = train_x_tfidf[val_index]
    y_train = train_y[train_index]
    y_val = train_y[val_index]

    # Fresh bagging ensemble per fold: 100 decision trees
    base_estimator = DecisionTreeClassifier()
    bagging_model = BaggingClassifier(estimator=base_estimator, n_estimators=100, random_state=42)
    bagging_model.fit(X_train, y_train)

    # Held-out fold predictions
    val_pred_y = bagging_model.predict(X_val)
    overall_valid_true.extend(y_val)
    overall_valid_pred.extend(val_pred_y)

    # Test-set predictions
    test_pred_y = bagging_model.predict(test_x_tfidf)
    overall_test_true.extend(test_y)
    overall_test_pred.extend(test_pred_y)

    fold += 1

# Pooled validation metrics
(overall_valid_accuracy, overall_valid_f1, overall_valid_mcc,
 overall_valid_cmd, overall_valid_report) = [
    metric(overall_valid_true, overall_valid_pred)
    for metric in (accuracy_score, f1_score, matthews_corrcoef,
                   confusion_matrix, classification_report)
]

# Pooled test metrics
(overall_test_accuracy, overall_test_f1, overall_test_mcc,
 overall_test_cmd, overall_test_report) = [
    metric(overall_test_true, overall_test_pred)
    for metric in (accuracy_score, f1_score, matthews_corrcoef,
                   confusion_matrix, classification_report)
]

# Validation summary
print("\n\n--- Overall Validation Metrics ---")
print(f"Accuracy: {overall_valid_accuracy}")
print(f"F1 Score: {overall_valid_f1}")
print(f"MCC: {overall_valid_mcc}")
print("Confusion Matrix:\n", overall_valid_cmd)
print("Classification Report:\n", overall_valid_report)

# Test summary
print("\n\n--- Overall Test Metrics ---")
print(f"Accuracy: {overall_test_accuracy}")
print(f"F1 Score: {overall_test_f1}")
print(f"MCC: {overall_test_mcc}")
print("Confusion Matrix:\n", overall_test_cmd)
print("Classification Report:\n", overall_test_report)

# Confusion-matrix plots (validation first, then test)
ConfusionMatrixDisplay(confusion_matrix=overall_valid_cmd, display_labels=['Human', 'Generated']).plot()
ConfusionMatrixDisplay(confusion_matrix=overall_test_cmd, display_labels=['Human', 'Generated']).plot()

# Persist the estimator fitted on the last fold
joblib.dump(bagging_model, 'bagging_model_Kfold.joblib')

In [None]:
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, f1_score, matthews_corrcoef, 
    confusion_matrix, classification_report, ConfusionMatrixDisplay
)
import joblib

# Gradient Boosting: 5-fold cross-validation, then one final fit for the test set.
#
# Validation metrics are out-of-fold: each training row is predicted once, by the
# fold model that was fitted without it.
# Test metrics come from a single model refitted on all training data, so every test
# row is counted exactly once and the model written to disk is the one that was scored.
# (Previously the test set was predicted in every fold and test_y was appended each
# time, which multiplied the confusion-matrix counts by the number of folds, and the
# saved model was simply the last fold's.)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Hyper-parameters shared by the per-fold models and the final model.
gb_params = dict(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Pooled out-of-fold labels and predictions.
overall_valid_true, overall_valid_pred = [], []

# Perform K-Fold Cross-Validation
for fold, (train_index, val_index) in enumerate(kf.split(train_x_tfidf), start=1):
    print(f"\n\n--- Fold {fold} ---")

    # Split data for the current fold
    X_train, X_val = train_x_tfidf[train_index], train_x_tfidf[val_index]
    y_train, y_val = train_y[train_index], train_y[val_index]

    # Fit a fresh model on this fold's training rows only.
    fold_model = GradientBoostingClassifier(**gb_params)
    fold_model.fit(X_train, y_train)

    # Validation predictions
    val_pred_y = fold_model.predict(X_val)
    overall_valid_true.extend(y_val)
    overall_valid_pred.extend(val_pred_y)
    print(f"Validation accuracy: {accuracy_score(y_val, val_pred_y)}")

# Final model: refit on all training data and score the held-out test set once.
gradient_boosting_model = GradientBoostingClassifier(**gb_params)
gradient_boosting_model.fit(train_x_tfidf, train_y)
test_pred_y = gradient_boosting_model.predict(test_x_tfidf)
overall_test_true = list(test_y)
overall_test_pred = list(test_pred_y)

# Compute overall metrics for validation
overall_valid_accuracy = accuracy_score(overall_valid_true, overall_valid_pred)
overall_valid_f1 = f1_score(overall_valid_true, overall_valid_pred)
overall_valid_mcc = matthews_corrcoef(overall_valid_true, overall_valid_pred)
overall_valid_cmd = confusion_matrix(overall_valid_true, overall_valid_pred)
overall_valid_report = classification_report(overall_valid_true, overall_valid_pred)

# Compute overall metrics for test
overall_test_accuracy = accuracy_score(overall_test_true, overall_test_pred)
overall_test_f1 = f1_score(overall_test_true, overall_test_pred)
overall_test_mcc = matthews_corrcoef(overall_test_true, overall_test_pred)
overall_test_cmd = confusion_matrix(overall_test_true, overall_test_pred)
overall_test_report = classification_report(overall_test_true, overall_test_pred)

# Display overall validation metrics
print("\n\n--- Overall Validation Metrics ---")
print(f"Accuracy: {overall_valid_accuracy}")
print(f"F1 Score: {overall_valid_f1}")
print(f"MCC: {overall_valid_mcc}")
print("Confusion Matrix:\n", overall_valid_cmd)
print("Classification Report:\n", overall_valid_report)

# Display overall test metrics
print("\n\n--- Overall Test Metrics ---")
print(f"Accuracy: {overall_test_accuracy}")
print(f"F1 Score: {overall_test_f1}")
print(f"MCC: {overall_test_mcc}")
print("Confusion Matrix:\n", overall_test_cmd)
print("Classification Report:\n", overall_test_report)

# Plot confusion matrices, titled so each figure stands alone.
# assumes the smaller label value means 'Human' and the larger 'Generated' — TODO confirm
valid_cm_display = ConfusionMatrixDisplay(confusion_matrix=overall_valid_cmd, display_labels=['Human', 'Generated']).plot()
valid_cm_display.ax_.set_title("Gradient Boosting - validation (out-of-fold)")
test_cm_display = ConfusionMatrixDisplay(confusion_matrix=overall_test_cmd, display_labels=['Human', 'Generated']).plot()
test_cm_display.ax_.set_title("Gradient Boosting - test")

# Save the final model (the one evaluated on the test set above)
joblib.dump(gradient_boosting_model, 'gradient_boosting_model_Kfold.joblib')

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, f1_score, matthews_corrcoef, 
    confusion_matrix, classification_report, ConfusionMatrixDisplay
)
import joblib

# Random Forest: 5-fold cross-validation, then one final fit for the test set.
#
# Validation metrics are out-of-fold: each training row is predicted once, by the
# fold model that was fitted without it.
# Test metrics come from a single model refitted on all training data, so every test
# row is counted exactly once and the model written to disk is the one that was scored.
# (Previously the test set was predicted in every fold and test_y was appended each
# time, which multiplied the confusion-matrix counts by the number of folds, and the
# saved model was simply the last fold's.)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Hyper-parameters shared by the per-fold models and the final model.
rf_params = dict(n_estimators=100, random_state=42, n_jobs=-1)

# Pooled out-of-fold labels and predictions.
overall_valid_true, overall_valid_pred = [], []

# Perform K-Fold Cross-Validation
for fold, (train_index, val_index) in enumerate(kf.split(train_x_tfidf), start=1):
    print(f"\n\n--- Fold {fold} ---")

    # Split data for the current fold
    X_train, X_val = train_x_tfidf[train_index], train_x_tfidf[val_index]
    y_train, y_val = train_y[train_index], train_y[val_index]

    # Fit a fresh model on this fold's training rows only.
    fold_model = RandomForestClassifier(**rf_params)
    fold_model.fit(X_train, y_train)

    # Validation predictions
    val_pred_y = fold_model.predict(X_val)
    overall_valid_true.extend(y_val)
    overall_valid_pred.extend(val_pred_y)
    print(f"Validation accuracy: {accuracy_score(y_val, val_pred_y)}")

# Final model: refit on all training data and score the held-out test set once.
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(train_x_tfidf, train_y)
test_pred_y = rf_model.predict(test_x_tfidf)
overall_test_true = list(test_y)
overall_test_pred = list(test_pred_y)

# Compute overall metrics for validation
overall_valid_accuracy = accuracy_score(overall_valid_true, overall_valid_pred)
overall_valid_f1 = f1_score(overall_valid_true, overall_valid_pred)
overall_valid_mcc = matthews_corrcoef(overall_valid_true, overall_valid_pred)
overall_valid_cmd = confusion_matrix(overall_valid_true, overall_valid_pred)
overall_valid_report = classification_report(overall_valid_true, overall_valid_pred)

# Compute overall metrics for test
overall_test_accuracy = accuracy_score(overall_test_true, overall_test_pred)
overall_test_f1 = f1_score(overall_test_true, overall_test_pred)
overall_test_mcc = matthews_corrcoef(overall_test_true, overall_test_pred)
overall_test_cmd = confusion_matrix(overall_test_true, overall_test_pred)
overall_test_report = classification_report(overall_test_true, overall_test_pred)

# Display overall validation metrics
print("\n\n--- Overall Validation Metrics ---")
print(f"Accuracy: {overall_valid_accuracy}")
print(f"F1 Score: {overall_valid_f1}")
print(f"MCC: {overall_valid_mcc}")
print("Confusion Matrix:\n", overall_valid_cmd)
print("Classification Report:\n", overall_valid_report)

# Display overall test metrics
print("\n\n--- Overall Test Metrics ---")
print(f"Accuracy: {overall_test_accuracy}")
print(f"F1 Score: {overall_test_f1}")
print(f"MCC: {overall_test_mcc}")
print("Confusion Matrix:\n", overall_test_cmd)
print("Classification Report:\n", overall_test_report)

# Plot confusion matrices, titled so each figure stands alone.
# assumes the smaller label value means 'Human' and the larger 'Generated' — TODO confirm
valid_cm_display = ConfusionMatrixDisplay(confusion_matrix=overall_valid_cmd, display_labels=['Human', 'Generated']).plot()
valid_cm_display.ax_.set_title("Random Forest - validation (out-of-fold)")
test_cm_display = ConfusionMatrixDisplay(confusion_matrix=overall_test_cmd, display_labels=['Human', 'Generated']).plot()
test_cm_display.ax_.set_title("Random Forest - test")

# Save the final model (the one evaluated on the test set above)
joblib.dump(rf_model, 'random_forest_model_Kfold.joblib')

In [None]:
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, f1_score, matthews_corrcoef, 
    confusion_matrix, classification_report, ConfusionMatrixDisplay
)
import joblib

# Extra Trees: 5-fold cross-validation, then one final fit for the test set.
#
# Validation metrics are out-of-fold: each training row is predicted once, by the
# fold model that was fitted without it.
# Test metrics come from a single model refitted on all training data, so every test
# row is counted exactly once and the model written to disk is the one that was scored.
# (Previously the test set was predicted in every fold and test_y was appended each
# time, which multiplied the confusion-matrix counts by the number of folds, and the
# saved model was simply the last fold's.)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Hyper-parameters shared by the per-fold models and the final model.
extra_trees_params = dict(n_estimators=100, random_state=42)

# Pooled out-of-fold labels and predictions.
overall_valid_true, overall_valid_pred = [], []

# Perform K-Fold Cross-Validation
for fold, (train_index, val_index) in enumerate(kf.split(train_x_tfidf), start=1):
    print(f"\n\n--- Fold {fold} ---")

    # Split data for the current fold
    X_train, X_val = train_x_tfidf[train_index], train_x_tfidf[val_index]
    y_train, y_val = train_y[train_index], train_y[val_index]

    # Fit a fresh model on this fold's training rows only.
    fold_model = ExtraTreesClassifier(**extra_trees_params)
    fold_model.fit(X_train, y_train)

    # Validation predictions
    val_pred_y = fold_model.predict(X_val)
    overall_valid_true.extend(y_val)
    overall_valid_pred.extend(val_pred_y)
    print(f"Validation accuracy: {accuracy_score(y_val, val_pred_y)}")

# Final model: refit on all training data and score the held-out test set once.
extra_trees_model = ExtraTreesClassifier(**extra_trees_params)
extra_trees_model.fit(train_x_tfidf, train_y)
test_pred_y = extra_trees_model.predict(test_x_tfidf)
overall_test_true = list(test_y)
overall_test_pred = list(test_pred_y)

# Compute overall metrics for validation
overall_valid_accuracy = accuracy_score(overall_valid_true, overall_valid_pred)
overall_valid_f1 = f1_score(overall_valid_true, overall_valid_pred)
overall_valid_mcc = matthews_corrcoef(overall_valid_true, overall_valid_pred)
overall_valid_cmd = confusion_matrix(overall_valid_true, overall_valid_pred)
overall_valid_report = classification_report(overall_valid_true, overall_valid_pred)

# Compute overall metrics for test
overall_test_accuracy = accuracy_score(overall_test_true, overall_test_pred)
overall_test_f1 = f1_score(overall_test_true, overall_test_pred)
overall_test_mcc = matthews_corrcoef(overall_test_true, overall_test_pred)
overall_test_cmd = confusion_matrix(overall_test_true, overall_test_pred)
overall_test_report = classification_report(overall_test_true, overall_test_pred)

# Display overall validation metrics
print("\n\n--- Overall Validation Metrics ---")
print(f"Accuracy: {overall_valid_accuracy}")
print(f"F1 Score: {overall_valid_f1}")
print(f"MCC: {overall_valid_mcc}")
print("Confusion Matrix:\n", overall_valid_cmd)
print("Classification Report:\n", overall_valid_report)

# Display overall test metrics
print("\n\n--- Overall Test Metrics ---")
print(f"Accuracy: {overall_test_accuracy}")
print(f"F1 Score: {overall_test_f1}")
print(f"MCC: {overall_test_mcc}")
print("Confusion Matrix:\n", overall_test_cmd)
print("Classification Report:\n", overall_test_report)

# Plot confusion matrices, titled so each figure stands alone.
# assumes the smaller label value means 'Human' and the larger 'Generated' — TODO confirm
valid_cm_display = ConfusionMatrixDisplay(confusion_matrix=overall_valid_cmd, display_labels=['Human', 'Generated']).plot()
valid_cm_display.ax_.set_title("Extra Trees - validation (out-of-fold)")
test_cm_display = ConfusionMatrixDisplay(confusion_matrix=overall_test_cmd, display_labels=['Human', 'Generated']).plot()
test_cm_display.ax_.set_title("Extra Trees - test")

# Save the final model (the one evaluated on the test set above)
joblib.dump(extra_trees_model, 'extra_trees_model_Kfold.joblib')

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, f1_score, matthews_corrcoef, 
    confusion_matrix, classification_report, ConfusionMatrixDisplay
)
import joblib

# Logistic Regression: 5-fold cross-validation, then one final fit for the test set.
#
# Validation metrics are out-of-fold: each training row is predicted once, by the
# fold model that was fitted without it.
# Test metrics come from a single model refitted on all training data, so every test
# row is counted exactly once and the model written to disk is the one that was scored.
# (Previously the test set was predicted in every fold and test_y was appended each
# time, which multiplied the confusion-matrix counts by the number of folds, and the
# saved model was simply the last fold's.)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Hyper-parameters shared by the per-fold models and the final model.
log_reg_params = dict(max_iter=1000)

# Pooled out-of-fold labels and predictions.
overall_valid_true, overall_valid_pred = [], []

# Perform K-Fold Cross-Validation
for fold, (train_index, val_index) in enumerate(kf.split(train_x_tfidf), start=1):
    print(f"\n\n--- Fold {fold} ---")

    # Split data for the current fold
    X_train, X_val = train_x_tfidf[train_index], train_x_tfidf[val_index]
    y_train, y_val = train_y[train_index], train_y[val_index]

    # Fit a fresh model on this fold's training rows only.
    fold_model = LogisticRegression(**log_reg_params)
    fold_model.fit(X_train, y_train)

    # Validation predictions
    val_pred_y = fold_model.predict(X_val)
    overall_valid_true.extend(y_val)
    overall_valid_pred.extend(val_pred_y)
    print(f"Validation accuracy: {accuracy_score(y_val, val_pred_y)}")

# Final model: refit on all training data and score the held-out test set once.
log_reg = LogisticRegression(**log_reg_params)
log_reg.fit(train_x_tfidf, train_y)
test_pred_y = log_reg.predict(test_x_tfidf)
overall_test_true = list(test_y)
overall_test_pred = list(test_pred_y)

# Compute overall metrics for validation
overall_valid_accuracy = accuracy_score(overall_valid_true, overall_valid_pred)
overall_valid_f1 = f1_score(overall_valid_true, overall_valid_pred)
overall_valid_mcc = matthews_corrcoef(overall_valid_true, overall_valid_pred)
overall_valid_cmd = confusion_matrix(overall_valid_true, overall_valid_pred)
overall_valid_report = classification_report(overall_valid_true, overall_valid_pred)

# Compute overall metrics for test
overall_test_accuracy = accuracy_score(overall_test_true, overall_test_pred)
overall_test_f1 = f1_score(overall_test_true, overall_test_pred)
overall_test_mcc = matthews_corrcoef(overall_test_true, overall_test_pred)
overall_test_cmd = confusion_matrix(overall_test_true, overall_test_pred)
overall_test_report = classification_report(overall_test_true, overall_test_pred)

# Display overall validation metrics
print("\n\n--- Overall Validation Metrics ---")
print(f"Accuracy: {overall_valid_accuracy}")
print(f"F1 Score: {overall_valid_f1}")
print(f"MCC: {overall_valid_mcc}")
print("Confusion Matrix:\n", overall_valid_cmd)
print("Classification Report:\n", overall_valid_report)

# Display overall test metrics
print("\n\n--- Overall Test Metrics ---")
print(f"Accuracy: {overall_test_accuracy}")
print(f"F1 Score: {overall_test_f1}")
print(f"MCC: {overall_test_mcc}")
print("Confusion Matrix:\n", overall_test_cmd)
print("Classification Report:\n", overall_test_report)

# Plot confusion matrices, titled so each figure stands alone.
# assumes the smaller label value means 'Human' and the larger 'Generated' — TODO confirm
valid_cm_display = ConfusionMatrixDisplay(confusion_matrix=overall_valid_cmd, display_labels=['Human', 'Generated']).plot()
valid_cm_display.ax_.set_title("Logistic Regression - validation (out-of-fold)")
test_cm_display = ConfusionMatrixDisplay(confusion_matrix=overall_test_cmd, display_labels=['Human', 'Generated']).plot()
test_cm_display.ax_.set_title("Logistic Regression - test")

# Save the final model (the one evaluated on the test set above)
joblib.dump(log_reg, 'logistic_regression_model_Kfold.joblib')

In [None]:
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, f1_score, matthews_corrcoef, 
    confusion_matrix, classification_report, ConfusionMatrixDisplay
)
import joblib

# MLP: 5-fold cross-validation, then one final fit for the test set.
#
# Validation metrics are out-of-fold: each training row is predicted once, by the
# fold model that was fitted without it.
# Test metrics come from a single model refitted on all training data, so every test
# row is counted exactly once and the model written to disk is the one that was scored.
# (Previously the test set was predicted in every fold and test_y was appended each
# time, which multiplied the confusion-matrix counts by the number of folds, and the
# saved model was simply the last fold's.)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Hyper-parameters shared by the per-fold models and the final model.
mlp_params = dict(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)

# Pooled out-of-fold labels and predictions.
overall_valid_true, overall_valid_pred = [], []

# Perform K-Fold Cross-Validation
for fold, (train_index, val_index) in enumerate(kf.split(train_x_tfidf), start=1):
    print(f"\n\n--- Fold {fold} ---")

    # Split data for the current fold
    X_train, X_val = train_x_tfidf[train_index], train_x_tfidf[val_index]
    y_train, y_val = train_y[train_index], train_y[val_index]

    # Fit a fresh model on this fold's training rows only.
    fold_model = MLPClassifier(**mlp_params)
    fold_model.fit(X_train, y_train)

    # Validation predictions
    val_pred_y = fold_model.predict(X_val)
    overall_valid_true.extend(y_val)
    overall_valid_pred.extend(val_pred_y)
    print(f"Validation accuracy: {accuracy_score(y_val, val_pred_y)}")

# Final model: refit on all training data and score the held-out test set once.
mlp_model = MLPClassifier(**mlp_params)
mlp_model.fit(train_x_tfidf, train_y)
test_pred_y = mlp_model.predict(test_x_tfidf)
overall_test_true = list(test_y)
overall_test_pred = list(test_pred_y)

# Compute overall metrics for validation
overall_valid_accuracy = accuracy_score(overall_valid_true, overall_valid_pred)
overall_valid_f1 = f1_score(overall_valid_true, overall_valid_pred)
overall_valid_mcc = matthews_corrcoef(overall_valid_true, overall_valid_pred)
overall_valid_cmd = confusion_matrix(overall_valid_true, overall_valid_pred)
overall_valid_report = classification_report(overall_valid_true, overall_valid_pred)

# Compute overall metrics for test
overall_test_accuracy = accuracy_score(overall_test_true, overall_test_pred)
overall_test_f1 = f1_score(overall_test_true, overall_test_pred)
overall_test_mcc = matthews_corrcoef(overall_test_true, overall_test_pred)
overall_test_cmd = confusion_matrix(overall_test_true, overall_test_pred)
overall_test_report = classification_report(overall_test_true, overall_test_pred)

# Display overall validation metrics
print("\n\n--- Overall Validation Metrics ---")
print(f"Accuracy: {overall_valid_accuracy}")
print(f"F1 Score: {overall_valid_f1}")
print(f"MCC: {overall_valid_mcc}")
print("Confusion Matrix:\n", overall_valid_cmd)
print("Classification Report:\n", overall_valid_report)

# Display overall test metrics
print("\n\n--- Overall Test Metrics ---")
print(f"Accuracy: {overall_test_accuracy}")
print(f"F1 Score: {overall_test_f1}")
print(f"MCC: {overall_test_mcc}")
print("Confusion Matrix:\n", overall_test_cmd)
print("Classification Report:\n", overall_test_report)

# Plot confusion matrices, titled so each figure stands alone.
# assumes the smaller label value means 'Human' and the larger 'Generated' — TODO confirm
valid_cm_display = ConfusionMatrixDisplay(confusion_matrix=overall_valid_cmd, display_labels=['Human', 'Generated']).plot()
valid_cm_display.ax_.set_title("MLP - validation (out-of-fold)")
test_cm_display = ConfusionMatrixDisplay(confusion_matrix=overall_test_cmd, display_labels=['Human', 'Generated']).plot()
test_cm_display.ax_.set_title("MLP - test")

# Save the final model (the one evaluated on the test set above)
joblib.dump(mlp_model, 'mlp_model_Kfold.joblib')

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, f1_score, matthews_corrcoef, 
    confusion_matrix, classification_report, ConfusionMatrixDisplay
)
import numpy as np
import joblib

# LDA as a classifier: 5-fold cross-validation, then one final fit for the test set.
#
# LDA is unsupervised: the index of a document's dominant topic is an arbitrary id in
# 0..N_TOPICS-1, not a class label, so it cannot be scored against train_y / test_y
# directly. Each topic is therefore mapped to the majority class of the training
# documents it dominates, and predictions are made through that mapping.
#
# Validation metrics are out-of-fold. Test metrics come from a single LDA model fitted
# on all training data, so every test row is counted once and the saved model is the
# one that was scored.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

N_TOPICS = 10
# Hyper-parameters shared by the per-fold models and the final model.
lda_params = dict(n_components=N_TOPICS, max_iter=10, random_state=42)


def _topic_label_map(dominant_topics, labels, n_topics):
    """Map every topic index to a class label by majority vote.

    Parameters
    ----------
    dominant_topics : 1-D integer array
        Index of the highest-weight topic for each document.
    labels : array-like
        Class label of each document, aligned with ``dominant_topics``.
    n_topics : int
        Number of topics in the LDA model.

    Returns
    -------
    numpy.ndarray of length ``n_topics``
        ``result[t]`` is the most frequent label among the documents whose dominant
        topic is ``t``. A topic that dominates no document gets the most frequent
        label overall. Ties go to the smallest label value.
    """
    classes, encoded = np.unique(np.asarray(labels).ravel(), return_inverse=True)
    # Start from the overall majority label so topics with no documents still map somewhere.
    topic_labels = np.full(n_topics, classes[np.bincount(encoded).argmax()])
    for topic in range(n_topics):
        members = encoded[dominant_topics == topic]
        if members.size:
            topic_labels[topic] = classes[np.bincount(members).argmax()]
    return topic_labels


# Pooled out-of-fold labels and predictions.
overall_valid_true, overall_valid_pred = [], []

for fold, (train_index, val_index) in enumerate(kf.split(train_x_tfidf), start=1):
    print(f"\n\n--- Fold {fold} ---")

    X_train, X_val = train_x_tfidf[train_index], train_x_tfidf[val_index]
    y_train, y_val = train_y[train_index], train_y[val_index]

    # Fit topics on this fold's training rows only, then project the validation rows.
    fold_lda = LatentDirichletAllocation(**lda_params)
    X_train_lda = fold_lda.fit_transform(X_train)
    X_val_lda = fold_lda.transform(X_val)

    # Learn topic -> label from the training rows; the validation labels stay unseen.
    fold_topic_labels = _topic_label_map(np.argmax(X_train_lda, axis=1), y_train, N_TOPICS)
    val_pred_y = fold_topic_labels[np.argmax(X_val_lda, axis=1)]
    overall_valid_true.extend(y_val)
    overall_valid_pred.extend(val_pred_y)
    print(f"Validation accuracy: {accuracy_score(y_val, val_pred_y)}")

# Final model: fit on all training data and score the held-out test set once.
lda_model = LatentDirichletAllocation(**lda_params)
train_x_lda = lda_model.fit_transform(train_x_tfidf)
lda_topic_labels = _topic_label_map(np.argmax(train_x_lda, axis=1), train_y, N_TOPICS)
test_x_lda = lda_model.transform(test_x_tfidf)
test_pred_y = lda_topic_labels[np.argmax(test_x_lda, axis=1)]
overall_test_true = list(test_y)
overall_test_pred = list(test_pred_y)

# Predictions are class labels now, so F1 uses the same default (binary) averaging as
# the other model cells in this notebook and the scores are directly comparable.
overall_valid_accuracy = accuracy_score(overall_valid_true, overall_valid_pred)
overall_valid_f1 = f1_score(overall_valid_true, overall_valid_pred)
overall_valid_mcc = matthews_corrcoef(overall_valid_true, overall_valid_pred)
overall_valid_cmd = confusion_matrix(overall_valid_true, overall_valid_pred)
overall_valid_report = classification_report(overall_valid_true, overall_valid_pred)

overall_test_accuracy = accuracy_score(overall_test_true, overall_test_pred)
overall_test_f1 = f1_score(overall_test_true, overall_test_pred)
overall_test_mcc = matthews_corrcoef(overall_test_true, overall_test_pred)
overall_test_cmd = confusion_matrix(overall_test_true, overall_test_pred)
overall_test_report = classification_report(overall_test_true, overall_test_pred)

print("\n\n--- Overall Validation Metrics ---")
print(f"Accuracy: {overall_valid_accuracy}")
print(f"F1 Score: {overall_valid_f1}")
print(f"MCC: {overall_valid_mcc}")
print("Confusion Matrix:\n", overall_valid_cmd)
print("Classification Report:\n", overall_valid_report)

print("\n\n--- Overall Test Metrics ---")
print(f"Accuracy: {overall_test_accuracy}")
print(f"F1 Score: {overall_test_f1}")
print(f"MCC: {overall_test_mcc}")
print("Confusion Matrix:\n", overall_test_cmd)
print("Classification Report:\n", overall_test_report)

# Plot confusion matrices, titled so each figure stands alone.
# assumes the smaller label value means 'Human' and the larger 'Generated' — TODO confirm
valid_cm_display = ConfusionMatrixDisplay(confusion_matrix=overall_valid_cmd, display_labels=['Human', 'Generated']).plot()
valid_cm_display.ax_.set_title("LDA topic vote - validation (out-of-fold)")
test_cm_display = ConfusionMatrixDisplay(confusion_matrix=overall_test_cmd, display_labels=['Human', 'Generated']).plot()
test_cm_display.ax_.set_title("LDA topic vote - test")

# Save the final LDA model under the original file name, plus the topic -> label
# mapping, which is needed alongside it to turn topic distributions into classes.
joblib.dump(lda_model, 'lda_model_Kfold.joblib')
joblib.dump(lda_topic_labels, 'lda_topic_labels_Kfold.joblib')