In [None]:
!pip install scikit-learn pandas numpy nltk tensorflow spacy tqdm

In [None]:
import string
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import nltk
import re
import tensorflow as tf
import json
import csv
import spacy
from tqdm import tqdm
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
import pandas as pd

In [None]:
def train_dev_jsonl_to_csv(jsonl_file, csv_file):
    """Convert a labelled JSONL file to a CSV with columns id, text, label.

    Each non-blank line of ``jsonl_file`` is parsed as one JSON object; only the
    ``id``, ``text`` and ``label`` keys are written, any other keys are dropped.

    Args:
        jsonl_file: Path of the UTF-8 JSONL file to read.
        csv_file: Path of the CSV file to create (overwritten if it exists).

    Raises:
        KeyError: If a record lacks one of the three required keys.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    csv_columns = ['id', 'text', 'label']

    with open(jsonl_file, 'r', encoding='utf-8') as source, \
            open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()

        for line in source:
            # Blank lines (e.g. a trailing empty line) are not records; feeding
            # them to json.loads would abort the whole conversion.
            if not line.strip():
                continue
            data = json.loads(line)
            writer.writerow({key: data[key] for key in csv_columns})

def test_jsonl_to_csv(jsonl_file, csv_file):
    csv_columns = ['id', 'text']
    
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()
        
        with open(jsonl_file, 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                writer.writerow(data)

In [None]:
# Input JSONL files and the CSV file each one is converted to.
en_train_jsonl_file, en_train_csv_file = 'en_train.jsonl', 'en_train.csv'
en_dev_jsonl_file, en_dev_csv_file = 'en_dev.jsonl', 'en_dev.csv'
en_test_jsonl_file, en_test_csv_file = 'en_devtest_text_id_only.jsonl', 'en_devtest.csv'

# Train and dev carry labels; the devtest file is converted with the
# id/text-only converter.
for convert, jsonl_path, csv_path in (
    (train_dev_jsonl_to_csv, en_train_jsonl_file, en_train_csv_file),
    (train_dev_jsonl_to_csv, en_dev_jsonl_file, en_dev_csv_file),
    (test_jsonl_to_csv, en_test_jsonl_file, en_test_csv_file),
):
    convert(jsonl_path, csv_path)
    print(f"Data successfully written to {csv_path}")

# Load the two labelled CSVs that were just written.
train = pd.read_csv("en_train.csv")
dev = pd.read_csv("en_dev.csv")

In [None]:
# Store the label column of both labelled frames as float.
for labelled_frame in (train, dev):
    labelled_frame['label'] = labelled_frame['label'].astype(float)

In [None]:
# Preview the first rows rather than rendering the whole frame.
train.head()

In [None]:
# Preview the first rows rather than rendering the whole frame.
dev.head()

In [None]:
class TextPreprocessor:
    """Derive cleaned, lemmatized, stemmed and POS-tag text columns for a DataFrame.

    The wrapped DataFrame must contain a ``text`` column. ``process_data`` adds
    the columns ``clean_text``, ``lemmatized_text``, ``stemmed_text``, ``pos``
    and ``combined_text`` to that same DataFrame (it is modified in place).
    """

    def __init__(self, data):
        """Store the DataFrame and load the spaCy pipeline and Snowball stemmer.

        Args:
            data: DataFrame with a ``text`` column; mutated by ``process_data``.
        """
        self.data = data
        self.nlp = spacy.load('en_core_web_sm')
        self.stemmer = SnowballStemmer('english')

    def clean_text(self, text):
        """Clean the input text by removing URLs, mentions, hashtags, numbers, punctuations, etc.

        Args:
            text: Raw text. Non-string values (e.g. NaN from an empty CSV field)
                are treated as empty.

        Returns:
            The lower-cased text with single spaces between words, or the
            placeholder ``"no text"`` when nothing is left after cleaning.
        """
        if not isinstance(text, str):
            return "no text"
        text = re.sub(r"@\S+", "", text)             # @mentions
        text = re.sub(r"http[s]?\://\S+", "", text)  # URLs
        text = re.sub(r"#\S+", "", text)             # hashtags
        text = re.sub(r"[0-9]", "", text)            # digits
        text = re.sub(r"[\[\]()]", "", text)         # brackets / parentheses
        # Newlines become spaces (not ""), otherwise the last word of one line
        # is glued to the first word of the next.
        text = re.sub(r"\n", " ", text)
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = re.sub(r'[^\w\s]', '', text)          # remaining non-word symbols
        text = text.lower()
        text = re.sub(r"\s+", " ", text).strip()
        return text if text else "no text"

    def lemmatize_sentence(self, sentence):
        """Apply lemmatization to a sentence using SpaCy.

        Returns:
            The lemmas of all tokens joined by single spaces.
        """
        doc = self.nlp(sentence)
        return " ".join(token.lemma_ for token in doc)

    def stem_sentence(self, sentence):
        """Apply stemming to a sentence using NLTK's Snowball Stemmer.

        Returns:
            The stems of the whitespace-separated words joined by single spaces.
        """
        return " ".join(self.stemmer.stem(word) for word in sentence.split())

    def pos_tagging(self, sentence):
        """Perform POS tagging on a sentence using SpaCy.

        Returns:
            The coarse POS tag of every token joined by single spaces.
        """
        doc = self.nlp(sentence)
        return " ".join(token.pos_ for token in doc)

    def process_data(self):
        """Process the entire DataFrame, applying cleaning, lemmatization, stemming, and POS tagging."""
        # Clean the text
        self.data['clean_text'] = self.data['text'].apply(self.clean_text)
        cleaned = self.data['clean_text'].tolist()

        # Lemmas and POS tags come from the same spaCy parse, so each text is
        # run through the pipeline once instead of once per feature.
        lemmatized_text = []
        pos_tags = []
        for doc in tqdm(self.nlp.pipe(cleaned), total=len(cleaned), desc='Lemmatizing / POS tagging'):
            lemmatized_text.append(" ".join(token.lemma_ for token in doc))
            pos_tags.append(" ".join(token.pos_ for token in doc))

        # Stem the text
        stemmed_text = [self.stem_sentence(sentence) for sentence in tqdm(cleaned, desc='Stemming')]

        self.data['lemmatized_text'] = lemmatized_text
        self.data['stemmed_text'] = stemmed_text
        self.data['pos'] = pos_tags

        # All four views of the text concatenated into one string per row.
        self.data['combined_text'] = (
            self.data['clean_text'] + ' '
            + self.data['lemmatized_text'] + ' '
            + self.data['stemmed_text'] + ' '
            + self.data['pos']
        )

    def get_processed_data(self):
        """Return the processed DataFrame with clean_text, lemmatized_text, stemmed_text, and pos columns."""
        return self.data

In [None]:
# One preprocessor per labelled frame.
train_preprocessor = TextPreprocessor(train)
dev_preprocessor = TextPreprocessor(dev)

# Add the derived text columns to each frame (train first, then dev).
for preprocessor in (train_preprocessor, dev_preprocessor):
    preprocessor.process_data()

In [None]:
train_data = train_preprocessor.get_processed_data()
dev_data = dev_preprocessor.get_processed_data()

# combined_text = cleaned + lemmatized + stemmed + POS strings, space-separated.
for processed_frame in (train_data, dev_data):
    processed_frame['combined_text'] = (
        processed_frame['clean_text'] + ' '
        + processed_frame['lemmatized_text'] + ' '
        + processed_frame['stemmed_text'] + ' '
        + processed_frame['pos']
    )

In [None]:
# Preview the first processed rows rather than rendering the whole frame.
train_data.head()

In [None]:
# Preview the first processed rows rather than rendering the whole frame.
dev_data.head()

In [None]:
# Train / validation / test split.
# NOTE(review): no cell in this notebook defines train_x, valid_x, test_x or the
# matching label arrays, so this cell raised NameError on a fresh kernel. The
# split below is a reconstruction — confirm it matches the intended protocol:
#   * the labelled training file is split 80/20 (stratified, fixed seed) into
#     train and validation;
#   * the labelled dev file is used as the held-out test set.
train_x, valid_x, train_y, valid_y = train_test_split(
    train_data['combined_text'],
    train_data['label'],
    test_size=0.2,
    stratify=train_data['label'],
    random_state=42,
)
test_x = dev_data['combined_text']
test_y = dev_data['label']

# TF-IDF Vectorization using all combined text features.
# The vocabulary is fitted on the training split only and reused for the others.
tfidf = TfidfVectorizer(max_features=5000)
train_x_tfidf = tfidf.fit_transform(train_x).toarray()
valid_x_tfidf = tfidf.transform(valid_x).toarray()
test_x_tfidf = tfidf.transform(test_x).toarray()

# Convert labels to numpy arrays
train_y = np.array(train_y)
valid_y = np.array(valid_y)
test_y = np.array(test_y)

# Create TensorFlow datasets (unshuffled, batches of 64)
train_dataset = tf.data.Dataset.from_tensor_slices((train_x_tfidf, train_y)).batch(64).prefetch(tf.data.AUTOTUNE)
valid_dataset = tf.data.Dataset.from_tensor_slices((valid_x_tfidf, valid_y)).batch(64).prefetch(tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices((test_x_tfidf, test_y)).batch(64).prefetch(tf.data.AUTOTUNE)

In [None]:
### CNN Model
# Two Conv1D/MaxPooling1D stages over the TF-IDF vector (treated as a sequence
# with one channel), followed by a dense head with a sigmoid output. The cell
# trains for 5 epochs and reports accuracy, F1, MCC, a confusion matrix and a
# classification report for the validation and test splits.

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    classification_report,
    f1_score,
    matthews_corrcoef,
)

model_CNN = Sequential([
    Conv1D(filters=128, kernel_size=3, activation='relu', input_shape=(train_x_tfidf.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model_CNN.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Conv1D expects (samples, steps, channels): append a channel axis of size 1.
train_x_tfidf_reshaped = train_x_tfidf.reshape(train_x_tfidf.shape + (1,))
valid_x_tfidf_reshaped = valid_x_tfidf.reshape(valid_x_tfidf.shape + (1,))
test_x_tfidf_reshaped = test_x_tfidf.reshape(test_x_tfidf.shape + (1,))

model_CNN.fit(
    train_x_tfidf_reshaped,
    train_y,
    validation_data=(valid_x_tfidf_reshaped, valid_y),
    epochs=5,
)

# ---- validation split ----
valid_pred_y = (model_CNN.predict(valid_x_tfidf_reshaped) > 0.5).astype("int32").flatten()
for caption, metric_fn in (
    ("Validation Accuracy: ", accuracy_score),
    ("Validation F1 Score: ", f1_score),
    ("Validation MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(valid_y, valid_pred_y))

cmd_valid_cnn = ConfusionMatrixDisplay(
    confusion_matrix(valid_y, valid_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_valid_cnn.plot()

report_cnn_valid = classification_report(valid_y, valid_pred_y)
print(report_cnn_valid)

# ---- test split ----
test_pred_y = (model_CNN.predict(test_x_tfidf_reshaped) > 0.5).astype("int32").flatten()
for caption, metric_fn in (
    ("Test Accuracy: ", accuracy_score),
    ("Test F1 Score: ", f1_score),
    ("Test MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(test_y, test_pred_y))

cmd_cnn_test = ConfusionMatrixDisplay(
    confusion_matrix(test_y, test_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_cnn_test.plot()

report_cnn_test = classification_report(test_y, test_pred_y)
print(report_cnn_test)

In [None]:
# Write the trained CNN to 'model_CNN_Traditional.h5' in the working directory.
model_CNN.save('model_CNN_Traditional.h5')

In [None]:
### LSTM model
# NOTE(review): despite the name, this network contains only Dense and Dropout
# layers (no LSTM layer) — it is a feed-forward classifier on the TF-IDF vector.
# The cell trains it for 5 epochs on the batched tf.data datasets, reports
# metrics for the validation and test splits, and saves the model.

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import (
    accuracy_score, confusion_matrix, ConfusionMatrixDisplay,
    classification_report, f1_score, matthews_corrcoef,
)

model_LSTM = Sequential([
    Dense(128, activation='relu', input_shape=(train_x_tfidf.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model_LSTM.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_LSTM.fit(train_dataset, validation_data=valid_dataset, epochs=5)

# ---- validation split ----
valid_pred_y = (model_LSTM.predict(valid_dataset) > 0.5).astype("int32").flatten()
f1_valid = f1_score(valid_y, valid_pred_y)
mcc_valid = matthews_corrcoef(valid_y, valid_pred_y)

for caption, value in (
    ("Validation Accuracy: ", accuracy_score(valid_y, valid_pred_y)),
    ("Validation F1 Score: ", f1_valid),
    ("Validation MCC: ", mcc_valid),
):
    print(caption, value)

cmd_valid = ConfusionMatrixDisplay(
    confusion_matrix(valid_y, valid_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_valid.plot()

report_valid = classification_report(valid_y, valid_pred_y)
print("Validation Classification Report:\n", report_valid)

# ---- test split ----
test_pred_y = (model_LSTM.predict(test_dataset) > 0.5).astype("int32").flatten()
f1_test = f1_score(test_y, test_pred_y)
mcc_test = matthews_corrcoef(test_y, test_pred_y)

for caption, value in (
    ("Test Accuracy: ", accuracy_score(test_y, test_pred_y)),
    ("Test F1 Score: ", f1_test),
    ("Test MCC: ", mcc_test),
):
    print(caption, value)

cmd_test = ConfusionMatrixDisplay(
    confusion_matrix(test_y, test_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_test.plot()

report_test = classification_report(test_y, test_pred_y)
print("Test Classification Report:\n", report_test)

model_LSTM.save('model_LSTM_Traditional.h5')

In [None]:
### RNN model
# NOTE(review): despite the name, this network is a stack of Dense layers only
# (no recurrent layer). It is trained for up to 5 epochs with early stopping on
# the validation loss, saved, and then evaluated on validation and test.

import tensorflow as tf
from sklearn.metrics import (
    accuracy_score, confusion_matrix, ConfusionMatrixDisplay,
    classification_report, f1_score, matthews_corrcoef,
)

model_RNN = tf.keras.Sequential(
    [tf.keras.layers.Input(shape=(train_x_tfidf.shape[1],))]
    + [tf.keras.layers.Dense(units, activation='relu') for units in (512, 256, 128, 64, 32)]
    + [tf.keras.layers.Dense(1, activation='sigmoid')]
)

model_RNN.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

# Stop when val_loss has not improved for 3 epochs and keep the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True
)
model_RNN.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=5,
    callbacks=[early_stopping],
)

model_RNN.save('model_RNN_Traditional.h5')

# ---- validation split ----
valid_pred_prob = model_RNN.predict(valid_dataset)
valid_pred_y = (valid_pred_prob > 0.5).astype(int).flatten()

for caption, metric_fn in (
    ("Validation Accuracy: ", accuracy_score),
    ("Validation F1 Score: ", f1_score),
    ("Validation MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(valid_y, valid_pred_y))

cmd_valid = ConfusionMatrixDisplay(
    confusion_matrix(valid_y, valid_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_valid.plot()

report_valid = classification_report(valid_y, valid_pred_y)
print("Validation Classification Report:\n", report_valid)

# ---- test split ----
test_pred_prob = model_RNN.predict(test_dataset)
test_pred_y = (test_pred_prob > 0.5).astype(int).flatten()

for caption, metric_fn in (
    ("Test Accuracy: ", accuracy_score),
    ("Test F1 Score: ", f1_score),
    ("Test MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(test_y, test_pred_y))

cmd_test = ConfusionMatrixDisplay(
    confusion_matrix(test_y, test_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_test.plot()

report_test = classification_report(test_y, test_pred_y)
print("Test Classification Report:\n", report_test)

In [None]:
### SVM model
# Linear-kernel support vector classifier on the dense TF-IDF matrix. Prints
# accuracy, F1 and MCC, plots a confusion matrix and prints a classification
# report for the validation split and then the test split, and finally writes
# the fitted model to disk with joblib.

from sklearn import svm
from sklearn.metrics import (
    accuracy_score, confusion_matrix, ConfusionMatrixDisplay,
    classification_report, f1_score, matthews_corrcoef,
)
import joblib

# probability=True because the ROC cell later calls predict_proba on this model.
svm_model = svm.SVC(kernel='linear', probability=True)
svm_model.fit(train_x_tfidf, train_y)

# ---- validation split ----
valid_pred_y = svm_model.predict(valid_x_tfidf)
for caption, metric_fn in (
    ("Validation Accuracy: ", accuracy_score),
    ("Validation F1 Score: ", f1_score),
    ("Validation MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(valid_y, valid_pred_y))

cmd_valid = ConfusionMatrixDisplay(
    confusion_matrix(valid_y, valid_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_valid.plot()

report_valid = classification_report(valid_y, valid_pred_y)
print("Validation Classification Report:\n", report_valid)

# ---- test split ----
test_pred_y = svm_model.predict(test_x_tfidf)
for caption, metric_fn in (
    ("Test Accuracy: ", accuracy_score),
    ("Test F1 Score: ", f1_score),
    ("Test MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(test_y, test_pred_y))

cmd_test = ConfusionMatrixDisplay(
    confusion_matrix(test_y, test_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_test.plot()

report_test = classification_report(test_y, test_pred_y)
print("Test Classification Report:\n", report_test)

joblib.dump(svm_model, 'svm_model_Traditional.joblib')

In [None]:
### KNN model
# 5-nearest-neighbours classifier on the dense TF-IDF matrix. Reports accuracy,
# F1, MCC, a confusion matrix and a classification report for the validation
# and test splits, then writes the fitted model to disk with joblib.

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, confusion_matrix, ConfusionMatrixDisplay,
    classification_report, f1_score, matthews_corrcoef,
)
import joblib

knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(train_x_tfidf, train_y)

# ---- validation split ----
valid_pred_y = knn_model.predict(valid_x_tfidf)
for caption, metric_fn in (
    ("Validation Accuracy: ", accuracy_score),
    ("Validation F1 Score: ", f1_score),
    ("Validation MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(valid_y, valid_pred_y))

cmd_valid = ConfusionMatrixDisplay(
    confusion_matrix(valid_y, valid_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_valid.plot()

report_valid = classification_report(valid_y, valid_pred_y)
print("Validation Classification Report:\n", report_valid)

# ---- test split ----
test_pred_y = knn_model.predict(test_x_tfidf)
for caption, metric_fn in (
    ("Test Accuracy: ", accuracy_score),
    ("Test F1 Score: ", f1_score),
    ("Test MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(test_y, test_pred_y))

cmd_test = ConfusionMatrixDisplay(
    confusion_matrix(test_y, test_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_test.plot()

report_test = classification_report(test_y, test_pred_y)
print("Test Classification Report:\n", report_test)

joblib.dump(knn_model, 'knn_model_Traditional.joblib')

In [None]:
### Decision Tree model
# Single decision tree (fixed random_state) on the dense TF-IDF matrix. Reports
# accuracy, F1, MCC, a confusion matrix and a classification report for the
# validation and test splits, then writes the fitted model to disk with joblib.

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, confusion_matrix, ConfusionMatrixDisplay,
    classification_report, f1_score, matthews_corrcoef,
)
import joblib

dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(train_x_tfidf, train_y)

# ---- validation split ----
valid_pred_y = dt_model.predict(valid_x_tfidf)
for caption, metric_fn in (
    ("Validation Accuracy: ", accuracy_score),
    ("Validation F1 Score: ", f1_score),
    ("Validation MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(valid_y, valid_pred_y))

cmd_valid = ConfusionMatrixDisplay(
    confusion_matrix(valid_y, valid_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_valid.plot()

report_valid = classification_report(valid_y, valid_pred_y)
print("Validation Classification Report:\n", report_valid)

# ---- test split ----
test_pred_y = dt_model.predict(test_x_tfidf)
for caption, metric_fn in (
    ("Test Accuracy: ", accuracy_score),
    ("Test F1 Score: ", f1_score),
    ("Test MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(test_y, test_pred_y))

cmd_test = ConfusionMatrixDisplay(
    confusion_matrix(test_y, test_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_test.plot()

report_test = classification_report(test_y, test_pred_y)
print("Test Classification Report:\n", report_test)

joblib.dump(dt_model, 'decision_tree_model_Traditional.joblib')

In [None]:
### Adaboost model
# AdaBoost over 100 depth-1 decision trees (stumps) on the dense TF-IDF matrix.
# Reports accuracy, F1, MCC, a confusion matrix and a classification report for
# the validation and test splits, then writes the fitted model to disk.

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, confusion_matrix, ConfusionMatrixDisplay,
    classification_report, f1_score, matthews_corrcoef,
)
import joblib

dt_stump = DecisionTreeClassifier(max_depth=1, random_state=42)
# NOTE(review): recent scikit-learn releases deprecate AdaBoostClassifier's
# `algorithm` argument — check this call against the installed version.
ada_model = AdaBoostClassifier(
    estimator=dt_stump,
    n_estimators=100,
    algorithm='SAMME',
    random_state=42,
)
ada_model.fit(train_x_tfidf, train_y)

# ---- validation split ----
valid_pred_y = ada_model.predict(valid_x_tfidf)
for caption, metric_fn in (
    ("Validation Accuracy: ", accuracy_score),
    ("Validation F1 Score: ", f1_score),
    ("Validation MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(valid_y, valid_pred_y))

cmd_valid = ConfusionMatrixDisplay(
    confusion_matrix(valid_y, valid_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_valid.plot()

report_valid = classification_report(valid_y, valid_pred_y)
print("Validation Classification Report:\n", report_valid)

# ---- test split ----
test_pred_y = ada_model.predict(test_x_tfidf)
for caption, metric_fn in (
    ("Test Accuracy: ", accuracy_score),
    ("Test F1 Score: ", f1_score),
    ("Test MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(test_y, test_pred_y))

cmd_test = ConfusionMatrixDisplay(
    confusion_matrix(test_y, test_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_test.plot()

report_test = classification_report(test_y, test_pred_y)
print("Test Classification Report:\n", report_test)

joblib.dump(ada_model, 'adaboost_model_Traditional.joblib')

In [None]:
### Bagging Classifier model
# Bagging ensemble of 100 decision trees on the dense TF-IDF matrix. Reports
# accuracy, F1, MCC, a confusion matrix and a classification report for the
# validation and test splits, then writes the fitted model to disk with joblib.

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, confusion_matrix, ConfusionMatrixDisplay,
    classification_report, f1_score, matthews_corrcoef,
)
import joblib

base_estimator = DecisionTreeClassifier()
bagging_model = BaggingClassifier(
    estimator=base_estimator,
    n_estimators=100,
    random_state=42,
)
bagging_model.fit(train_x_tfidf, train_y)

# ---- validation split ----
valid_pred_y = bagging_model.predict(valid_x_tfidf)
for caption, metric_fn in (
    ("Validation Accuracy: ", accuracy_score),
    ("Validation F1 Score: ", f1_score),
    ("Validation MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(valid_y, valid_pred_y))

cmd_valid = ConfusionMatrixDisplay(
    confusion_matrix(valid_y, valid_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_valid.plot()

report_valid = classification_report(valid_y, valid_pred_y)
print("Validation Classification Report:\n", report_valid)

# ---- test split ----
test_pred_y = bagging_model.predict(test_x_tfidf)
for caption, metric_fn in (
    ("Test Accuracy: ", accuracy_score),
    ("Test F1 Score: ", f1_score),
    ("Test MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(test_y, test_pred_y))

cmd_test = ConfusionMatrixDisplay(
    confusion_matrix(test_y, test_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_test.plot()

report_test = classification_report(test_y, test_pred_y)
print("Test Classification Report:\n", report_test)

joblib.dump(bagging_model, 'bagging_model_Traditional.joblib')

In [None]:
### Gradient Boosting Classifier model
# Gradient boosting (100 trees of depth 3, learning rate 0.1) on the dense
# TF-IDF matrix. Reports accuracy, F1, MCC, a confusion matrix and a
# classification report for the validation and test splits, then writes the
# fitted model to disk with joblib.

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, confusion_matrix, ConfusionMatrixDisplay,
    classification_report, f1_score, matthews_corrcoef,
)
import joblib

gradient_boosting_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gradient_boosting_model.fit(train_x_tfidf, train_y)

# ---- validation split ----
valid_pred_y = gradient_boosting_model.predict(valid_x_tfidf)
for caption, metric_fn in (
    ("Validation Accuracy: ", accuracy_score),
    ("Validation F1 Score: ", f1_score),
    ("Validation MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(valid_y, valid_pred_y))

cmd_valid = ConfusionMatrixDisplay(
    confusion_matrix(valid_y, valid_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_valid.plot()

report_valid = classification_report(valid_y, valid_pred_y)
print("Validation Classification Report:\n", report_valid)

# ---- test split ----
test_pred_y = gradient_boosting_model.predict(test_x_tfidf)
for caption, metric_fn in (
    ("Test Accuracy: ", accuracy_score),
    ("Test F1 Score: ", f1_score),
    ("Test MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(test_y, test_pred_y))

cmd_test = ConfusionMatrixDisplay(
    confusion_matrix(test_y, test_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_test.plot()

report_test = classification_report(test_y, test_pred_y)
print("Test Classification Report:\n", report_test)

joblib.dump(gradient_boosting_model, 'gradient_boosting_model_Traditional.joblib')

In [None]:
### Random Forest Classifier Model
# Random forest of 100 trees (n_jobs=-1) on the dense TF-IDF matrix. Reports
# accuracy, F1, MCC, a confusion matrix and a classification report for the
# validation and test splits, then writes the fitted model to disk with joblib.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, confusion_matrix, ConfusionMatrixDisplay,
    classification_report, f1_score, matthews_corrcoef,
)
import joblib

rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(train_x_tfidf, train_y)

# ---- validation split ----
valid_pred_y = rf_model.predict(valid_x_tfidf)
for caption, metric_fn in (
    ("Validation Accuracy: ", accuracy_score),
    ("Validation F1 Score: ", f1_score),
    ("Validation MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(valid_y, valid_pred_y))

cmd_valid = ConfusionMatrixDisplay(
    confusion_matrix(valid_y, valid_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_valid.plot()

report_valid = classification_report(valid_y, valid_pred_y)
print("Validation Classification Report:\n", report_valid)

# ---- test split ----
test_pred_y = rf_model.predict(test_x_tfidf)
for caption, metric_fn in (
    ("Test Accuracy: ", accuracy_score),
    ("Test F1 Score: ", f1_score),
    ("Test MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(test_y, test_pred_y))

cmd_test = ConfusionMatrixDisplay(
    confusion_matrix(test_y, test_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_test.plot()

report_test = classification_report(test_y, test_pred_y)
print("Test Classification Report:\n", report_test)

joblib.dump(rf_model, 'random_forest_model_Traditional.joblib')

In [None]:
### Extra Tree Classifier model
# Extra-trees ensemble of 100 trees on the dense TF-IDF matrix. Reports
# accuracy, F1, MCC, a confusion matrix and a classification report for the
# validation and test splits, then writes the fitted model to disk with joblib.

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import (
    accuracy_score, confusion_matrix, ConfusionMatrixDisplay,
    classification_report, f1_score, matthews_corrcoef,
)
import joblib

extra_trees_model = ExtraTreesClassifier(n_estimators=100, random_state=42)
extra_trees_model.fit(train_x_tfidf, train_y)

# ---- validation split ----
valid_pred_y = extra_trees_model.predict(valid_x_tfidf)
for caption, metric_fn in (
    ("Validation Accuracy: ", accuracy_score),
    ("Validation F1 Score: ", f1_score),
    ("Validation MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(valid_y, valid_pred_y))

cmd_valid = ConfusionMatrixDisplay(
    confusion_matrix(valid_y, valid_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_valid.plot()

report_valid = classification_report(valid_y, valid_pred_y)
print("Validation Classification Report:\n", report_valid)

# ---- test split ----
test_pred_y = extra_trees_model.predict(test_x_tfidf)
for caption, metric_fn in (
    ("Test Accuracy: ", accuracy_score),
    ("Test F1 Score: ", f1_score),
    ("Test MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(test_y, test_pred_y))

cmd_test = ConfusionMatrixDisplay(
    confusion_matrix(test_y, test_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_test.plot()

report_test = classification_report(test_y, test_pred_y)
print("Test Classification Report:\n", report_test)

joblib.dump(extra_trees_model, 'extra_trees_model_Traditional.joblib')

In [None]:
### Logistic Regression model
# Logistic regression (up to 1000 solver iterations) on the dense TF-IDF matrix.
# Reports accuracy, F1, MCC, a confusion matrix and a classification report for
# the validation and test splits, then writes the fitted model to disk.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, confusion_matrix, ConfusionMatrixDisplay,
    classification_report, f1_score, matthews_corrcoef,
)
import joblib

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(train_x_tfidf, train_y)

# ---- validation split ----
valid_pred_y = log_reg.predict(valid_x_tfidf)
for caption, metric_fn in (
    ("Validation Accuracy: ", accuracy_score),
    ("Validation F1 Score: ", f1_score),
    ("Validation MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(valid_y, valid_pred_y))

cmd_valid = ConfusionMatrixDisplay(
    confusion_matrix(valid_y, valid_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_valid.plot()

report_valid = classification_report(valid_y, valid_pred_y)
print("Validation Classification Report:\n", report_valid)

# ---- test split ----
test_pred_y = log_reg.predict(test_x_tfidf)
for caption, metric_fn in (
    ("Test Accuracy: ", accuracy_score),
    ("Test F1 Score: ", f1_score),
    ("Test MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(test_y, test_pred_y))

cmd_test = ConfusionMatrixDisplay(
    confusion_matrix(test_y, test_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_test.plot()

report_test = classification_report(test_y, test_pred_y)
print("Test Classification Report:\n", report_test)

joblib.dump(log_reg, 'logistic_regression_model_Traditional.joblib')

In [None]:
### Multi Layer Perceptron model
# scikit-learn MLP with one hidden layer of 100 units on the dense TF-IDF
# matrix. Reports accuracy, F1, MCC, a confusion matrix and a classification
# report for the validation and test splits, then writes the fitted model to
# disk with joblib.

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score, confusion_matrix, ConfusionMatrixDisplay,
    classification_report, f1_score, matthews_corrcoef,
)
import joblib

mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
mlp_model.fit(train_x_tfidf, train_y)

# ---- validation split ----
valid_pred_y = mlp_model.predict(valid_x_tfidf)
for caption, metric_fn in (
    ("Validation Accuracy: ", accuracy_score),
    ("Validation F1 Score: ", f1_score),
    ("Validation MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(valid_y, valid_pred_y))

cmd_valid = ConfusionMatrixDisplay(
    confusion_matrix(valid_y, valid_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_valid.plot()

report_valid = classification_report(valid_y, valid_pred_y)
print("Validation Classification Report:\n", report_valid)

# ---- test split ----
test_pred_y = mlp_model.predict(test_x_tfidf)
for caption, metric_fn in (
    ("Test Accuracy: ", accuracy_score),
    ("Test F1 Score: ", f1_score),
    ("Test MCC: ", matthews_corrcoef),
):
    print(caption, metric_fn(test_y, test_pred_y))

cmd_test = ConfusionMatrixDisplay(
    confusion_matrix(test_y, test_pred_y),
    display_labels=['Human', 'Generated'],
)
cmd_test.plot()

report_test = classification_report(test_y, test_pred_y)
print("Test Classification Report:\n", report_test)

joblib.dump(mlp_model, 'mlp_model_Traditional.joblib')

In [None]:
### LDA topic features + Logistic Regression
# LatentDirichletAllocation is an unsupervised topic model: transform() returns
# a matrix of per-document topic weights, not class labels. The previous version
# of this cell called lda_model.transform() on the already-transformed 10-column
# topic matrix (the model was fitted on the TF-IDF matrix) and scored the result
# as if it were predictions, so it could not run.
# Here the topic weights are used as features for a classifier instead.
# NOTE(review): LogisticRegression is an assumed choice of classifier; if a
# Linear Discriminant Analysis classifier was intended instead, replace this
# cell accordingly.

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    classification_report,
    f1_score,
    matthews_corrcoef
)
import joblib

# Topic model: fitted on the training split only, then applied to all splits.
lda_model = LatentDirichletAllocation(n_components=10, max_iter=10, random_state=42)
train_x_lda = lda_model.fit_transform(train_x_tfidf)
valid_x_lda = lda_model.transform(valid_x_tfidf)
test_x_lda = lda_model.transform(test_x_tfidf)

# Classifier on the 10 topic weights.
lda_classifier = LogisticRegression(max_iter=1000)
lda_classifier.fit(train_x_lda, train_y)

valid_pred_y = lda_classifier.predict(valid_x_lda)
print("Validation Accuracy: ", accuracy_score(valid_y, valid_pred_y))
print("Validation F1 Score: ", f1_score(valid_y, valid_pred_y))
print("Validation MCC: ", matthews_corrcoef(valid_y, valid_pred_y))

cmd_valid = ConfusionMatrixDisplay(confusion_matrix(valid_y, valid_pred_y), display_labels=['Human', 'Generated'])
cmd_valid.plot()

report_valid = classification_report(valid_y, valid_pred_y)
print("Validation Classification Report:\n", report_valid)

test_pred_y = lda_classifier.predict(test_x_lda)
print("Test Accuracy: ", accuracy_score(test_y, test_pred_y))
print("Test F1 Score: ", f1_score(test_y, test_pred_y))
print("Test MCC: ", matthews_corrcoef(test_y, test_pred_y))

cmd_test = ConfusionMatrixDisplay(confusion_matrix(test_y, test_pred_y), display_labels=['Human', 'Generated'])
cmd_test.plot()

report_test = classification_report(test_y, test_pred_y)
print("Test Classification Report:\n", report_test)

# Both parts are needed to reproduce predictions: the topic model and the
# classifier trained on its output.
joblib.dump(lda_classifier, 'lda_classifier.joblib')
joblib.dump(lda_model, 'lda_model.joblib')

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def plot_roc_curve(y_true, y_scores, label):
    """Add one ROC curve to the current matplotlib figure.

    Args:
        y_true: True binary labels.
        y_scores: Score per sample for the positive class.
        label: Legend text; the curve's AUC (two decimals) is appended to it.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_scores)
    area_under_curve = auc(false_pos_rate, true_pos_rate)
    legend_text = f'{label} (AUC = {area_under_curve:.2f})'
    plt.plot(false_pos_rate, true_pos_rate, label=legend_text)

# ROC curves for every trained model: one figure for the validation split and
# one for the test split.
#
# Fixes relative to the previous version of this cell:
#   * it referenced gb_model, et_model and lr_model, which are never defined;
#     the fitted models are gradient_boosting_model, extra_trees_model and
#     log_reg;
#   * the MLP curve was built from hard predict() labels; it now uses
#     predict_proba like the other scikit-learn models.

# scikit-learn classifiers, scored with predict_proba(...)[:, 1]
# (probability of class 1, labelled 'Generated' in the confusion matrices).
roc_sklearn_models = [
    ('SVM', svm_model),
    ('KNN', knn_model),
    ('Decision Tree', dt_model),
    ('AdaBoost', ada_model),
    ('Bagging', bagging_model),
    ('Gradient Boosting', gradient_boosting_model),
    ('Random Forest', rf_model),
    ('Extra Trees', extra_trees_model),
    ('Logistic Regression', log_reg),
    ('MLP', mlp_model),
]

# (split name, labels, dense TF-IDF matrix, CNN-shaped matrix, batched dataset)
roc_splits = [
    ('Validation', valid_y, valid_x_tfidf, valid_x_tfidf_reshaped, valid_dataset),
    ('Test', test_y, test_x_tfidf, test_x_tfidf_reshaped, test_dataset),
]

for split_name, split_labels, dense_x, cnn_x, batched_dataset in roc_splits:
    plt.figure(figsize=(12, 8))
    plt.title(f'{split_name} ROC Curves')
    plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line

    # Keras models output one sigmoid probability per sample, shape (n, 1).
    keras_outputs = [
        ('CNN', model_CNN.predict(cnn_x)),
        ('LSTM', model_LSTM.predict(batched_dataset)),
        ('RNN', model_RNN.predict(batched_dataset)),
    ]
    for model_name, probabilities in keras_outputs:
        plot_roc_curve(split_labels, probabilities.flatten(), f'{model_name} {split_name}')

    for model_name, classifier in roc_sklearn_models:
        positive_scores = classifier.predict_proba(dense_x)[:, 1]
        plot_roc_curve(split_labels, positive_scores, f'{model_name} {split_name}')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

# Horizontal bar chart of test accuracy for every trained model.
#
# The previous version of this cell read test_pred_prob_cnn, test_pred_prob_svm,
# etc., none of which is defined anywhere in the notebook. Predictions are now
# computed here from the fitted models.
y_true = test_y

# Keras models output sigmoid probabilities, thresholded at 0.5 as in their
# training cells. scikit-learn models use predict(), the same call their own
# cells use for the printed "Test Accuracy".
test_predictions = [
    ('CNN', (model_CNN.predict(test_x_tfidf_reshaped).flatten() > 0.5).astype(int)),
    ('LSTM', (model_LSTM.predict(test_dataset).flatten() > 0.5).astype(int)),
    ('RNN', (model_RNN.predict(test_dataset).flatten() > 0.5).astype(int)),
    ('SVM', svm_model.predict(test_x_tfidf)),
    ('KNN', knn_model.predict(test_x_tfidf)),
    ('Decision Tree', dt_model.predict(test_x_tfidf)),
    ('AdaBoost', ada_model.predict(test_x_tfidf)),
    ('Bagging', bagging_model.predict(test_x_tfidf)),
    ('Gradient Boosting', gradient_boosting_model.predict(test_x_tfidf)),
    ('Random Forest', rf_model.predict(test_x_tfidf)),
    ('Extra Trees', extra_trees_model.predict(test_x_tfidf)),
    ('Logistic Regression', log_reg.predict(test_x_tfidf)),
    ('Multi-Layer Perceptron', mlp_model.predict(test_x_tfidf)),
]

# Model names for the bar graph and the matching accuracies, in the same order.
model_names = [model_name for model_name, _ in test_predictions]
test_accuracies = [accuracy_score(y_true, predicted) for _, predicted in test_predictions]

# Create the bar graph
plt.figure(figsize=(12, 6))
plt.barh(model_names, test_accuracies, color='skyblue')
plt.xlabel('Test Accuracy')
plt.title('Test Accuracies of Different Models')
plt.xlim([0, 1])  # Set x-axis limits to 0 to 1 for accuracy
plt.grid(axis='x')

# Display the accuracy values on the bars
for i, v in enumerate(test_accuracies):
    plt.text(v + 0.01, i, f"{v:.2f}", color='black', va='center')

plt.show()