# GloVe EMBEDDING TECHNIQUE

In [1]:
import numpy as np

def load_glove_embeddings(file_path):
    """Load word vectors from a whitespace-separated GloVe text file.

    Every non-blank line is read as a token followed by its vector
    components. Blank or whitespace-only lines (for example a stray empty
    line at the end of the file) are skipped rather than raising
    ``IndexError``.

    Args:
        file_path: Path of the GloVe text file; it is opened as UTF-8.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array holding
        its coefficients. If a token occurs more than once, the last
        occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be converted to a float.
    """
    embeddings_index = {}
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Nothing on this line, so there is no token to index.
                continue
            word, *coefs = values
            embeddings_index[word] = np.asarray(coefs, dtype="float32")
    return embeddings_index

# Path of the GloVe vectors file; point this at wherever the download was saved.
glove_file_path = "glove.6B.100d.txt"
glove_embeddings = load_glove_embeddings(file_path=glove_file_path)


# TOKENIZE TEXT

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the essay dataset from disk.
dataset = pd.read_csv("AI_Human_Essay.csv")

# 'text' supplies the model inputs and 'generated' (cast to int) the targets.
# 20% of the rows are held out, with a fixed seed so the split is repeatable.
essay_texts = dataset['text']
essay_labels = dataset['generated'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    essay_texts,
    essay_labels,
    test_size=0.2,
    random_state=42,
)

# The tokenizer is fitted on the training texts only; both splits are then
# converted to integer sequences with that same fitted tokenizer.
max_words = 10000  # passed to the tokenizer as num_words; tune as needed
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)


# PAD SEQUENCES

In [3]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Both splits are padded with the same maxlen so they share one input width.
max_sequence_length = 100  # tune as needed
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_sequence_length)
    for sequences in (X_train_sequences, X_test_sequences)
)


# CREATE EMBEDDING MATRIX

In [4]:
# Build the weight matrix for the embedding layer: row i receives the GloVe
# vector of the word whose tokenizer index is i. Rows stay all-zero when the
# word has no GloVe entry.
word_index = tokenizer.word_index
embedding_dim = 100  # must equal the vector length in the GloVe file
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        # This index has no row in the matrix.
        continue
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

# BUILD THE MODEL

In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, BatchNormalization, Dropout
from tensorflow.keras.regularizers import l2

# Architecture: embeddings initialised from embedding_matrix and kept frozen
# (trainable=False), two stacked LSTMs with L2 penalties on their kernel and
# recurrent weights, dropout after each LSTM, then a small dense head ending
# in a single sigmoid unit.
model = Sequential()
for layer in (
    Embedding(
        max_words,
        embedding_dim,
        input_length=max_sequence_length,
        weights=[embedding_matrix],
        trainable=False,
    ),
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(
        256,
        return_sequences=True,
        kernel_regularizer=l2(0.001),
        recurrent_regularizer=l2(0.001),
    ),
    Dropout(0.5),
    LSTM(128, kernel_regularizer=l2(0.001), recurrent_regularizer=l2(0.001)),
    Dropout(0.5),
    Dense(64, activation="relu"),
    BatchNormalization(),
    Dense(1, activation="sigmoid"),
):
    model.add(layer)

# Binary cross-entropy matches the single sigmoid output.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train for 3 epochs; the held-out test split doubles as the validation set.
model.fit(
    X_train_padded,
    y_train,
    validation_data=(X_test_padded, y_test),
    epochs=3,
)




Epoch 1/3
[1m12181/12181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5317s[0m 436ms/step - accuracy: 0.8920 - loss: 0.3980 - val_accuracy: 0.9496 - val_loss: 0.1888
Epoch 2/3
[1m12181/12181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4890s[0m 397ms/step - accuracy: 0.9500 - loss: 0.1801 - val_accuracy: 0.9626 - val_loss: 0.1449
Epoch 3/3
[1m12181/12181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4077s[0m 335ms/step - accuracy: 0.9539 - loss: 0.1708 - val_accuracy: 0.9640 - val_loss: 0.1456


<keras.src.callbacks.history.History at 0x14d81da2990>

In [6]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Score the test split, then round each predicted probability to a 0/1 label.
y_pred_probs = model.predict(X_test_padded)
y_pred = np.round(y_pred_probs).astype(int)

# Print the classification report first, then the confusion matrix, each
# under its own heading.
for heading, metric in (
    ("Classification Report:", classification_report),
    ("Confusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))


[1m3046/3046[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m341s[0m 112ms/step
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97     61112
           1       0.96      0.94      0.95     36335

    accuracy                           0.96     97447
   macro avg       0.96      0.96      0.96     97447
weighted avg       0.96      0.96      0.96     97447

Confusion Matrix:
[[59679  1433]
 [ 2071 34264]]


# Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      3539
           1       0.98      0.96      0.97      2290

    accuracy                           0.98      5829
   macro avg       0.98      0.97      0.98      5829
weighted avg       0.98      0.98      0.98      5829

# Confusion Matrix:
[[3496   43]
 [  91 2199]] 

In [7]:
# Score the test split again, labelling with an explicit "> 0.5" cut-off.
y_pred_prob = model.predict(X_test_padded)
y_pred = np.greater(y_pred_prob, 0.5).astype('int32')

# Heading and result are emitted on separate lines via sep="\n".
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


[1m3046/3046[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m336s[0m 110ms/step
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97     61112
           1       0.96      0.94      0.95     36335

    accuracy                           0.96     97447
   macro avg       0.96      0.96      0.96     97447
weighted avg       0.96      0.96      0.96     97447

Confusion Matrix:
[[59679  1433]
 [ 2071 34264]]


In [8]:
def predict_text_origin(model, tokenizer, max_sequence_length, input_text=None):
    """Classify one piece of text as AI- or human-written.

    Args:
        model: Fitted model whose ``predict`` yields one probability for the
            single padded input row.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        max_sequence_length: Value passed to ``pad_sequences`` as ``maxlen``.
        input_text: Text to classify. When ``None`` (the default) the user is
            prompted on standard input.

    Returns:
        str: ``"AI-generated"`` when the predicted probability is at least
        0.5, otherwise ``"Human-generated"``. The label is also printed.
    """
    if input_text is None:
        # Interactive mode: ask the user for the text.
        print("Enter the text to classify:")
        input_text = input()

    # Encode and pad the text the same way the training data was prepared.
    input_sequence = tokenizer.texts_to_sequences([input_text])
    input_padded = pad_sequences(input_sequence, maxlen=max_sequence_length)

    prediction = model.predict(input_padded)
    # Reduce the prediction to a plain float so the comparison below does not
    # depend on the truth value of a one-element array.
    probability = float(np.asarray(prediction).ravel()[0])

    predicted_label = "AI-generated" if probability >= 0.5 else "Human-generated"

    print(predicted_label)
    # Return the label so callers that assign the result get something usable.
    return predicted_label

# Example usage: classify a single text typed in at the prompt.
predicted_class = predict_text_origin(
    model=model,
    tokenizer=tokenizer,
    max_sequence_length=max_sequence_length,
)



Enter the text to classify:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
AI-generated


In [10]:
# Save the trained weights under models/. The directory is created first
# because save_weights is not expected to create missing parent folders.
import os

weights_path = 'models/model_updated_weight_ai.weights.h5'
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
model.save_weights(weights_path)

