In [1]:
import pandas as pd
import sklearn as sk
import sklearn.model_selection

In [2]:
# Read the essay corpus from disk. Later cells use its `text` column as the
# model input and its `generated` column as the label.
data = pd.read_csv(filepath_or_buffer='../_data/AI_Human.csv')

# Leave the frame as the final expression so the notebook renders a preview of it.
data

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0.0
1,Transportation is a large necessity in most co...,0.0
2,"""America's love affair with it's vehicles seem...",0.0
3,How often do you ride in a car? Do you drive a...,0.0
4,Cars are a wonderful thing. They are perhaps o...,0.0
...,...,...
487230,Tie Face on Mars is really just a big misunder...,0.0
487231,The whole purpose of democracy is to create a ...,0.0
487232,I firmly believe that governments worldwide sh...,1.0
487233,I DFN't agree with this decision because a LFT...,0.0


In [3]:
# `temp` is another name for the loaded frame (plain assignment, not a copy).
temp = data

import tensorflow as tf

# Fixed seed so the 80/20 split below selects the same rows on every
# Restart & Run All; without it each run trains and scores on different data.
SPLIT_SEED = 42

# Inputs are the raw essay texts, targets are the `generated` labels.
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(
    temp['text'],
    temp['generated'],
    test_size=0.2,
    random_state=SPLIT_SEED,
)

2024-03-18 15:02:34.051966: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-18 15:02:34.097892: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
import multiprocessing as mp
from functools import partial

# Define a function to tokenize and pad the text data
def tokenize_and_pad(texts, tokenizer, padding='post', maxlen=None):
    """Encode texts as integer sequences and pad them to a common length.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        padding: Side on which padding is added, ``'pre'`` or ``'post'``.
        maxlen: Length of every output row. ``None`` (the default) pads to the
            longest sequence found in ``texts``. When a value is given,
            ``pad_sequences`` also truncates longer sequences to that length
            (from the front, per its default ``truncating`` setting).

    Returns:
        The padded integer matrix returned by ``pad_sequences``, one row per text.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    return tf.keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=maxlen, padding=padding
    )

# Tokenize the words from scikit-learn train test split data.
# The vocabulary is learned from the training texts only.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train)

# The helper is called directly instead of through a multiprocessing pool.
# Mapping over a one-element list ([X_train]) submits a single task, so no work
# was ever split across workers; the pool only added the cost of pickling the
# whole split and the tokenizer to a child process and the result back.
#
# NOTE: this cell replaces the raw-text X_train / X_test with their padded
# integer matrices, so the split cell must be re-run before re-running this one.
X_train = tokenize_and_pad(X_train, tokenizer, padding='post')

# Give the test matrix the same width as the training matrix. Padding each
# split to its own longest text would produce two different sequence lengths.
X_test = tokenize_and_pad(
    X_test, tokenizer, padding='post', maxlen=X_train.shape[1]
)

In [None]:
# Build the RNN classifier: embedding -> two stacked bidirectional LSTMs ->
# dense hidden layer -> single sigmoid output unit.

# One embedding row per entry of the fitted tokenizer's word index, plus one
# extra row (index 0 is the value pad_sequences fills with by default).
vocab_size = len(tokenizer.word_index) + 1
keras_layers = tf.keras.layers

layer_stack = [
    keras_layers.Embedding(vocab_size, 64),
    # The first recurrent layer returns the full sequence so the second one can consume it.
    keras_layers.Bidirectional(keras_layers.LSTM(64, return_sequences=True)),
    keras_layers.Bidirectional(keras_layers.LSTM(32)),
    keras_layers.Dense(64, activation='relu'),
    keras_layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.models.Sequential(layer_stack)

# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked as a metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()



In [None]:
# Draw the model's layer diagram, annotating each layer with its shapes
tf.keras.utils.plot_model(
    model,
    show_shapes=True,
)

In [None]:
# Fit the model for a single epoch, reporting loss/accuracy on the held-out
# split as validation data; the returned object is kept in `history`.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=1,
    verbose=1,
)

[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m303s[0m 1s/step - accuracy: 0.9966 - loss: 0.0146 - val_accuracy: 0.9980 - val_loss: 0.0039


In [None]:
# Score the trained model on the held-out split; the cell displays the
# returned values (loss followed by the compiled accuracy metric).
model.evaluate(x=X_test, y=y_test)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 249ms/step - accuracy: 0.9990 - loss: 0.0028


[0.0038892326410859823, 0.9980000257492065]

In [None]:
# Save the model in the model folder, named with the current date and time
import datetime
import os

# Create the target folder up front so the save does not depend on it
# already existing; exist_ok makes re-running this cell harmless.
os.makedirs('_models', exist_ok=True)

# Timestamped file name (YYYYMMDD-HHMMSS) so successive runs do not overwrite each other.
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
model.save(f'_models/{timestamp}.keras')