In [1]:
# Mount Google Drive (Colab only) so files under /content/drive become readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os

# Make the Drive data folder the working directory (os.chdir is the Python
# counterpart of `cd` in a terminal).
# NOTE(review): the loading code visible in this notebook builds absolute paths
# from `path_data`, so this chdir looks redundant — confirm before removing.
os.chdir('/content/drive/MyDrive/Colab Notebooks/raw_data')

In [3]:
pip install tensorflow_text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text  # Imports TF ops for preprocessing

from tensorflow.keras import layers

from tensorflow.keras import callbacks

In [5]:
# Folder on the mounted Drive that holds the raw CSV files.
path_data = Path('/content/drive/MyDrive') / 'Colab Notebooks' / 'raw_data'

In [6]:
# function to load data depending on size; loads human and AI written text
def load_data(source: str="xl-1542M",
              truncation: bool=True,
              n_rows: int=500_000,
              splits: tuple[str, ...]=("train", "valid", "test")) -> dict[str, pd.DataFrame]:
    '''Load human- and AI-written texts into a dictionary of pandas DataFrames.

    Each returned DataFrame has two columns: "text" and "AI"
    (1 = text generated by the GPT-2 model, 0 = human-written webtext).

    Parameters
    ----------
    source: name of the GPT-2 model whose outputs are loaded (file-name prefix).
    truncation: if True, the Top-K 40 files ("<source>-k40.<split>.csv") are
        used, otherwise the plain "<source>.<split>.csv" files.
    n_rows: total number of rows requested per split. Half of them
        (n_rows // 2) are read from each class so the result is balanced;
        an odd value is therefore rounded down by one. Use a small value to
        test the code quickly.
    splits: which splits to read. Defaults to all three; pass e.g. ("train",)
        to avoid reading CSV files that are not needed.

    Returns
    -------
    dict mapping each requested split name to its DataFrame. Human-written rows
    come first, generated rows second. The index is NOT reset, so index labels
    repeat across the two halves; call reset_index(drop=True) if needed.
    '''
    # A bare string would otherwise be iterated character by character.
    if isinstance(splits, str):
        splits = (splits,)

    rows_per_class = n_rows // 2  # same cap for both classes -> balanced dataset
    suffix = "-k40" if truncation else ""

    final_data = {}
    for split in splits:
        fake = pd.read_csv(path_data / f"{source}{suffix}.{split}.csv",
                           usecols=["text"], nrows=rows_per_class)
        fake["AI"] = 1 # AI written

        true = pd.read_csv(path_data / f"webtext.{split}.csv",
                           usecols=["text"], nrows=rows_per_class)
        true["AI"] = 0 # not AI written

        final_data[split] = pd.concat([true, fake])

    return final_data

In [20]:
# Training split. load_data reads n_rows // 2 rows per class, i.e. up to 50_000
# human-written (AI=0) and 50_000 generated (AI=1) texts. The index is reset
# because the two concatenated halves each carry their own 0-based index.
# NOTE(review): the saved X_train / y_train outputs further down report 200000
# rows, which does not match n_rows=100_000 — presumably left from an earlier run.
data_train = load_data(n_rows=100_000)["train"].reset_index(drop=True)


In [21]:
# load_data reads every split from disk on each call, so call it once and pick
# both evaluation splits from the same result instead of loading twice.
data_eval = load_data(n_rows=20_000)
data_val = data_eval["valid"].reset_index(drop=True)
data_test = data_eval["test"].reset_index(drop=True)

In [22]:
# Split the training frame into model input (raw text) and target (AI flag).
X_train, y_train = data_train["text"], data_train["AI"]

In [23]:
# Split the validation frame into model input (raw text) and target (AI flag).
X_val, y_val = data_val["text"], data_val["AI"]

In [24]:
# Test features (raw text) and targets (AI flag). The labels must go into
# y_test: assigning them to X_test would overwrite the texts and leave y_test
# undefined.
X_test=data_test["text"]
y_test=data_test["AI"]

In [25]:
# (inputs, targets) pair, later passed to model.fit as validation_data.
val_set=(X_val,y_val)

In [13]:
# Quick look at the training texts (the Series display is abbreviated).
X_train

0         These girlfriends deserves a special mention f...
1         LeSean McCoy going through warmups with first ...
2         Tom Curran has been called up to England's Ash...
3         We'll have turkey on the table Thursday but, a...
4         The 1945 Sinkings of the Cap Arcona and the Th...
                                ...                        
199995    There are many people who believe in the free ...
199996    The first thing to realize is that, historical...
199997    How Much Money Do You Get on Each Piece of You...
199998    I got the opportunity to take my new iPhone in...
199999    A new study reveals that "the number of black ...
Name: text, Length: 200000, dtype: object

In [14]:
# Quick look at the training labels: 0 = human-written, 1 = AI-written.
y_train

0         0
1         0
2         0
3         0
4         0
         ..
199995    1
199996    1
199997    1
199998    1
199999    1
Name: AI, Length: 200000, dtype: int64

Vectorizing


In [15]:
# Build the text -> BERT graph with the Keras functional API.
# Input: one raw string per example (scalar shape, string dtype).
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
# TF Hub preprocessing layer for cased English BERT; its output feeds the encoder.
preprocessor = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3")
encoder_inputs = preprocessor(text_input)
# Cased English BERT encoder (L-12, H-768, A-12 per the hub handle).
# trainable=False keeps the pretrained weights frozen during training.
encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/4",
    trainable=False)
outputs = encoder(encoder_inputs)
# One vector per text; the model summary further down reports it as (None, 768).
pooled_output = outputs["pooled_output"]
# Presumably the per-token outputs — not used by any cell visible in this notebook.
sequence_output = outputs["sequence_output"]

In [26]:
# Wrap the graph as a model mapping raw strings to the pooled BERT output.
embedding_model = tf.keras.Model(text_input, pooled_output)

In [27]:
# Classifier: the BERT embedding model followed by a small dense head that
# ends in a single sigmoid unit.
model = tf.keras.Sequential([
    embedding_model,
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model_1 (Functional)        (None, 768)               108310273 
                                                                 
 dense_3 (Dense)             (None, 8)                 6152      
                                                                 
 dense_4 (Dense)             (None, 8)                 72        
                                                                 
 dense_5 (Dense)             (None, 1)                 9         
                                                                 
Total params: 108,316,506
Trainable params: 6,233
Non-trainable params: 108,310,273
_________________________________________________________________


In [28]:
# The last Dense layer already applies a sigmoid, so the model outputs
# probabilities, not logits. from_logits must therefore be False; with True the
# loss would apply a second sigmoid to those probabilities and distort both
# the loss values and the gradients.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [30]:
# Stop once the monitored quantity has not improved for 2 consecutive epochs and
# restore the best weights seen. No `monitor` is passed, so the Keras default
# applies (val_loss per the Keras docs — it is computed from validation_data).
es = callbacks.EarlyStopping(patience=2, restore_best_weights=True)

# Train for at most 10 epochs; `history` keeps the per-epoch metrics.
# NOTE(review): the encoder is frozen (trainable=False), yet it is re-run over all
# training texts every epoch — precomputing the embeddings once may be much faster.
history = model.fit(X_train, y_train,  batch_size=32 ,
                    epochs=10,
                    validation_data=val_set,
                    callbacks=[es],
                    verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
