## TextVectorization in the model itself

In [1]:
# Import libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout, MultiHeadAttention, LayerNormalization, Input
from keras.layers import TextVectorization
from keras.models import Sequential
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer,CountVectorizer
#from keras_nlp.layers import TokenAndPositionEmbedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import roc_curve,roc_auc_score,accuracy_score
from keras import Model
from tensorflow.keras.layers import Bidirectional, LSTM, GRU, SpatialDropout1D
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_numeric, remove_stopwords, strip_short, stem_text
import gc

In [2]:
# Read the input files: the original training essays, three extra essay
# datasets used to enlarge the training set, the prompt table and the test essays.
# NOTE(review): prompt_id is not referenced again in the cells visible here.
train = pd.read_csv('data/train_essays.csv')
additional_essays = pd.read_csv('data/additional_generated_essays.csv')
kaggle_essays = pd.read_csv('data/ai_generated_train_essays.csv')
kaggle_essays_gpt = pd.read_csv('data/ai_generated_train_essays_gpt-4.csv')
prompt_id = pd.read_csv('data/train_prompts.csv')
test_df = pd.read_csv('data/test_essays.csv')
df_zach = pd.read_csv('data/zach_dataset.csv')

In [3]:
# Stack all essay sources row-wise into a single training frame with a fresh
# 0..n-1 index. The 'id' column is removed from three of the sources; the
# other two frames are concatenated unchanged.
train_c_n = pd.concat(
    [
        train.drop(columns=['id']),
        additional_essays,
        kaggle_essays.drop(columns=['id']),
        kaggle_essays_gpt.drop(columns=['id']),
        df_zach,
    ],
    ignore_index=True,
)
#train_c_n

In [4]:
# The rows of df_zach are already part of train_c_n, so drop the standalone name.
del df_zach

In [5]:
# Original training data plus the additional datasets: the 'text' column is
# the model input and the 'generated' column is the label.
train_data = train_c_n['text']
train_labels = train_c_n['generated']

In [6]:
# Define hyperparameters
vocab_size = 30000 # Vocabulary cap handed to TextVectorization as max_tokens
maxlen = 1024 # Token length every vectorized essay is brought to (output_sequence_length)
embed_dim = 64 # Embedding size for each token
num_heads = 4 # Number of attention heads
ff_dim = 32 # Hidden layer size in feed forward network inside transformer

In [7]:
def clean_text(text):
    """Return `text` with stopwords, punctuation and digits filtered out.

    The gensim filters are applied in a fixed order: stopword removal first,
    then punctuation stripping, then numeric stripping.
    """
    return strip_numeric(strip_punctuation(remove_stopwords(text)))

In [8]:
%%time
# Clean every essay once, up front. The result replaces train_data, so
# re-running this cell cleans the already-cleaned text again.
train_data = train_data.apply(clean_text)

CPU times: total: 2min 2s
Wall time: 2min 38s


In [9]:
# Vectorization layer that is later placed inside the model, so the model takes raw strings.
# Each essay becomes maxlen integer ids drawn from a vocabulary of at most vocab_size entries.
# NOTE(review): ngrams is a tuple (2,3,4,5); per the Keras docs a tuple selects exactly
# those n-gram sizes, which would leave single words out of the vocabulary — confirm intended.
vectorize_layer = TextVectorization(output_mode='int', output_sequence_length=maxlen, max_tokens=vocab_size, ngrams=(2,3,4,5))

In [10]:
# Draw a random 10% sample of the essays (test_size=0.9 leaves 10% in X_train)
# for fitting the vectorizer vocabulary. random_state pins the sample so the
# learned vocabulary is the same after a kernel restart. The other three
# outputs are not used for adapting and are overwritten by the later
# train/validation split.
X_train, X_val, y_train, y_val = train_test_split(train_data, train_labels, test_size=0.9, random_state=42)

In [11]:
#train_data_tf = tf.data.Dataset.from_tensor_slices(X_train.values)

In [12]:
%%time
#vectorize_layer.adapt(train_data_tf.batch(512))
# Build the vocabulary from whatever X_train currently holds.
vectorize_layer.adapt(X_train)

CPU times: total: 4min 21s
Wall time: 6min 43s


In [13]:
# Split the dataset into training (90%) and validation (10%) sets.
# random_state pins the split so the validation metrics are reproducible, and
# stratify keeps the class ratio of train_labels the same in both parts so
# the validation set always contains both classes.
X_train, X_val, y_train, y_val = train_test_split(
    train_data,
    train_labels,
    test_size=0.1,
    random_state=42,
    stratify=train_labels,
)

In [14]:
# The split outputs are what the remaining cells use; drop the full-size names.
del train_data, train_c_n

In [15]:
# Run a garbage-collection pass right after the deletions above.
gc.collect()

2071

In [16]:
# Define TransformerEncoder layer
class TransformerEncoder(layers.Layer):
    """One transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual addition and
    layer normalization.

    Args:
        embed_dim: Width of the feed-forward output, which must match the last
            axis of the input for the residual additions to work. Also passed
            to the attention layer as `key_dim`.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Standard Keras layer arguments (e.g. `name`), forwarded to
            the base class so the layer can be rebuilt from its config.

    NOTE(review): the layer does not declare `supports_masking` and passes no
    attention mask, so a mask produced by an upstream
    `Embedding(mask_zero=True)` is probably neither applied here nor forwarded
    to later layers — confirm whether padding should be masked.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)

        # Constructor arguments are kept so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )

        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)

        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs` and return a tensor of the same shape.

        Args:
            inputs: Sequence tensor used as both query and value of the attention.
            training: Forwarded to the dropout layers. Defaults to None so the
                layer can also be called directly without the flag.
        """
        # Self-attention sub-layer with residual connection.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        # Position-wise feed-forward sub-layer with residual connection.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from config."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [17]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position-embedding table; inputs longer
            than this would index past the table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Output size of both embeddings.
        **kwargs: Standard Keras layer arguments (e.g. `name`), forwarded to
            the base class so the layer can be rebuilt from its config.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)

        # Constructor arguments are kept so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed integer token ids `x` and add the embedding of each position."""
        # The length is read from the input itself; it is named seq_len so it
        # is not confused with the `maxlen` constructor argument.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        # The position embeddings have no batch axis and are added by broadcasting.
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from config."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config


In [18]:
# Another explicit garbage-collection pass before the model is built.
gc.collect()

0

In [19]:
# Define model
# The model takes one raw string per example; vectorization happens inside
# the graph, so fit/predict are fed text directly.
text_input = Input(shape=(1,), dtype=tf.string, name='text')
# Commented-out lines in this cell are alternative layers that are not part of the model.
#embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
#x = Embedding(X_p.shape[1], embed_dim, input_length=maxlen)(inputs)
x = vectorize_layer(text_input)
# Id 0 is treated as padding by the embedding (mask_zero=True).
x = Embedding(vocab_size+1, embed_dim, mask_zero=True)(x)
x = SpatialDropout1D(0.2)(x)
transformer_block = TransformerEncoder(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
#x = Bidirectional(LSTM(64, return_sequences=True))(x)
# The bidirectional GRU reduces the sequence to one vector per essay.
x = Bidirectional(GRU(64))(x)
#x = Bidirectional(LSTM(32, dropout=0.2))(x)
x = Dropout(0.1)(x)
#x = GlobalMaxPooling1D()(x)
# Two-way softmax; column 1 is what later cells read as the 'generated' score.
outputs = Dense(2, activation="softmax")(x)

model = Model(inputs=text_input, outputs=outputs)

In [20]:
# Compile the model: Adam at a 1e-3 learning rate, cross-entropy over integer
# class labels, accuracy as the tracked metric. Then print the layer summary.
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text (InputLayer)            [(None, 1)]               0         
_________________________________________________________________
text_vectorization (TextVect (None, 1024)              0         
_________________________________________________________________
embedding (Embedding)        (None, 1024, 64)          1920064   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 1024, 64)          0         
_________________________________________________________________
transformer_encoder (Transfo (None, 1024, 64)          70816     
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               49920     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0     

In [21]:
# Early stopping to prevent overfitting
#early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [22]:
# Keep only the weights of the best epoch. "Best" is judged on validation
# accuracy: training accuracy keeps rising as the model overfits, so
# monitoring it would tend to save the last, most-overfitted epoch.
# 'val_accuracy' is available because fit() is given validation_data and the
# model is compiled with the 'accuracy' metric.
checkpoint_filepath = 'tmp/chkpt'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
)

In [23]:
# Train the model for two epochs on the raw (cleaned) text, evaluating on the
# validation split after each epoch and letting the checkpoint callback save weights.
history = model.fit(X_train, y_train, epochs=2, validation_data=(X_val, y_val), callbacks=[model_checkpoint_callback])

Epoch 1/2
Epoch 2/2


In [24]:
# Class probabilities for the validation essays, one row per essay and one column per class.
y_pred = model.predict(X_val)

In [25]:
# ROC AUC on the validation split, scored with the probability of class 1.
auc_score = roc_auc_score(y_val, y_pred[:,1])
auc_score

0.9751326028038397

In [26]:
# Score the test essays with the same cleaning used for training, store the
# class-1 probability on test_df, and write every column except the prompt id
# and the essay text to the submission file.
X_test_sub = test_df['text'].apply(clean_text)
proba_ = model.predict(X_test_sub)
test_df['generated'] = proba_[:, 1]
test_df_submission = test_df.drop(columns=['prompt_id', 'text']).copy()
test_df_submission.to_csv('submission.csv', index=False)

In [27]:
test_df_submission

Unnamed: 0,id,generated
0,0000aaaa,0.597391
1,1111bbbb,0.597391
2,2222cccc,0.597391
