In [None]:
!pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow_text
Successfully installed tensorflow_text-2.15.0


In [None]:
!pip install PrettyPrinter

Collecting PrettyPrinter
  Downloading prettyprinter-0.18.0-py2.py3-none-any.whl (48 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/48.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.0/48.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting colorful>=0.4.0 (from PrettyPrinter)
  Downloading colorful-0.5.6-py2.py3-none-any.whl (201 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.4/201.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: colorful, PrettyPrinter
Successfully installed PrettyPrinter-0.18.0 colorful-0.5.6


In [None]:
import os
import random
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import string
import tensorflow_text as tf_text

In [None]:
import joblib

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pprint
pp = pprint.PrettyPrinter(depth=4)

def get_metrics(model_name, y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = cm[1][1] / (cm[1][1]+cm[0][1])
    recall = cm[1][1] / (cm[1][1]+cm[1][0])
    f1_score = 2*(precision*recall)/(precision+recall)
    print(cm)
    return {"Model": model_name,
            "Accuracy":accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1_score": f1_score}

In [None]:
random_seed= 5

os.environ['PYTHONHASHSEED']=str(random_seed)
random.seed(random_seed)
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

In [None]:
train = pd.read_csv('/content/train_test_dataset.csv')
train.head()

Unnamed: 0,id,prompt,text,generated
0,0,0,Some schools in United States ofter classes fr...,0
1,1,0,"Four-day work week, a remarkable idea to conse...",0
2,2,0,Students and their families should consider an...,0
3,3,0,Agree you will never grow if something beyond ...,0
4,4,0,I think our character traits are formed by inf...,0


In [None]:
train_df, val_df = train_test_split(train, test_size=0.3, random_state=222)
batch_size = 32

raw_train_ds = tf.data.Dataset.from_tensor_slices((train_df['text'].values, train_df['generated'].values)).batch(batch_size)
raw_val_ds = tf.data.Dataset.from_tensor_slices((val_df['text'].values, val_df['generated'].values)).batch(batch_size)

In [None]:
max_features = 75000
embedding_dim = 64
sequence_length = 512*2

def tf_lower_and_split_punct(text):
    text = tf_text.normalize_utf8(text, 'NFKD')
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
    text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
    text = tf.strings.strip(text)
    text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
    return text


# Text vectorization layer
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=max_features,
    ngrams = (3,5),
    output_mode="int",
    output_sequence_length=sequence_length,
    pad_to_max_tokens=True
)

text_ds = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)

def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)

In [None]:
from tensorflow import keras
from keras.layers import TextVectorization, Embedding, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from keras import Model, Input


class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"), tf.keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [None]:
inputs = Input(shape=(sequence_length,), dtype="int64")
x = Embedding(max_features, embedding_dim)(inputs)
x = Bidirectional(LSTM(32, return_sequences=True))(x)
transformer_block = TransformerBlock(embedding_dim, 2, 32)
x = transformer_block(x)
x = Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation="relu")(x)
x = Dropout(0.5)(x)
predictions = Dense(1, activation="sigmoid", name="predictions")(x)

model = Model(inputs=inputs, outputs=predictions)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1024)]            0         
                                                                 
 embedding (Embedding)       (None, 1024, 64)          4800000   
                                                                 
 bidirectional (Bidirection  (None, 1024, 64)          24832     
 al)                                                             
                                                                 
 transformer_block (Transfo  (None, 1024, 64)          37664     
 rmerBlock)                                                      
                                                                 
 conv1d (Conv1D)             (None, 340, 128)          57472     
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0     

In [None]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["binary_accuracy"])

epochs = 2
model.fit(train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7f773028afe0>

In [None]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1024)]            0         
                                                                 
 embedding (Embedding)       (None, 1024, 64)          4800000   
                                                                 
 bidirectional (Bidirection  (None, 1024, 64)          24832     
 al)                                                             
                                                                 
 transformer_block (Transfo  (None, 1024, 64)          37664     
 rmerBlock)                                                      
                                                                 
 conv1d (Conv1D)             (None, 340, 128)          57472     
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0     

In [None]:
model.save("RNN_Final", save_format="tf")

In [16]:
val_df2 = pd.read_csv('/content/validation_dataset.csv')

In [17]:
test_text = val_df2['text'].values
vectorized_test_text = vectorize_layer(test_text)
predictions = model.predict(vectorized_test_text)
val_df2['%predict'] = predictions
print(val_df2[['id','text', 'generated','%predict']])

          id                                               text  generated  \
0          0  There are alot reasons to keep our the despise...          0   
1          1  Driving smart cars that drive by themself has ...          0   
2          2  Dear Principal,\n\nI believe that students at ...          0   
3          3  Dear Principal,\n\nCommunity service should no...          0   
4          4  My argument for the development of the driverl...          0   
...      ...                                                ...        ...   
33254  33254  Are driverless cars really necessary? Most of ...          0   
33255  33255  "Oh man I didn't make the soccer team!", yelle...          0   
33256  33256  I believe that using this technology could be ...          0   
33257  33257  Texting & Driving\n\nUsing your phone while dr...          0   
33258  33258  Dear Principal,\n\nI have been really good thi...          0   

       %predict  
0      0.000038  
1      0.000352  
2      0.

In [18]:
val_df2[['id','text', 'generated','%predict']].set_index('id').to_csv("result.csv")

In [19]:
y_test = val_df2['generated']
y_pred = round(val_df2['%predict'])
metrics = get_metrics("LSTM+Transformer", y_test, y_pred)
pp.pprint(metrics)

[[25989     7]
 [   17  7246]]
{'Accuracy': 0.9992783908115097,
 'F1_score': 0.9983466519702399,
 'Model': 'LSTM+Transformer',
 'Precision': 0.9990348821177444,
 'Recall': 0.9976593694065813}


In [20]:
input = ["As of my last knowledge update in January 2022, the President of the United States is Joe Biden. He assumed office on January 20, 2021, succeeding Donald Trump. Joe Biden, a seasoned politician with decades of experience, served as the Vice President under President Barack Obama from 2009 to 2017. Born on November 20, 1942, in Scranton, Pennsylvania, Biden has dedicated much of his career to public service, initially as a U.S. Senator from Delaware for 36 years before ascending to the presidency. His presidency is marked by a focus on addressing pressing issues such as the COVID-19 pandemic, climate change, racial justice, and economic recovery. Biden's administration has also emphasized rebuilding international alliances and fostering unity within the nation. It's important to note that developments may have occurred since my last update, so it's advisable to verify the current President of the United States for the latest information."]

In [21]:
vectorized_test_text_1 = vectorize_layer(input)
prediction = model.predict(vectorized_test_text_1)
print(prediction)

[[0.9999998]]


In [23]:
!zip -r /content/file.zip /content/RNN_Final

  adding: content/RNN_Final/ (stored 0%)
  adding: content/RNN_Final/fingerprint.pb (stored 0%)
  adding: content/RNN_Final/variables/ (stored 0%)
  adding: content/RNN_Final/variables/variables.index (deflated 70%)
  adding: content/RNN_Final/variables/variables.data-00000-of-00001 (deflated 5%)
  adding: content/RNN_Final/saved_model.pb (deflated 90%)
  adding: content/RNN_Final/keras_metadata.pb (deflated 91%)
  adding: content/RNN_Final/assets/ (stored 0%)


In [24]:
from google.colab import files
files.download("/content/file.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>