In [None]:
pip install --upgrade tensorflow

In [None]:
pip install tensorflow_text==2.15

In [None]:
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, Dense, Dropout, Lambda, Subtract, LSTM, Embedding, Bidirectional,Flatten,GlobalAveragePooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.initializers import Constant
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import regularizers
from keras.layers import Dense, LSTM, Dropout, GRU, Bidirectional
from keras.optimizers import SGD

In [None]:
import pandas as pd
import numpy as np
import nltk
import string

In [None]:
import tensorflow as tf
import tensorflow_text as text

In [None]:
# Fetch the NLTK stop-word corpus that preprocess_text() reads
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Fetch the 'punkt_tab' tokenizer data (presumably what nltk.word_tokenize(),
# called in preprocess_text(), loads -- confirm against the installed NLTK version)
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Training data: put the name of the counterfactual-generated file here
data = pd.read_csv("SourceToTarget.csv")

In [None]:
# Number of rows loaded into `data` (first entry of the DataFrame's shape)
num_lines = data.shape[0]

# Show the row count
print("Number of lines in the CSV file: ", num_lines)

In [None]:
# Target file: the reviews whose labels have to be predicted
df = pd.read_csv("TargetReviewFile.csv")

In [None]:
# Preview the first five rows of the training frame
data.head(n=5)

In [None]:
#If the column names are not 'Label' and 'Review'

# Current column names in the training file and the names the later cells use
old_label_name, new_label_name = 'label', 'Label'
old_review_name, new_review_name = 'generated_text', 'Review'

# Rename both columns in one in-place call
data.rename(
    columns={
        old_label_name: new_label_name,
        old_review_name: new_review_name,
    },
    inplace=True,
)

In [None]:
# Build a balanced, shuffled evaluation sample from the target file and make sure
# both frames use 0/1 labels.
#
# Labels are normalised (-1 -> 0) BEFORE sampling, so this cell also runs when the
# files already use 0/1 labels. Sampling directly on [-1, 1] raised on such files
# because the -1 group was empty. For files labelled -1/1 the selected rows and
# their order are unchanged (same groups, same random_state).
SAMPLES_PER_CLASS = 100

# Work on a relabelled copy; the loaded `df` itself is left untouched.
df_binary = df.assign(Label=df['Label'].replace(-1, 0))

test_data = (
    pd.concat([
        df_binary[df_binary['Label'] == label].sample(SAMPLES_PER_CLASS, random_state=42)
        for label in [0, 1]
    ])
    .sample(frac=1, random_state=42)  # shuffle the two per-class blocks together
    .reset_index(drop=True)
)
data['Label'] = data['Label'].replace(-1, 0)

In [None]:
# Number of rows currently in `data` (first entry of the DataFrame's shape)
num_lines = data.shape[0]

# Show the row count
print("Number of lines in the CSV file: ", num_lines)

In [None]:
# Number of rows in the sampled evaluation frame `test_data`
num_lines = test_data.shape[0]

# Show the row count (the message text is shared with the cells that count `data`)
print("Number of lines in the CSV file: ", num_lines)

In [None]:
# Cache for the stop-word set so the corpus is read once, not once per review.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Lower-case a review, tokenise it and drop stop words and punctuation tokens.

    Args:
        text: The raw review. Non-string values are converted with ``str()``;
            ``None`` and float NaN (e.g. an empty CSV cell) are treated as an
            empty review.

    Returns:
        The remaining tokens joined by single spaces ('' for a missing review).
    """
    # Without this guard a missing value would become the literal token "nan"/"none".
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase and tokenize into words
    words = nltk.word_tokenize(str(text).lower())

    stop_words = _english_stop_words()
    punctuation_chars = set(string.punctuation)

    kept_words = [
        word for word in words
        if word not in stop_words
        # A token counts as punctuation when EVERY character is punctuation.
        # The previous test (`word not in string.punctuation`) was a substring
        # check against the punctuation string, so multi-character tokens such
        # as "...", "--", "``" and "''" were never removed.
        and not all(char in punctuation_chars for char in word)
    ]

    # Join the words back into a single string
    return ' '.join(kept_words)

In [None]:
# Clean the review text of the training frame and of the evaluation sample
data['Review'] = data['Review'].map(preprocess_text)
test_data['Review'] = test_data['Review'].map(preprocess_text)

In [None]:
# Training inputs (review text) and targets (labels)
x_train, y_train = data['Review'], data['Label']

In [None]:
# Evaluation inputs (review text) and targets (labels) from the sampled target data
x_test, y_test = test_data['Review'], test_data['Label']

**Embedding and Training**

In [None]:
pip install tensorflow_hub

In [None]:
import tensorflow_hub as hub

# Create the two TF-Hub layers once at notebook level; build_model() calls them.
# Text preprocessing model for uncased English BERT
bert_preprocess = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)
# BERT encoder (L-12, H-768, A-12 per the handle)
bert_encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
)

In [None]:
def build_model():
  """Assemble and compile the BERT + bidirectional-LSTM binary classifier.

  A scalar string input named 'text' is passed through the notebook-level
  ``bert_preprocess`` and ``bert_encoder`` layers. The encoder's
  ``pooled_output`` is reshaped to ``(1, 768)``, fed to a bidirectional LSTM
  (128 units), then to dropout (rate 0.2) and a single sigmoid unit.

  Returns:
      A ``Model`` compiled with binary cross-entropy loss, the 'adam'
      optimizer and the 'Precision', 'accuracy' and 'Recall' metrics.
  """
  text_input = Input(shape=(), dtype=tf.string, name='text')
  encoder_outputs = bert_encoder(bert_preprocess(text_input))

  # The Reshape turns the pooled vector into a length-1 sequence for the LSTM.
  pooled_sequence = tf.keras.layers.Reshape((1, 768))(encoder_outputs['pooled_output'])
  lstm_features = Bidirectional(LSTM(128, return_sequences=False))(pooled_sequence)
  dropped = Dropout(0.2, name="dropout")(lstm_features)
  probability = Dense(1, activation='sigmoid', name='output')(dropped)

  model = Model(inputs=[text_input], outputs=[probability])
  model.compile(
      loss='binary_crossentropy',
      optimizer='adam',
      metrics=['Precision', 'accuracy', 'Recall'],
  )
  return model

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Predicted probabilities below this value are labelled 0, everything else 1.
DECISION_THRESHOLD = 0.6

#Define k-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_no = 1
accuracies, precisions, recalls = [],[],[]

# Loop through each fold
for train_index, val_index in kf.split(x_train):
    print(f"\nTraining on Fold {fold_no}...")

    # Split the data.
    # KFold yields POSITIONAL indices, so select rows with .iloc. Plain [] on a
    # Series looks the integers up as index LABELS, which only gives the right
    # rows while the Series carries a default 0..n-1 index.
    x_train_fold, x_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Build and train a fresh model for this fold
    model = build_model()
    history = model.fit(x_train_fold, y_train_fold,
                        epochs=10,  # Use fewer epochs for demonstration
                        verbose=2,
                        validation_data=(x_val_fold, y_val_fold),
                        batch_size=256)

    # Evaluate on the validation set: flatten the predictions to one value per
    # review and threshold them in a single vectorised step.
    y_val_pred = model.predict(x_val_fold)
    y_val_pred_final = np.where(np.ravel(y_val_pred) < DECISION_THRESHOLD, 0, 1)

    accuracy = accuracy_score(y_val_fold, y_val_pred_final)
    precision = precision_score(y_val_fold, y_val_pred_final)
    recall = recall_score(y_val_fold, y_val_pred_final)

    print(f"Fold {fold_no} Results - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}")

    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)

    fold_no += 1

In [None]:
# Mean and mean absolute deviation of a list of metric values
def mean_deviation(values):
    """Return ``(mean, mean absolute deviation from that mean)`` for ``values``."""
    samples = np.asarray(values)
    centre = samples.mean()
    spread = np.abs(samples - centre).mean()
    return centre, spread

# Summarise each metric as (mean, mean absolute deviation) over the folds
accuracy_mean, accuracy_dev = mean_deviation(accuracies)
precision_mean, precision_dev = mean_deviation(precisions)
recall_mean, recall_dev = mean_deviation(recalls)

# Display average metrics and mean deviation.
# The fold count comes from the number of collected results rather than a
# hard-coded 5, so the header stays correct if n_splits is changed.
print(f"\nAverage Results across {len(accuracies)} folds:")
print(f"Accuracy: {accuracy_mean:.4f} ± {accuracy_dev:.4f}")
print(f"Precision: {precision_mean:.4f} ± {precision_dev:.4f}")
print(f"Recall: {recall_mean:.4f} ± {recall_dev:.4f}")