In [None]:
!pip install googletrans==4.0.0-rc1
import joblib
import asyncio
import random
import tensorflow as tf
import pandas as pd
import numpy as np
import requests
import os
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.datasets import fetch_20newsgroups
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers  # Add this import
from nltk.corpus import wordnet
from googletrans import Translator
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import lru_cache

# tf.test.is_gpu_available() is deprecated; listing the physical GPU devices is
# the replacement recommended by the TensorFlow documentation.
print("Is GPU available:", bool(tf.config.list_physical_devices('GPU')))

# **Load data set**

In [None]:
# x_train, y_train, x_test, y_test = joblib.load('/kaggle/input/ai6103-bc/bert_train_test_data.pkl')  #use embedded data

# Fetch the 20 Newsgroups corpus with headers, footers and quoted replies removed
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
x, y = newsgroups.data, newsgroups.target  # texts and their labels

# First split: hold out 20% of the corpus as the test set
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=42,
)

# Second split: carve 20% of the remaining training data out as a validation set
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train,
    test_size=0.2,
    random_state=42,
)

# Report how many samples ended up in each split
print(f"Size of training set: {len(x_train)} samples")
print(f"Size of validation set: {len(x_val)} samples")
print(f"Size of testing set: {len(x_test)} samples")

# Fit a word-level Keras tokenizer on the training texts only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

# Number of distinct words seen, plus one slot for a padding/unknown index
vocab_size = 1 + len(tokenizer.word_index)

print("Vocabulary size:", vocab_size)


# **Build a base model**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, BatchNormalization

# Token sequence length expected by the model and width of the embedding vectors
max_sequence_length = 128
embedding_dim = 768

# Early-stopping callback handed to model.fit() by train_and_evaluate
early_stopping = EarlyStopping(
    monitor='val_accuracy',      # quantity watched: validation accuracy
    patience=20,                 # epochs without improvement tolerated before stopping
    restore_best_weights=True,   # roll back to the weights of the best epoch
)

##model without augmentation
# def create_model(dropout_rate=0.0, use_batch_norm=False, use_regularization=False, l1=0.01, l2=0.01):
#     model = Sequential()
    
#     if use_regularization:
#         model.add(LSTM(128, 
#                        input_shape=(max_sequence_length, embedding_dim), 
#                        return_sequences=False,
#                        kernel_regularizer=regularizers.l1_l2(l1=l1, l2=l2)))  # Apply L1 and L2 regularization
#     else:
#         model.add(LSTM(128, input_shape=(max_sequence_length, embedding_dim), return_sequences=False))

#     if use_batch_norm:
#         model.add(BatchNormalization())  # Batch norm after LSTM
   
#     # Dropout after LSTM
#     if dropout_rate > 0.0:
#         model.add(Dropout(dropout_rate))
        
#     if use_regularization:
#         model.add(Dense(64, kernel_regularizer=regularizers.l1_l2(l1=l1, l2=l2)))  # Apply Elastic Net regularization
#     else:
#         model.add(Dense(64))  # Fully connected layer

#     if use_batch_norm:
#         model.add(BatchNormalization())  # Batch norm after LSTM

#     # Dropout after first Dense layer
#     if dropout_rate > 0.0:
#         model.add(Dropout(dropout_rate))
        
#     if use_regularization:
#         model.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=l1, l2=l2)))  # Apply Elastic Net regularization
#     else:
#         model.add(Dense(64, activation='relu'))  # ReLU function
    
#     # Dropout after second Dense layer
#     if dropout_rate > 0.0:
#         model.add(Dropout(dropout_rate))
        
#     num_classes = len(np.unique(y_train))
#     model.add(Dense(num_classes, activation='softmax'))

#     model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
#     return model


def create_model(dropout_rate=0.0, use_batch_norm=False, use_regularization=False, l1=0.01, l2=0.01):
    """Build and compile the Embedding -> LSTM -> Dense text classifier.

    Args:
        dropout_rate: Dropout rate applied after the LSTM and after each hidden
            Dense layer; no Dropout layers are added when it is not > 0.
        use_batch_norm: When True, add BatchNormalization after the LSTM and
            after the first Dense layer.
        use_regularization: When True, put an L1+L2 kernel penalty on both
            hidden Dense layers.
        l1: L1 penalty strength (only used when ``use_regularization`` is True).
        l2: L2 penalty strength (only used when ``use_regularization`` is True).

    Returns:
        A Keras ``Model`` with inputs named ``input_ids`` and ``attention_mask``
        and a softmax output, compiled with Adam and sparse categorical
        cross-entropy, tracking accuracy.

    Reads the module-level ``max_sequence_length``, ``embedding_dim``,
    ``vocab_size`` and ``y_train``.
    """
    input_ids = Input(shape=(max_sequence_length,), name='input_ids')
    # Declared so callers can keep feeding {'input_ids': ..., 'attention_mask': ...};
    # it is not connected to any layer below.
    # NOTE(review): padded positions therefore reach the LSTM unmasked - consider
    # masking them if the tokenizer pads the sequences.
    attention_mask = Input(shape=(max_sequence_length,), name='attention_mask')

    def dense_regularizer():
        # One regularizer object per layer; None leaves the layer unregularized.
        if use_regularization:
            return regularizers.l1_l2(l1=l1, l2=l2)
        return None

    # `input_length` is deprecated in recent Keras releases and is not needed:
    # the sequence length is already fixed by the shape of `input_ids`.
    x = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_ids)

    # Only the final LSTM state is passed on to the dense head.
    x = LSTM(128, return_sequences=False, name='lstm_layer')(x)

    if use_batch_norm:
        x = BatchNormalization()(x)  # Batch norm after LSTM

    if dropout_rate > 0.0:
        x = Dropout(dropout_rate)(x)

    # First hidden layer (no activation, as in the original architecture)
    x = Dense(64, kernel_regularizer=dense_regularizer())(x)

    if use_batch_norm:
        x = BatchNormalization()(x)  # Batch norm after Dense layer

    if dropout_rate > 0.0:
        x = Dropout(dropout_rate)(x)

    # Second hidden layer; it now honours `use_regularization` as well, matching
    # the reference (non-augmented) version of this model kept in the comments.
    x = Dense(64, activation='relu', kernel_regularizer=dense_regularizer())(x)

    if dropout_rate > 0.0:
        x = Dropout(dropout_rate)(x)

    # One output unit per distinct label in the global training labels
    num_classes = len(np.unique(y_train))
    output = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=[input_ids, attention_mask], outputs=output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# **Augment Data**

In [None]:
import nest_asyncio
# Patch asyncio so that event loops can be nested (workaround needed when
# running coroutines from inside the notebook's own event loop)
nest_asyncio.apply()

# Function to get synonyms for a given word
def get_synonyms(word):
    """Look up synonyms of ``word`` through the Datamuse ``rel_syn`` query.

    Args:
        word: The word to look up. It is sent as a query parameter so that
            characters such as ``&``, ``#`` or spaces are URL-encoded properly.

    Returns:
        A list with the ``'word'`` field of every returned entry. An empty list
        is returned when the request fails, times out, answers with an HTTP
        error status, or the body is not valid JSON, so that one bad lookup
        does not abort a long augmentation run.
    """
    try:
        response = requests.get(
            'https://api.datamuse.com/words',
            params={'rel_syn': word},
            timeout=10,  # never hang forever on a stalled connection
        )
        response.raise_for_status()
        entries = response.json()
    except (requests.RequestException, ValueError):
        # Network/HTTP problem or undecodable body: treat as "no synonyms found"
        return []
    return [item['word'] for item in entries]

# Create the module-level googletrans client; translate_sentence() calls
# translator.translate() on this shared instance
translator = Translator()

async def translate_sentence(sentence, srcl, destl):
    """Translate ``sentence`` from ``srcl`` to ``destl`` without blocking the event loop.

    The ``translator.translate`` call is blocking, so it is run in the loop's
    default executor; this lets ``asyncio.gather`` overlap many translations
    instead of running them one after another.

    Args:
        sentence: Text to translate.
        srcl: Source language code passed as ``src``.
        destl: Destination language code passed as ``dest``.

    Returns:
        The translated text, or ``sentence`` unchanged when the translation
        raises or yields an empty result (best-effort fallback).
    """
    loop = asyncio.get_running_loop()
    try:
        # NOTE(review): the shared `translator` is now used from worker threads;
        # confirm the client tolerates concurrent calls and request bursts.
        result = await loop.run_in_executor(
            None,
            lambda: translator.translate(sentence, src=srcl, dest=destl),
        )
        # Check if result is valid and contains text
        if result and result.text:
            return result.text
        return sentence  # Fallback to the original sentence
    except Exception:
        return sentence  # Fallback in case of an error
        
async def back_translate_async(sentences, language):
    """Back-translate ``sentences``: English -> ``language`` -> English.

    Args:
        sentences: Iterable of English texts.
        language: Code of the intermediate language.

    Returns:
        A list with one back-translated text per input, in input order.
        Prints the number of sentences before starting.
    """
    print(len(sentences))

    # Forward pass: English -> intermediate language
    forward = await asyncio.gather(
        *(translate_sentence(sentence, 'en', language) for sentence in sentences)
    )

    # Return pass: intermediate language -> English
    back_translations = await asyncio.gather(
        *(translate_sentence(text, language, 'en') for text in forward)
    )

    return back_translations
    
def random_word_insertion_deletion(sentence):
    """Randomly delete one word from ``sentence`` or insert a synonym into it.

    The sentence is split on whitespace. With one random draw above 0.5 and
    more than one word available, a randomly chosen word is removed. Otherwise
    a random word is picked, its synonyms are fetched with ``get_synonyms``
    and, if there are any, a random synonym is inserted at a random position.

    Returns:
        The words re-joined with single spaces, or ``sentence`` itself when it
        contains no words.
    """
    tokens = sentence.split()
    if not tokens:
        # Nothing to delete or to look up synonyms for
        return sentence

    wants_deletion = random.random() > 0.5
    if wants_deletion and len(tokens) > 1:
        # Deletion: drop the word at a random index
        del tokens[random.randint(0, len(tokens) - 1)]
        return ' '.join(tokens)

    # Insertion: add a synonym of a randomly chosen word, if one exists
    source_word = random.choice(tokens)
    candidates = get_synonyms(source_word)
    if candidates:
        inserted = random.choice(candidates)
        position = random.randint(0, len(tokens))
        tokens.insert(position, inserted)
    return ' '.join(tokens)
    

# Asynchronous function to augment text data with back translation and insertion/deletion
async def augment_text_data(x_train, y_train, language='en',
                            back_translate_ratio=0.5, insertion_deletion_ratio=0.5):
    """Create one augmented counterpart for every training sample.

    A random subset of ``int(len(x_train) * back_translate_ratio)`` samples is
    back-translated through ``language``. Each remaining sample is passed to
    ``random_word_insertion_deletion`` with probability
    ``insertion_deletion_ratio`` and copied unchanged otherwise.

    Args:
        x_train: Indexable sequence of texts.
        y_train: Indexable sequence of labels aligned with ``x_train``.
        language: Intermediate language code for back translation.
        back_translate_ratio: Fraction of samples to back-translate; back
            translation is skipped entirely when it is not > 0.
        insertion_deletion_ratio: Per-sample probability of applying random
            word insertion/deletion to a sample that was not back-translated.

    Returns:
        ``(augmented_x_train, augmented_y_train)``: two lists holding the
        back-translated samples first (when back translation ran), followed by
        the remaining samples in index order, with their matching labels.
    """
    augmented_x_train = []
    augmented_y_train = []

    sample_count = len(x_train)

    # Sample indices for back translation
    num_back_translate = int(sample_count * back_translate_ratio)
    back_translate_indices = random.sample(range(sample_count), num_back_translate)
    back_translate_sentences = [x_train[i] for i in back_translate_indices]

    # 1. Perform back translation asynchronously
    if back_translate_ratio > 0:
        print('Starting async back translation...')
        back_translated_sentences = await back_translate_async(back_translate_sentences, language)
        print('Back translation completed.')

        # Add back translated sentences to augmented data with corresponding labels
        augmented_x_train.extend(back_translated_sentences)
        augmented_y_train.extend([y_train[i] for i in back_translate_indices])

    # 2. Perform random word insertion/deletion for remaining sentences.
    # A set makes the membership test O(1) instead of scanning a list per index.
    already_selected = set(back_translate_indices)
    remaining_indices = [i for i in range(sample_count) if i not in already_selected]

    print('Starting random word insertion deletion...')
    # The random draw is evaluated first, so the helper is only called for
    # samples that were actually selected for modification.
    modified_sentences = [
        random_word_insertion_deletion(x_train[i]) if random.random() < insertion_deletion_ratio else x_train[i]
        for i in remaining_indices
    ]
    print('Random word insertion deletion completed.')

    # Add modified sentences to the augmented data with original labels
    augmented_x_train.extend(modified_sentences)
    augmented_y_train.extend([y_train[i] for i in remaining_indices])

    return augmented_x_train, augmented_y_train


# Run the async function within the event loop# Run the augment_data function asynchronously within the event loop
augmented_x_train, augmented_y_train = await augment_text_data(x_train, y_train, 'fr')

x_train_augmented = list(x_train) + augmented_x_train
y_train_augmented = list(y_train) + augmented_y_train


# Final check for equality
print(f"Final X Training samples: {len(x_train_augmented)}")
print(f"Final Y Training samples: {len(y_train_augmented)}")

print(f'Augmented X Training samples: {len(x_train_augmented)}')
print(f'Augmented Y Training samples: {len(y_train_augmented)}')

# Step 3: Tokenize the text data using BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenizing combined training data
tokens = tokenizer(x_train_augmented, padding=True, truncation=True, return_tensors='pt', max_length=128)
# Tokenizing validation and test data
tokens_val = tokenizer(x_val, padding=True, truncation=True, return_tensors='pt', max_length=128)
tokens_test = tokenizer(x_test, padding=True, truncation=True, return_tensors='pt', max_length=128)

# Extracting input IDs and attention masks
input_ids = tokens['input_ids'].numpy()
attention_mask = tokens['attention_mask'].numpy()

input_ids_val = tokens_val['input_ids'].numpy()
attention_mask_val = tokens_val['attention_mask'].numpy()


# **Train models**

In [None]:

def train_and_evaluate(dropout_rate, y_train_augmented, use_batch_norm=False, use_regularization=False, l1=0.01, l2=0.01):
    """Build a model with ``create_model`` and fit it on the tokenized data.

    Training and validation features are read from the module-level
    ``input_ids`` / ``attention_mask`` and ``input_ids_val`` /
    ``attention_mask_val`` arrays; validation labels come from ``y_val``.

    Args:
        dropout_rate, use_batch_norm, use_regularization, l1, l2: Forwarded to
            ``create_model``.
        y_train_augmented: Training labels; converted to a NumPy array.

    Returns:
        ``(model, history)``: the fitted model and the object returned by
        ``model.fit``.
    """
    model = create_model(dropout_rate, use_batch_norm, use_regularization, l1, l2)

    # Labels as a NumPy array, whatever sequence type was passed in
    y_train_augmented = np.array(y_train_augmented)

    # Shape report to spot mismatches between features and labels before fitting
    print(f'y_train_augmented shape: {y_train_augmented.shape}')
    print(f'Input IDs shape: {input_ids.shape}')
    print(f'Attention Mask shape: {attention_mask.shape}')
    print(f'Validation Input IDs shape: {input_ids_val.shape}')
    print(f'Validation Attention Mask shape: {attention_mask_val.shape}')
    print(f'Validation labels shape: {y_val.shape}')

    # Feature dicts keyed by the names of the model's Input layers
    train_features = {'input_ids': input_ids, 'attention_mask': attention_mask}
    val_features = {'input_ids': input_ids_val, 'attention_mask': attention_mask_val}

    history = model.fit(
        x=train_features,
        y=y_train_augmented,
        validation_data=(val_features, y_val),
        batch_size=32,
        epochs=150,
        callbacks=[early_stopping],
        verbose=1,
    )
    return model, history
    


# **Explore parameters**

In [None]:
# # no batch norm
# model0, history0 = train_and_evaluate(0, False, False, False)
# #with batch norm
# model1, history1 = train_and_evaluate(0, True, False, False)

# #with regularization - elastic net
# l1_values = [0, 1e-4, 1e-3, 1e-2]  # List of L1 values to test
# l2_values = [0, 1e-4, 1e-3, 1e-2]  # List of L2 values to test
    
# results = []  # Store results for each combination
    
# for l1 in l1_values:
#     for l2 in l2_values:
#         print(f"Training with L1: {l1}, L2: {l2}")
#         model, history = train_and_evaluate(
#                 0.0,b
#                 use_batch_norm=False,
#                 use_regularization=True,  # Enable regularization
#                 use_augmentation=False,
#                 l1=l1,
#                 l2=l2
#             )
            
#         # Evaluate the model on validation set
#         test_loss, test_accuracy = model.evaluate(x_test, y_test, verbose=0)
#         results.append((l1, l2, test_loss, test_accuracy, history))
#         print(f"L1: {l1}, L2: {l2}, Testing Loss: {test_loss}, Testing Accuracy: {test_accuracy}")

#with dropout rate
# results = []  # Store results for each combination
# dropout_rates = [0.0, 0.1, 0.2, 0.3, 0.5]
# for dropout_rate in dropout_rates:
#     print(f"Training with Dropout Rate: {dropout_rate}")
#     model, history = train_and_evaluate(
#         dropout_rate=dropout_rate,
#         use_batch_norm=True,
#         use_regularization=False,  # Enable regularization if desired
#         use_augmentation=False,
#         l1=0.0,   # You can fix L1 and L2 values here, e.g., to the best found previously
#         l2=0.0 # Adjust as necessary
#     )
    
#     test_loss, test_accuracy = model.evaluate(x_test, y_test, verbose=0)
#     results.append((dropout_rate, test_loss, test_accuracy, history))
#     print(f"Dropout Rate: {dropout_rate}, Testing Loss: {test_loss}, Testing Accuracy: {test_accuracy}")

# # Convert results to a structured format for easy plotting
# dropout_rates = [result[0] for result in results]
# test_losses = [result[1] for result in results]
# test_accuracies = [result[2] for result in results]

 
# Single training run: dropout 0.3 with batch normalization and no kernel
# regularization (l1/l2 are passed as 0 and regularization is switched off)
model, history = train_and_evaluate(
    0.3,
    y_train_augmented,
    use_batch_norm=True,
    use_regularization=False,
    l1=0,
    l2=0,
)
    

In [None]:
import matplotlib.pyplot as plt

# # # Plotting the results
# plt.figure(figsize=(18, 12))

# # Plotting Training Loss
# plt.subplot(2, 2, 1)
# for result in results:
#     _, _, _, _, history = result
#     plt.plot(history.history['loss'], label=f'Train Loss (L1: {result[0]}, L2: {result[1]})')
        
# plt.title('Training Loss for Different L1 and L2 Values')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()
# plt.grid(True)

#  # Plotting Validation Loss
# plt.subplot(2, 2, 2)
# for result in results:
#     _, _, _, _, history = result
#     plt.plot(history.history['val_loss'], label=f'Val Loss (L1: {result[0]}, L2: {result[1]})')
        
# plt.title('Validation Loss for Different L1 and L2 Values')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()
# plt.grid(True)

# # Plotting Training Accuracy
# plt.subplot(2, 2, 3)
# for result in results:
#     _, _, _, _, history = result
#     plt.plot(history.history['accuracy'], label=f'Train Accuracy (L1: {result[0]}, L2: {result[1]})')
        
# plt.title('Training Accuracy for Different L1 and L2 Values')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend()
# plt.grid(True)

# # Plotting Validation Accuracy
# plt.subplot(2, 2, 4)
# for result in results:
#     _, _, _, _, history = result
#     plt.plot(history.history['val_accuracy'], label=f'Val Accuracy (L1: {result[0]}, L2: {result[1]})')
        
# plt.title('Validation Accuracy for Different L1 and L2 Values')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend()
# plt.grid(True)

# # Overall layout adjustments
# plt.tight_layout()
# plt.show()
# Prepare to plot the metrics
# plt.figure(figsize=(18, 12))

# # Plotting training loss
# plt.subplot(2, 2, 1)
# for result in results:
#     history = result[3]
#     plt.plot(history.history['loss'], label=f'Dropout {result[0]}')
# plt.title('Training Loss')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()
# plt.grid()

# # Plotting training accuracy
# plt.subplot(2, 2, 2)
# for result in results:
#     history = result[3]
#     plt.plot(history.history['accuracy'], label=f'Dropout {result[0]}')
# plt.title('Training Accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend()
# plt.grid()

# # Plotting validation loss
# plt.subplot(2, 2, 3)
# for result in results:
#     history = result[3]
#     plt.plot(history.history['val_loss'], label=f'Dropout {result[0]}')
# plt.title('Validation Loss')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()
# plt.grid()

# # Plotting validation accuracy
# plt.subplot(2, 2, 4)
# for result in results:
#     history = result[3]
#     plt.plot(history.history['val_accuracy'], label=f'Dropout {result[0]}')
# plt.title('Validation Accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend()
# plt.grid()

# # Show the plots
# plt.tight_layout()
# plt.show()

# Plot the training curves of the last run: accuracy on the left, loss on the right
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(14, 5))
curves = history.history

# Accuracy panel
accuracy_ax.plot(curves['accuracy'], label='Training Accuracy')
accuracy_ax.plot(curves['val_accuracy'], label='Validation Accuracy')
accuracy_ax.set_title('Model Accuracy')
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.legend(loc='best')

# Loss panel
loss_ax.plot(curves['loss'], label='Training Loss')
loss_ax.plot(curves['val_loss'], label='Validation Loss')
loss_ax.set_title('Model Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend(loc='best')

fig.tight_layout()
plt.show()

# **Evaluate the model**

In [None]:
# # Evaluate the model
# test_loss0, test_accuracy0 = model0.evaluate(x_test, y_test)
# test_loss1, test_accuracy1 = model1.evaluate(x_test, y_test)
# # test_loss2, test_accuracy2 = model2.evaluate(x_test, y_test)
# # test_loss3, test_accuracy3 = model3.evaluate(x_test, y_test)
# # test_loss4, test_accuracy4 = model4.evaluate(x_test, y_test)

# print(f"Test Loss without Batch Normalization: {test_loss0}")
# print(f"Test Accuracy without Batch Normalization: {test_accuracy0}")
# print('')
# print(f"Test Loss with Batch Normalization: {test_loss1}")
# print(f"Test Accuracy with Batch Normalization: {test_accuracy1}")

# # print(f"Test Loss for dropout rate: 0.2: {test_loss2}")
# # print(f"Test Accuracy for dropout rate: 0.2: {test_accuracy2}")

# # print(f"Test Loss for dropout rate: 0.3: {test_loss3}")
# # print(f"Test Accuracy for dropout rate: 0.3: {test_accuracy3}")


# # print(f"Test Loss for dropout rate: 0.4: {test_loss4}")
# # print(f"Test Accuracy for dropout rate: 0.4: {test_accuracy4}")

# #


# Convert the tokenized test split to NumPy arrays
input_ids_test, attention_mask_test = (
    tokens_test[field].numpy() for field in ('input_ids', 'attention_mask')
)

# Feature dict keyed by the names of the model's Input layers
test_features = {'input_ids': input_ids_test, 'attention_mask': attention_mask_test}

# evaluate() returns the loss followed by the compiled metrics (accuracy)
test_results = model.evaluate(x=test_features, y=y_test, batch_size=32, verbose=1)

print(f"Test Loss: {test_results[0]}")
print(f"Test Accuracy: {test_results[1]}")

In [None]:
# 