In [1]:
!pip install datasets
!pip install transformers

# Required Libraries
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from keras.models import Sequential, Model
from keras.layers import Dense, LeakyReLU, BatchNormalization, Input
from keras.optimizers import Adam
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import BertTokenizer, BertModel, GPT2LMHeadModel, GPT2Tokenizer
import torch


Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15


In [2]:
# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Initialize GPT-2 model and tokenizer
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [3]:
# Load the Financial Phrasebank dataset
dataset = load_dataset('financial_phrasebank', 'sentences_50agree')

# Extract original sentences and labels
dataset_sentences = [item['sentence'] for item in dataset['train']]
original_labels = [item['label'] for item in dataset['train']]

Downloading data:   0%|          | 0.00/392k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4846 [00:00<?, ? examples/s]

In [4]:
# Convert text to BERT embeddings function
def text_to_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs['last_hidden_state'].mean(dim=1).squeeze().numpy()


In [5]:
# Convert the dataset to embeddings
embeddings = np.array([text_to_embedding(sentence) for sentence in dataset_sentences])

# Perform clustering
num_clusters = 5
clustering = AgglomerativeClustering(n_clusters=num_clusters).fit(embeddings)


In [6]:
# Define the function to generate text from embeddings
def generate_text_from_embedding(embedding):
    similarities = cosine_similarity([embedding], embeddings)
    closest_index = np.argmax(similarities)
    closest_sentence = dataset_sentences[closest_index]
    generated_text = generate_text(closest_sentence)
    return generated_text

In [7]:
# Define the function to generate text using GPT-2
def generate_text(seed_text, max_length=50):
    inputs = gpt2_tokenizer.encode(seed_text, return_tensors="pt")
    with torch.no_grad():
        outputs = gpt2_model.generate(inputs, max_length=max_length, num_return_sequences=1)
    decoded_output = gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded_output


In [8]:
def embedding_to_text(embedding):
    # Find the closest sentence in the original dataset to the embedding
    similarities = cosine_similarity([embedding], embeddings)
    closest_index = np.argmax(similarities)
    closest_sentence = dataset['train'][closest_index]['sentence']
    return closest_sentence

In [9]:
# Function to find the closest original sentence to a given embedding
def find_closest_sentence(embedding, original_embeddings, dataset_sentences):
    similarities = cosine_similarity([embedding], original_embeddings)
    closest_index = np.argmax(similarities)
    return dataset_sentences[closest_index]

In [10]:

def build_generator(input_dim):
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim))
    model.add(LeakyReLU(0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(256))
    model.add(LeakyReLU(0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(512))
    model.add(LeakyReLU(0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(input_dim, activation='tanh'))
    noise = Input(shape=(input_dim,))
    embedding = model(noise)
    return Model(noise, embedding)

def build_discriminator(input_dim):
    model = Sequential()
    model.add(Dense(512, input_dim=input_dim))
    model.add(LeakyReLU(0.2))
    model.add(Dense(256))
    model.add(LeakyReLU(0.2))
    model.add(Dense(128))
    model.add(LeakyReLU(0.2))
    model.add(Dense(1, activation='sigmoid'))
    embedding = Input(shape=(input_dim,))
    validity = model(embedding)
    return Model(embedding, validity)

In [11]:
# Introduce mutation function with sentence visualization
def introduce_mutation(embedding, mutation_strength=0.01):
    # Introduces a random mutation in the embedding.
    original_sentence = find_closest_sentence(embedding, embeddings, dataset_sentences)
    print("Original sentence:", original_sentence)
    mutation = np.random.normal(0, mutation_strength, embedding.shape)
    mutated_embedding = embedding + mutation
    mutated_sentence = find_closest_sentence(mutated_embedding, embeddings, dataset_sentences)
    print("Mutated sentence:", mutated_sentence)
    return mutated_embedding



In [12]:
def select_top_embeddings(embeddings, reference_embeddings, top_percent=0.5):
    print("Sentences before selection:")
    for emb in embeddings:
        print(find_closest_sentence(emb, embeddings, dataset_sentences))

    similarity_scores = cosine_similarity(embeddings, reference_embeddings).mean(axis=1)
    sorted_indices = np.argsort(similarity_scores)[::-1]
    num_selected = int(top_percent * len(embeddings))
    selected_embeddings = embeddings[sorted_indices[:num_selected]]

    print("Sentences after selection:")
    for emb in selected_embeddings:
        print(find_closest_sentence(emb, embeddings, dataset_sentences))

    return selected_embeddings

In [13]:
def recombine_embeddings(embedding1, embedding2):
    parent1_sentence = find_closest_sentence(embedding1, embeddings, dataset_sentences)
    parent2_sentence = find_closest_sentence(embedding2, embeddings, dataset_sentences)
    print("Parent 1 sentence:", parent1_sentence)
    print("Parent 2 sentence:", parent2_sentence)

    crossover_point = np.random.randint(embedding1.shape[0])
    new_embedding = np.hstack([embedding1[:crossover_point], embedding2[crossover_point:]])

    child_sentence = find_closest_sentence(new_embedding, embeddings, dataset_sentences)
    print("Child sentence:", child_sentence)
    return new_embedding

In [14]:

def wright_fisher_sampling(embeddings, fitness_scores, population_size):
    # Sample a new generation of embeddings based on fitness scores using the Wright-Fisher model.
    # Ensure non-negative fitness scores by adding an offset
    offset_fitness_scores = fitness_scores - np.min(fitness_scores) + 1e-8
    normalized_fitness = offset_fitness_scores / np.sum(offset_fitness_scores)
    sampled_indices = np.random.choice(len(embeddings), size=population_size, p=normalized_fitness)
    return embeddings[sampled_indices]

In [15]:
# Modified portion of the train_gan_with_coalescent function to address the size mismatch

def train_gan_with_coalescent(cluster_embeddings, epochs=10000, batch_size=128, selection_pressure=0.5):
    discriminator = build_discriminator(cluster_embeddings.shape[1])
    discriminator.compile(loss='binary_crossentropy', optimizer=Adam(0.0002, 0.5), metrics=['accuracy'])

    generator = build_generator(cluster_embeddings.shape[1])

    z = Input(shape=(cluster_embeddings.shape[1],))
    embedding = generator(z)
    discriminator.trainable = False
    validity = discriminator(embedding)
    combined = Model(z, validity)
    combined.compile(loss='binary_crossentropy', optimizer=Adam(0.0002, 0.5))

    lineage = {i: [i] for i in range(cluster_embeddings.shape[0])}

    current_population = cluster_embeddings.copy()

    for epoch in range(epochs):
        noise = np.random.normal(0, 1, (batch_size, cluster_embeddings.shape[1]))
        synthetic_embeddings = generator.predict(noise)

        selected_embeddings = select_top_embeddings(synthetic_embeddings, current_population, top_percent=selection_pressure)

        offspring_embeddings = []
        for _ in range(batch_size // 2):
            indices = np.random.choice(len(selected_embeddings), 2, replace=False)
            parent1 = selected_embeddings[indices[0]]
            parent2 = selected_embeddings[indices[1]]
            child1 = recombine_embeddings(parent1, parent2)
            child2 = recombine_embeddings(parent2, parent1)
            offspring_embeddings.extend([child1, child2])

        offspring_embeddings = np.array([introduce_mutation(e) for e in offspring_embeddings])

        similarity_scores = cosine_similarity(offspring_embeddings, current_population).mean(axis=1)
        current_population = wright_fisher_sampling(offspring_embeddings, similarity_scores, cluster_embeddings.shape[0])

        # Adjusting the valid and fake labels' size to match the current_population size
        valid = np.ones((current_population.shape[0], 1))
        fake = np.zeros((offspring_embeddings.shape[0], 1))

        d_loss_real = discriminator.train_on_batch(current_population, valid)
        d_loss_fake = discriminator.train_on_batch(offspring_embeddings, fake)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        noise = np.random.normal(0, 1, (batch_size, cluster_embeddings.shape[1]))
        g_loss = combined.train_on_batch(noise, np.ones((batch_size, 1)))

        if epoch % 1000 == 0:
            print(f'{epoch}/{epochs} [D loss: {d_loss[0]} | D accuracy: {100 * d_loss[1]}] [G loss: {g_loss}]')

    return generator, lineage

# The modification ensures that the label sizes match the data sizes during training.




In [16]:

def trace_lineage(embedding_index, lineage):
    # Traces back the lineage of an embedding.
    return lineage[embedding_index]

synthetic_data = []


In [17]:
for cluster in range(num_clusters):
    cluster_indices = np.where(clustering.labels_ == cluster)[0]
    cluster_embeddings = embeddings[cluster_indices]
    generator, lineage = train_gan_with_coalescent(cluster_embeddings, epochs=20)  # Reduced for quick testing
    num_synthetic_points = len(cluster_indices)
    noise = np.random.normal(0, 1, (num_synthetic_points, cluster_embeddings.shape[1]))
    synthetic_embeddings = generator.predict(noise)
    synthetic_data.append(synthetic_embeddings)

synthetic_data = np.vstack(synthetic_data)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
TeliaSonera TLSN said the offer is in line with its strategy to increase its ownership in core business holdings and would strengthen Eesti Telekom 's offering to its customers .
STORA ENSO , NORSKE SKOG , M-REAL , UPM-KYMMENE Credit Suisse First Boston ( CFSB ) raised the fair value for shares in four of the largest Nordic forestry groups .
A purchase agreement for 7,200 tons of gasoline with delivery at the Hamina terminal , Finland , was signed with Neste Oil OYj at the average Platts index for this September plus eight US dollars per month .
Finnish Talentum reports its operating profit increased to EUR 20.5 mn in 2005 from EUR 9.3 mn in 2004 , and net sales totaled EUR 103.3 mn , up from EUR 96.4 mn .
Clothing retail chain Sepp+ñl+ñ 's sales increased by 8 % to EUR 155.2 mn , and operating profit rose to EUR 31.1 mn from EUR 17.1 mn in 2004 .
Consolidated net sales increased 16 % to reach EUR74 .8 m , while operating

In [18]:
new_sentences = []
new_labels = []
counter = 0
limit = 20
for synthetic_point in synthetic_data:
    # if counter >= limit:
    #     break
    similarities = cosine_similarity([synthetic_point], embeddings)
    closest_index = int(np.argmax(similarities))
    closest_sentence = dataset['train'][closest_index]['sentence']
    closest_label = original_labels[closest_index]
    new_sentence = generate_text(closest_sentence)
    new_sentences.append(new_sentence)
    new_labels.append(closest_label)
    counter += 1

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Set

In [19]:
synthetic_dataset = pd.DataFrame({
    'Synthetic_Sentences': new_sentences,
    'Labels': new_labels
})
synthetic_dataset

Unnamed: 0,Synthetic_Sentences,Labels
0,In addition to verification of an identity and...,1
1,"When the situation normalises, the company wil...",2
2,Commencing the construction works of Pearl Pla...,2
3,The company did not disclose the price of the ...,1
4,There are no substitutes to AC drives... but t...,1
...,...,...
4841,Jeder Beta-Tester erh+ñlt kostenlos sechs Mona...,1
4842,The maritime administration said the ships had...,0
4843,In 2007 Talentum will disclose three Interim R...,1
4844,"Arto Ryymin, born 1964, will replace Juhani Ka...",1


In [20]:
csv_file_path = '/content/generated_dataset.csv'

# Save the DataFrame to a CSV file
synthetic_dataset.to_csv(csv_file_path, index=False)

In [21]:
from google.colab import files

# Trigger the download of the CSV file
files.download(csv_file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [22]:
def calculate_similarity(original_sentences, synthetic_sentences):
    original_embeddings = np.array([text_to_embedding(sentence) for sentence in original_sentences])
    average_similarities = []

    for synthetic_sentence in synthetic_sentences:
        synthetic_embedding = text_to_embedding(synthetic_sentence)
        similarities = cosine_similarity([synthetic_embedding], original_embeddings)
        max_similarity = np.max(similarities)
        average_similarities.append(max_similarity)

    return np.mean(average_similarities)

# Random forest


In [44]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, f1_score
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# nltk downloads
nltk.download('punkt')

# Simplified Text Preprocessing
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    return ' '.join(tokens)

def preprocess_data(data):
    return data.apply(preprocess_text)

# Evaluate classifier function with extensive hyperparameter tuning
def evaluate_classifier(X_train, y_train, X_test, y_test):
    pipeline = ImbPipeline([
        ('vectorizer', TfidfVectorizer(ngram_range=(1, 2))),
        ('smote', SMOTE(random_state=42)),
        ('classifier', SVC())
    ])

    param_grid = {
        'classifier__C': [0.1, 1, 10],
        'classifier__gamma': ['scale', 'auto'],
        'classifier__kernel': ['rbf', 'linear', 'poly']
    }

    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    report = classification_report(y_test, y_pred)

    print("Best Model Parameters: ", grid_search.best_params_)
    print("F1 Score: {:.4f}".format(f1))
    print("\nClassification Report:")
    print(report)
    return f1

def compare_datasets(original_data_dict, synthetic_data_dict):
    original_data = pd.DataFrame(original_data_dict)
    synthetic_data = pd.DataFrame(synthetic_data_dict)

    X_original, y_original = original_data['sentences'], original_data['labels']
    X_synthetic, y_synthetic = synthetic_data['sentences'], synthetic_data['labels']

    X_original = preprocess_data(X_original)
    X_synthetic = preprocess_data(X_synthetic)

    label_encoder = LabelEncoder()
    y_original_encoded = label_encoder.fit_transform(y_original)
    y_synthetic_encoded = label_encoder.transform(y_synthetic)

    X_train_original, X_test_original, y_train_original, y_test_original = train_test_split(
        X_original, y_original_encoded, test_size=0.2, random_state=42
    )
    X_train_synthetic, X_test_synthetic, y_train_synthetic, y_test_synthetic = train_test_split(
        X_synthetic, y_synthetic_encoded, test_size=0.2, random_state=42
    )

    print("Original Dataset Metrics:")
    f1_original = evaluate_classifier(
        X_train_original, y_train_original, X_test_original, y_test_original
    )

    print("\nSynthetic Dataset Metrics:")
    f1_synthetic = evaluate_classifier(
        X_train_synthetic, y_train_synthetic, X_test_synthetic, y_test_synthetic
    )

    return {
        "f1_original": f1_original,
        "f1_synthetic": f1_synthetic,
    }

# Example usage
original_dataset = {'sentences': dataset_sentences, 'labels': original_labels}
synthetic_dataset = {'sentences': new_sentences, 'labels': new_labels}

metrics = compare_datasets(original_dataset, synthetic_dataset)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Original Dataset Metrics:
Best Model Parameters:  {'classifier__C': 1, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear'}
F1 Score: 0.7962

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.65      0.70       110
           1       0.82      0.89      0.85       571
           2       0.77      0.67      0.72       289

    accuracy                           0.80       970
   macro avg       0.78      0.74      0.76       970
weighted avg       0.80      0.80      0.80       970


Synthetic Dataset Metrics:
Best Model Parameters:  {'classifier__C': 10, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear'}
F1 Score: 0.9312

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.76      0.83        75
           1       0.94      0.99      0.96       742
           2       0.93      0.75      0.83       153

    accuracy                           0.93       

In [None]:
average_similarity = calculate_similarity(dataset_sentences, new_sentences)
print("Average Cosine Similarity:", average_similarity)