<a href="https://colab.research.google.com/github/roberthouston14/GNN-Class/blob/main/Data_Spinner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This code first loads the BERT model and tokenizer using the Hugging Face Transformers library.
# It then loads a previously created combined dataset. It defines a function get_synonym that uses
# the WordNet corpus from NLTK to generate synonyms for a given word. It also defines a function spin_text
# that takes a piece of text and replaces a specified number of words with synonyms using the get_synonym function.

# It then loops through each spam sample in the dataset and uses the spin_text function
# to create a new, spun version of the text. It then uses the BERT model to generate a
# new spam sample based on the spun text using the nlp pipeline from the Transformers
# library. Finally, it concatenates the original dataset with the new samples and
# saves the result to a new CSV file.

from transformers import pipeline, AutoTokenizer
import pandas as pd
import random
import nltk
from nltk.corpus import wordnet

# Load the tokenizer for the checkpoint named below and build a Hugging Face
# 'text-generation' pipeline around that same checkpoint.
# NOTE(review): bert-base-uncased is presumably a masked-LM (encoder-style)
# checkpoint rather than a causal language model -- confirm that it produces
# usable output for the 'text-generation' task, or swap in a causal LM.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
nlp = pipeline('text-generation', model=model_name, tokenizer=tokenizer)

# Load the combined dataset from the working directory. The generation loop
# further down reads its 'Text' and 'target' columns.
df = pd.read_csv('combined_dataset.csv')

# function to generate synonym using WordNet
def get_synonym(word):
    """Return a randomly chosen WordNet synonym for ``word``.

    Every lemma of every synset that WordNet lists for ``word`` is a
    candidate. Candidates that are just ``word`` itself (compared
    case-insensitively) are discarded so that a replacement actually changes
    the text, and the underscores WordNet uses to join multi-word lemmas
    (e.g. ``hot_dog``) are turned back into spaces.

    Args:
        word: The token to look up.

    Returns:
        A synonym string, or ``word`` unchanged when WordNet offers no
        alternative.
    """
    target = word.lower()
    candidates = [
        lemma.name().replace('_', ' ')
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
        # Skip lemmas that are the input word itself; picking one of those
        # would make the "replacement" a no-op.
        if lemma.name().lower() != target
    ]
    return random.choice(candidates) if candidates else word

# function to spin text by replacing words with synonyms
def spin_text(text, num_replacements=3):
    """Return ``text`` lower-cased with some tokens swapped for synonyms.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``; a
    random selection of token positions is then replaced via ``get_synonym``
    and the tokens are joined back together with single spaces.

    Args:
        text: The string to spin.
        num_replacements: How many token positions to replace. When the text
            has fewer tokens than this, every token is a replacement
            candidate instead of raising.

    Returns:
        The spun text as a single space-joined string (empty when ``text``
        yields no tokens).
    """
    words = nltk.word_tokenize(text.lower())
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request at the number of tokens available.
    sample_size = min(num_replacements, len(words))
    for idx in random.sample(range(len(words)), sample_size):
        words[idx] = get_synonym(words[idx])
    return ' '.join(words)

# generate new spam samples using BERT and spinner
new_samples = []
# only spin text for spam samples (target == 1)
spam_texts = df.loc[df['target'] == 1, 'Text']
for text in spam_texts:
    # Empty CSV cells are read back as NaN (a float) and cannot be tokenized;
    # blank strings give the generator nothing to work from. Skip both.
    if not isinstance(text, str) or not text.strip():
        continue
    # Store the result under its own name: assigning it to `spin_text` would
    # rebind the function and make every later iteration fail with
    # "TypeError: 'str' object is not callable".
    spun = spin_text(text)
    # NOTE(review): max_length is measured in tokens, while len(text) counts
    # characters -- confirm this bound is what is intended.
    generated = nlp(spun, max_length=len(text), do_sample=True, top_p=0.8, temperature=0.7)[0]['generated_text']
    new_samples.append({'Text': generated, 'target': 1})

# concatenate the original and new datasets
new_df = pd.concat([df, pd.DataFrame(new_samples)], ignore_index=True)

# save the combined dataset with new samples to a CSV file
new_df.to_csv('combined_dataset_with_spun_samples.csv', index=False)
