In [2]:
import pandas as pd
import random
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK data packages this script relies on (nltk.download
# reports "already up-to-date" when a package is present locally)
for _nltk_resource in ('wordnet', 'punkt'):
    nltk.download(_nltk_resource)

# Read the source dataset into a DataFrame
file_path = 'plant_disease_descriptions.csv'
df = pd.read_csv(file_path)

# Define a function for synonym replacement
def synonym_replacement(text):
    """Return *text* with every WordNet-known token swapped for a synonym.

    The text is split with ``word_tokenize``. Each token that has at least
    one WordNet synset is replaced by the first lemma name of a randomly
    chosen synset; tokens without synsets (punctuation, unknown words) are
    kept unchanged. The resulting tokens are joined with single spaces.

    Args:
        text: The sentence or description to augment.

    Returns:
        The augmented text as a single space-separated string.
    """
    new_words = []
    for word in word_tokenize(text):
        synsets = wordnet.synsets(word)
        if not synsets:
            new_words.append(word)
            continue
        lemma_name = random.choice(synsets).lemmas()[0].name()
        # WordNet joins multi-word lemmas with underscores (e.g. "ice_cream");
        # turn them back into spaces so the output stays natural text.
        new_words.append(lemma_name.replace('_', ' '))
    return ' '.join(new_words)

# Define a function for random insertion
def random_insertion(text):
    """Return *text* with one synonym of a random token inserted.

    The text is split with ``word_tokenize``. One token that has WordNet
    synsets is picked at random, a synonym of it is drawn from the lemma
    names of its synsets, and that synonym is inserted at a random position
    (any slot from the front of the sentence to the very end). If the picked
    token has no lemma other than itself, the token itself is inserted. When
    no token has a synset, *text* is returned unchanged.

    Args:
        text: The sentence or description to augment.

    Returns:
        The augmented text as a single space-separated string, or the
        original *text* if no token is known to WordNet.
    """
    words = word_tokenize(text)
    candidates = [word for word in words if wordnet.synsets(word)]
    if not candidates:
        return text

    source_word = random.choice(candidates)
    # Collect distinct synonyms of the chosen word. Sorting makes the draw
    # reproducible under random.seed(), since set order is not stable.
    synonym_pool = sorted({
        lemma.name().replace('_', ' ')
        for synset in wordnet.synsets(source_word)
        for lemma in synset.lemmas()
        if lemma.name().lower() != source_word.lower()
    })
    # Fall back to repeating the word when WordNet offers no other lemma.
    word_to_insert = random.choice(synonym_pool) if synonym_pool else source_word

    new_words = list(words)
    # randint is inclusive on both ends; len(new_words) is the "append" slot.
    insert_index = random.randint(0, len(new_words))
    new_words.insert(insert_index, word_to_insert)
    return ' '.join(new_words)

# Define a function for random deletion
def random_deletion(text, p=0.2):
    """Return *text* with each token independently dropped with probability *p*.

    The text is split with ``word_tokenize``. Texts with zero or one token
    are returned unchanged. If every token happens to be dropped, one
    randomly chosen original token is kept so the result is never empty.

    Args:
        text: The sentence or description to augment.
        p: Probability of deleting each individual token.

    Returns:
        The augmented text as a single space-separated string.
    """
    words = word_tokenize(text)
    # Nothing to delete from an empty or single-token text. The empty case
    # must be caught here, otherwise random.choice below raises IndexError.
    if len(words) <= 1:
        return text
    new_words = [word for word in words if random.random() > p]
    if not new_words:  # ensure at least one word remains
        new_words = [random.choice(words)]
    return ' '.join(new_words)

# Define a function for random swap
def random_swap(text, n=5):
    """Return *text* after exchanging two randomly chosen tokens *n* times.

    The text is split with ``word_tokenize``; texts with fewer than two
    tokens are returned unchanged. Each round picks two distinct positions
    and swaps the tokens found there. Tokens are joined with single spaces.

    Args:
        text: The sentence or description to augment.
        n: Number of swap rounds to perform.

    Returns:
        The shuffled text as a single space-separated string.
    """
    tokens = word_tokenize(text)
    token_count = len(tokens)
    if token_count >= 2:
        for _ in range(n):
            first, second = random.sample(range(token_count), 2)
            held = tokens[first]
            tokens[first] = tokens[second]
            tokens[second] = held
        return ' '.join(tokens)
    return text

# Build the augmented dataset: every source row contributes its original
# description followed by 10 variants from each augmentation technique
augmented_data = []
augmenters = (synonym_replacement, random_insertion, random_deletion, random_swap)

for _, record in df.iterrows():
    description = record['description']
    class_name = record['class']

    # The untouched description always comes first
    augmented_data.append({'description': description, 'class': class_name})

    # Techniques are applied in the order listed in `augmenters`
    for augment in augmenters:
        augmented_data.extend(
            {'description': augment(description), 'class': class_name}
            for _ in range(10)
        )

# Turn the list of row dicts into a DataFrame
augmented_df = pd.DataFrame(augmented_data)

# Top every class up to a minimum of 50 descriptions
min_descriptions_per_class = 50
final_data = []

for class_name in df['class'].unique():
    class_data = augmented_df[augmented_df['class'] == class_name]
    final_data.append(class_data)

    shortfall = min_descriptions_per_class - len(class_data)
    if shortfall > 0:
        # Too few rows for this class: pad by resampling its own rows
        # with replacement until the minimum is reached
        final_data.append(class_data.sample(shortfall, replace=True))

# Stitch the per-class frames together and write the result to disk
final_df = pd.concat(final_data)
final_df.to_csv('final_plant_description.csv', index=False)

print("Dataset augmentation complete and saved to 'final_plant_description.csv'")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Dataset augmentation complete and saved to 'final_plant_description.csv'
