In [6]:
import pandas as pd
import numpy as np

def add_noise(data, noise_level=0.05, rng=None):
    """Return a copy of *data* with multiplicative Gaussian noise on its features.

    Every numeric column except the last one (which the callers in this file
    use as the class label) is replaced by ``x + eps * x`` where ``eps`` is
    drawn independently per cell from ``Normal(0, noise_level)``.

    Args:
        data: Input DataFrame. It is not modified.
        noise_level: Standard deviation of the relative noise; ``0`` leaves
            the values unchanged.
        rng: Optional random source exposing ``normal(loc, scale, size=...)``,
            e.g. ``np.random.default_rng(seed)``. Defaults to NumPy's global
            random state, which matches the previous behaviour.

    Returns:
        A new DataFrame with the same columns and index as *data*.
    """
    rng = np.random if rng is None else rng
    noisy_data = data.copy()
    for column in noisy_data.columns[:-1]:
        dtype = noisy_data[column].dtype
        # Only real numeric measurements are perturbed. A bare
        # ``dtype != 'object'`` check lets string/datetime/category extension
        # dtypes through (and then fails on the multiply) and would turn
        # boolean flags into meaningless floats.
        if not pd.api.types.is_numeric_dtype(dtype) or pd.api.types.is_bool_dtype(dtype):
            continue
        noise = rng.normal(0, noise_level, size=noisy_data[column].shape)
        # Explicit assignment instead of ``+=`` so integer columns are cleanly
        # upcast to float rather than relying on in-place dtype coercion.
        noisy_data[column] = noisy_data[column] + noise * noisy_data[column]
    return noisy_data

def scale_features(data, scale_range=(0.9, 1.1), rng=None):
    """Return a copy of *data* with each numeric feature cell randomly rescaled.

    Every numeric column except the last one (which the callers in this file
    use as the class label) is multiplied element-wise by factors drawn
    independently from ``Uniform(scale_range[0], scale_range[1])``.

    Args:
        data: Input DataFrame. It is not modified.
        scale_range: ``(low, high)`` bounds of the multiplicative factor.
        rng: Optional random source exposing ``uniform(low, high, size=...)``,
            e.g. ``np.random.default_rng(seed)``. Defaults to NumPy's global
            random state, which matches the previous behaviour.

    Returns:
        A new DataFrame with the same columns and index as *data*.
    """
    rng = np.random if rng is None else rng
    scaled_data = data.copy()
    for column in scaled_data.columns[:-1]:
        dtype = scaled_data[column].dtype
        # Only real numeric measurements are rescaled. A bare
        # ``dtype != 'object'`` check lets string/datetime/category extension
        # dtypes through (and then fails on the multiply) and would turn
        # boolean flags into meaningless floats.
        if not pd.api.types.is_numeric_dtype(dtype) or pd.api.types.is_bool_dtype(dtype):
            continue
        scale_factor = rng.uniform(*scale_range, size=scaled_data[column].shape)
        # Explicit assignment instead of ``*=`` so integer columns are cleanly
        # upcast to float rather than relying on in-place dtype coercion.
        scaled_data[column] = scaled_data[column] * scale_factor
    return scaled_data

def generate_synthetic_data(data, samples_per_class):
    """Create synthetic rows so each class can be topped up to a target size.

    For every distinct value in ``data['label']`` that has fewer than
    *samples_per_class* rows, the shortfall is filled by repeatedly drawing
    one existing row of that class at random and passing it through
    ``add_noise`` and then ``scale_features``.

    Args:
        data: DataFrame with a ``'label'`` column identifying the class.
        samples_per_class: Desired number of rows per class (original plus
            synthetic). Classes already at or above it get no synthetic rows.

    Returns:
        A DataFrame holding only the synthetic rows, with a fresh 0..n-1
        index. When no rows are needed, an empty frame with the same columns
        as *data* is returned.
    """
    generated = []
    for crop in data['label'].unique():
        crop_data = data[data['label'] == crop]
        # A missing (NaN) label never compares equal to itself, so its group
        # is empty and there is nothing to sample from.
        if crop_data.empty:
            continue
        for _ in range(samples_per_class - len(crop_data)):
            noisy_data = add_noise(crop_data.sample(1))
            generated.append(scale_features(noisy_data))
    if not generated:
        # Keep the input schema so callers can concatenate unconditionally.
        return data.iloc[:0].copy()
    # Concatenate once: growing a DataFrame inside the loop recopies every
    # previously generated row on each iteration (quadratic in row count).
    return pd.concat(generated, ignore_index=True)

# --- Driver: read 1.csv, balance classes with synthetic rows, write 2.csv ---
file_path = '1.csv'
data = pd.read_csv(file_path)

# Split a budget of 15000 rows evenly across the distinct labels (integer
# division, so the total may fall slightly short of 15000).
num_classes = len(data['label'].unique())
samples_per_class = 15000 // num_classes

# Only classes with fewer than samples_per_class rows receive synthetic rows.
synthetic_data = generate_synthetic_data(data, samples_per_class)

# Original rows first, synthetic rows appended after them.
augmented_data = pd.concat([data, synthetic_data], ignore_index=True)

# Shuffle the rows and renumber the index.
# NOTE(review): no random_state is passed here, so the row order differs on
# every run even though the later sample() calls are seeded with 42.
augmented_data = augmented_data.sample(frac=1).reset_index(drop=True)

# NOTE(review): this file is written again to the same path below, after the
# size adjustment, so this first write is always overwritten.
augmented_data.to_csv('2.csv', index=False)

print("Total samples in augmented dataset:", len(augmented_data))
print("\nClass distribution in augmented dataset:")
print(augmented_data['label'].value_counts())

# Keep the final size within [10000, 20000] rows: randomly down-sample when
# too large, or pad with randomly duplicated rows (sampling with replacement)
# when too small. Neither branch samples per class, so the per-class counts
# printed above may no longer be equal afterwards.
if len(augmented_data) > 20000:
    augmented_data = augmented_data.sample(n=20000, random_state=42)
elif len(augmented_data) < 10000:
    additional_samples_needed = 10000 - len(augmented_data)
    additional_samples = augmented_data.sample(n=additional_samples_needed, replace=True, random_state=42)
    augmented_data = pd.concat([augmented_data, additional_samples], ignore_index=True)

# Final write; replaces the file produced by the earlier to_csv call.
augmented_data.to_csv('2.csv', index=False)

print("\nFinal dataset size:", len(augmented_data))
print("\nFinal class distribution:")
print(augmented_data['label'].value_counts())

Total samples in augmented dataset: 14982

Class distribution in augmented dataset:
label
cotton         681
apple          681
mungbean       681
pomegranate    681
rice           681
mango          681
muskmelon      681
pigeonpeas     681
orange         681
grapes         681
watermelon     681
banana         681
maize          681
blackgram      681
coffee         681
chickpea       681
kidneybeans    681
papaya         681
mothbeans      681
coconut        681
jute           681
lentil         681
Name: count, dtype: int64

Final dataset size: 14982

Final class distribution:
label
cotton         681
apple          681
mungbean       681
pomegranate    681
rice           681
mango          681
muskmelon      681
pigeonpeas     681
orange         681
grapes         681
watermelon     681
banana         681
maize          681
blackgram      681
coffee         681
chickpea       681
kidneybeans    681
papaya         681
mothbeans      681
coconut        681
jute           681
lentil 