In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from collections import Counter

# Sample data (You would replace this with your actual dataset)
data = {
    'text': [
        'Sample text 1', 'Sample text 2', 'Sample text 3',
        'Sample text 4', 'Sample text 5', 'Sample text 6',
        'Sample text 7', 'Sample text 8', 'Sample text 9'
    ],
    'label': ['majority', 'majority', 'majority', 'majority', 'majority', 'minority', 'minority', 'majority', 'minority']
}

df = pd.DataFrame(data)
print("Original class distribution:", Counter(df['label']))


Original class distribution: Counter({'majority': 6, 'minority': 3})


In [2]:
# Split the dataset into training and testing sets
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

print("Training class distribution:", Counter(df_train['label']))


Training class distribution: Counter({'majority': 5, 'minority': 2})


# **Oversampling the monority class:**

In [3]:
# Separate majority and minority classes
df_majority = df_train[df_train.label == 'majority']
df_minority = df_train[df_train.label == 'minority']

# Double the minority class
df_minority_oversampled = resample(df_minority,
                                   replace=True,   # Sample with replacement
                                   n_samples=len(df_minority) * 2,  # Double the minority class
                                   random_state=42)  # Reproducibility

# Combine majority class with oversampled minority class
df_train_oversampled = pd.concat([df_majority, df_minority_oversampled])

# Shuffle the oversampled dataset
df_train_oversampled = df_train_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)

print("Oversampled Training class distribution:", Counter(df_train_oversampled['label']))


Oversampled Training class distribution: Counter({'majority': 5, 'minority': 4})


In [5]:
df_train_oversampled.head(9)

Unnamed: 0,text,label
0,Sample text 7,minority
1,Sample text 3,majority
2,Sample text 7,minority
3,Sample text 1,majority
4,Sample text 7,minority
5,Sample text 5,majority
6,Sample text 2,majority
7,Sample text 8,majority
8,Sample text 9,minority


In [9]:
# Example: Train a model (e.g., RandomForestClassifier)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Features and labels
X_train = df_train_oversampled['text']  # Replace with actual feature columns
y_train = df_train_oversampled['label'].map({'majority': 0, 'minority': 1})  # Map labels to 0 and 1

# Features and labels for testing
X_test = df_test['text']
y_test = df_test['label'].map({'majority': 0, 'minority': 1})

# Vectorize the text data
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
print(X_test,y_pred)

  (0, 1)	0.7071067811865475
  (0, 0)	0.7071067811865475
  (1, 1)	0.7071067811865475
  (1, 0)	0.7071067811865475 [0 0]


# **Data Augmentation using nlpaug library:**

In [14]:
!pip install nlpaug transformers



In [21]:
import nlpaug.augmenter.word as naw

# Initialize augmenters
contextual_aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action='substitute',
    aug_p=0.1# default
)

synonym_aug = naw.SynonymAug()#(aug_p=0.1)

# Sample text
text = 'The quick brown fox jumps over the lazy dog.'

# Apply augmentations
text = contextual_aug.augment(text)
print("After ContextualWordEmbsAug:", text)

text = synonym_aug.augment(text)
print("After SynonymAug:", text)


After ContextualWordEmbsAug: ['the cheshire brown fox jumps over the lazy dog.']
After SynonymAug: ['the cheshire brownish fox jumps over the faineant dog iron.']


# **Custom dataset balance using naw:**

In [27]:
import nlpaug.augmenter.word as naw
import pandas as pd

# Initialize augmenters
contextual_aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action='substitute',
    aug_p=0.1
)

synonym_aug = naw.SynonymAug(aug_p=0.1)

# Create example dataset with 10 positive and 2 negative reviews
data = {
    'text': [
        'This product is amazing, I love it!',
        'Absolutely fantastic! Highly recommended.',
        'Great quality and value for money.',
        'I am very satisfied with my purchase.',
        'This is the best product I have ever bought.',
        'Exceeded my expectations, will buy again.',
        'The performance of this product is superb.',
        'Perfect for my needs, highly satisfied.',
        'Exceptional product, worth every penny.',
        'Very happy with the quality and service.',
        'Not worth the money, very disappointing.',
        'Terrible quality, would not recommend.'
    ],
    'label': ['positive'] * 10 + ['negative'] * 2
}
df = pd.DataFrame(data)

In [28]:
df

Unnamed: 0,text,label
0,"This product is amazing, I love it!",positive
1,Absolutely fantastic! Highly recommended.,positive
2,Great quality and value for money.,positive
3,I am very satisfied with my purchase.,positive
4,This is the best product I have ever bought.,positive
5,"Exceeded my expectations, will buy again.",positive
6,The performance of this product is superb.,positive
7,"Perfect for my needs, highly satisfied.",positive
8,"Exceptional product, worth every penny.",positive
9,Very happy with the quality and service.,positive


In [29]:
# Separate positive and negative reviews
df_positive = df[df['label'] == 'positive']
df_negative = df[df['label'] == 'negative']

# Number of positive and negative reviews
num_positive = len(df_positive)
num_negative = len(df_negative)


In [31]:
desired_minority_count=2*num_negative
required_augmentations=desired_minority_count-num_negative

In [32]:
# Function to augment texts until the desired number of samples is reached
def augment_texts_until_count(texts, augmenter, count):
    augmented_texts = []
    while len(augmented_texts) < count:
        augmented_batch = augmenter.augment(texts)
        if not augmented_batch:
            break
        augmented_texts.extend(augmented_batch)
        if len(augmented_texts) > count:
            augmented_texts = augmented_texts[:count]
    return augmented_texts

# Augment minority class texts
augmented_texts = []
augmented_texts.extend(augment_texts_until_count(df_minority['text'].tolist(), contextual_aug, required_augmentations))
augmented_texts.extend(augment_texts_until_count(df_minority['text'].tolist(), synonym_aug, required_augmentations))

# Create DataFrame for augmented texts
df_augmented_minority = pd.DataFrame({
    'text': augmented_texts,
    'label': ['minority'] * len(augmented_texts)
})

# Combine with the original data
df_combined = pd.concat([df_majority, df_augmented_minority], ignore_index=True)

# Ensure the desired number of minority samples
minority_count = len(df_combined[df_combined['label'] == 'minority'])
if minority_count < desired_minority_count:
    print(f"Warning: Only {minority_count} minority samples were obtained instead of {desired_minority_count}.")
elif minority_count > desired_minority_count:
    df_combined = pd.concat([
        df_combined[df_combined['label'] == 'majority'],
        df_combined[df_combined['label'] == 'minority'].sample(n=desired_minority_count, random_state=42)
    ], ignore_index=True)

# Print the balanced dataset
print(df_combined)
print(f"Number of minority samples: {len(df_combined[df_combined['label'] == 'minority'])}")
print(f"Number of majority samples: {len(df_combined[df_combined['label'] == 'majority'])}")


                          text     label
0          The quick brown fox  majority
1          The quick brown fox  majority
2          The quick brown fox  majority
3          The quick brown fox  majority
4          The quick brown fox  majority
5          The quick brown fox  majority
6          The quick brown fox  majority
7          The quick brown fox  majority
8          The quick brown fox  majority
9          The quick brown fox  majority
10     in ipsum dolor sit amet  minority
11  lorem ipsum dolor for amet  minority
12  Lorem ipsum dolor sit amet  minority
13  Lorem ipsum dolor sit amet  minority
Number of minority samples: 4
Number of majority samples: 10
