# In [2]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler

# Build a class-balanced sentiment dataset from raw Amazon reviews:
# read 'amazon_reviews.csv', clean the rating/text columns, derive a
# three-way sentiment label, randomly oversample the minority classes and
# write the result to 'oversampled_data.csv'.

# Load the CSV file
data = pd.read_csv('amazon_reviews.csv')

# Select only the desired columns. The explicit .copy() makes this an
# independent frame, so the column assignments below do not raise
# SettingWithCopyWarning.
data = data[['id', 'overall', 'reviewText']].copy()

# Convert 'overall' column to numeric; unparseable ratings become NaN
data['overall'] = pd.to_numeric(data['overall'], errors='coerce')

# Remove reviews with 'overall' values outside the range of 1 to 5.
# NaN ratings fail the comparison and are removed here as well.
# inclusive='both' replaces the boolean form (inclusive=True), which was
# deprecated in pandas 1.3 and is rejected by pandas 2.0+.
data = data[data['overall'].between(1, 5, inclusive='both')].copy()

# Handle missing or invalid values in the review text column: fill missing
# text with an empty string first, otherwise astype(str) would turn NaN into
# the literal review text 'nan'.
data['reviewText'] = data['reviewText'].fillna('').astype(str)

# Map sentiment labels: 1-2 -> Negative, 3 -> Neutral, 4-5 -> Positive
data['sentiment'] = data['overall'].map({
    1: 'Negative',
    2: 'Negative',
    3: 'Neutral',
    4: 'Positive',
    5: 'Positive',
})

# Ratings that are in range but not whole numbers (e.g. 4.5) have no entry in
# the mapping and come out as NaN; drop them so NaN is never passed to the
# oversampler as a label.
data = data.dropna(subset=['sentiment'])

# Separate the features and labels
X = data['reviewText']
y = data['sentiment']

# Apply oversampling to the data. The sampler expects a 2-D feature array,
# so the single text column is reshaped to (n_samples, 1).
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X.to_numpy().reshape(-1, 1), y)

# Convert the oversampled data back to a DataFrame (features flattened back
# to one text value per row)
resampled_data = pd.DataFrame({'reviewText': X_resampled.ravel(), 'sentiment': y_resampled})

# Save the resampled data to a CSV file
resampled_data.to_csv('oversampled_data.csv', index=False)


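# NOTE(review): the line below appears to be notebook output residue (the
# source line echoed by a pandas warning about `inclusive=True`), not code:
#   data = data[data['overall'].between(1, 5, inclusive=True)]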
