In [1]:
import pandas as pd

In [2]:
#load data
df=pd.read_csv('final_cleaned.csv')

In [3]:
#view
df.head()

Unnamed: 0.1,Unnamed: 0,Review Text,Rating
0,0,Not great speakers,3.0
1,1,Great little gagit,4.0
2,2,Awesome 👏🏽,5.0
3,3,Love my Echo,5.0
4,4,Great device,5.0


In [4]:
import re

# Example: cleaning the review text

def clean_review(text):
    if pd.isnull(text):
        return ""

    # Remove emojis
    text = re.sub(r'[^\w\s,]', '', text.encode('ascii', 'ignore').decode('ascii'))
    
    # Remove special characters and digits (keep letters and basic punctuation if needed)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply to the column
df['Review'] = df['Review Text'].astype(str).apply(clean_review)

# View cleaned data
print(df[['Review Text', 'Review']].head())

          Review Text              Review
0  Not great speakers  not great speakers
1  Great little gagit  great little gagit
2          Awesome 👏🏽             awesome
3        Love my Echo        love my echo
4        Great device        great device


In [5]:
#checking conflicting reviews
from textblob import TextBlob

# Get sentiment polarity for each review (-1 = negative, 1 = positive)
df['Polarity'] = df['Review'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Optional: classify sentiment direction
def classify_sentiment(p):
    if p > 0.2:
        return 'positive'
    elif p < -0.2:
        return 'negative'
    else:
        return 'neutral'

df['Sentiment'] = df['Polarity'].apply(classify_sentiment)

In [6]:
# Define conflict conditions
conflict_mask = (
    ((df['Rating'] >= 4) & (df['Sentiment'] == 'negative')) |
    ((df['Rating'] <= 2) & (df['Sentiment'] == 'positive'))
)

# Get conflicting reviews for optional inspection
conflicting_reviews = df[conflict_mask]

# Remove conflicting reviews
cleaned_df = df[~conflict_mask].reset_index(drop=True)

print("Original size:", len(df))
print("After removing conflicts:", len(cleaned_df))


Original size: 86136
After removing conflicts: 83481


In [7]:
#drop unnecessary columns
cleaned_df.drop(['Unnamed: 0','Review Text','Polarity','Sentiment'],axis=1,inplace=True)

In [8]:
#view
cleaned_df.head()

Unnamed: 0,Rating,Review
0,3.0,not great speakers
1,4.0,great little gagit
2,5.0,awesome
3,5.0,love my echo
4,5.0,great device


In [10]:
#check nulls
cleaned_df.isnull().sum()

Rating    0
Review    0
dtype: int64

In [12]:
#check duplicates
cleaned_df.duplicated().sum()

np.int64(1843)

In [13]:
#remove duplicates
cleaned_df.drop_duplicates(inplace=True)

In [14]:
#rating class distribution
cleaned_df['Rating'].value_counts().sort_index()

Rating
1.0     7578
2.0     3274
3.0     7732
4.0    16930
5.0    46124
Name: count, dtype: int64

In [15]:
# loading the balanced dataset created
balanced_df=pd.read_csv('balanced_data_1.csv')

In [16]:
# Remove balanced data from cleaned_combined_df
remaining_df = pd.concat([cleaned_df, balanced_df]).drop_duplicates(keep=False)

print("Remaining dataset size:", len(remaining_df))
print(remaining_df['Rating'].value_counts())

Remaining dataset size: 96567
Rating
5.0    49105
4.0    19913
3.0    10718
1.0    10560
2.0     6271
Name: count, dtype: int64


In [17]:
# Step 3: Define target distribution
target_size = 10000

distribution = {
    1.0: int(target_size * 0.10),  # 10%
    2.0: int(target_size * 0.15),  # 15%
    3.0: int(target_size * 0.25),  # 25%
    4.0: int(target_size * 0.30),  # 30%
    5.0: int(target_size * 0.20),  # 20%
}

In [18]:
# Step 4: Sample based on target distribution
unbalanced_samples = []

for rating, count in distribution.items():
    subset = remaining_df[remaining_df['Rating'] == rating]
    available = len(subset)
    if available >= count:
        sample = subset.sample(n=count, random_state=42)
    else:
        print(f"⚠️ Only {available} samples available for Rating {rating}, requested {count}")
        sample = subset  # take all available
    unbalanced_samples.append(sample)

# Combine into final unbalanced dataset
unbalanced_df = pd.concat(unbalanced_samples).reset_index(drop=True)

# Final check
print(unbalanced_df['Rating'].value_counts(normalize=True) * 100)

Rating
4.0    30.0
3.0    25.0
5.0    20.0
2.0    15.0
1.0    10.0
Name: proportion, dtype: float64


In [20]:
#view
unbalanced_df.head()

Unnamed: 0.1,Rating,Review,Unnamed: 0
0,1.0,selfie camera is too worst and the images comi...,
1,1.0,very bad product does not work properly even a...,
2,1.0,just bought apple iphone s on monday i am disa...,
3,1.0,so far having a lot of trouble i am a amazon p...,5738.0
4,1.0,this phone is not worth for k amount as featur...,


In [22]:
#remove unnecessary column
unbalanced_df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [23]:
#view
unbalanced_df.head()

Unnamed: 0,Rating,Review
0,1.0,selfie camera is too worst and the images comi...
1,1.0,very bad product does not work properly even a...
2,1.0,just bought apple iphone s on monday i am disa...
3,1.0,so far having a lot of trouble i am a amazon p...
4,1.0,this phone is not worth for k amount as featur...


In [24]:
#save unbalanced file
unbalanced_df.to_csv("unbalanced_dataset.csv", index=False)