In [5]:
import pandas as pd

input_path = "flipkart_product.csv"
output_path = "flipkart_product_cleaned.csv"

# The raw export is decoded as latin-1.
df = pd.read_csv(input_path, encoding="latin-1")

# Missing values become "" and surrounding whitespace is trimmed, so a blank
# cell and a whitespace-only cell compare equal below.
for col in ("Review", "Summary"):
    df[col] = (
        df[col]
        .fillna("")
        .astype(str)
        .str.strip()
    )

# A row counts as empty only when *both* text columns are blank.
empty_mask = df[["Review", "Summary"]].eq("").all(axis=1)

# Keep rows that carry some text, then collapse rows identical in every column.
df = df.loc[~empty_mask].copy().drop_duplicates()

# Validate Rate: coerce to numeric (unparseable values become NaN) and keep
# only rows whose rating is exactly 1, 2, 3, 4 or 5.
df["Rate"] = pd.to_numeric(df["Rate"], errors="coerce")
# .copy() so the column assignments below write to an independent frame
# rather than to a filtered selection of the previous one.
df = df[df["Rate"].isin([1, 2, 3, 4, 5])].copy()
df["Rate"] = df["Rate"].astype(int)

if "Price" in df.columns:
    # Parse Price without losing the decimal point: drop thousands separators,
    # then take the first number found in the string. Deleting every non-digit
    # instead would turn "1,299.50" into 129950 and a value that is already
    # numeric (stringified as "1299.0") into 12990.
    price_text = df["Price"].astype(str).str.replace(",", "", regex=False)
    price_number = price_text.str.extract(r"(\d+(?:\.\d+)?)", expand=False)
    # Entries with no number at all (including missing values) become NaN.
    df["Price"] = pd.to_numeric(price_number, errors="coerce")

# Report the size of the cleaned frame and how many rows fall under each rating.
rate_counts = df["Rate"].value_counts().sort_index()
print("Cleaned shape:", df.shape)
print("Rate distribution:\n", rate_counts)

Cleaned shape: (165008, 5)
Rate distribution:
 Rate
1    18782
2     6015
3    13946
4    33207
5    93058
Name: count, dtype: int64


In [6]:
# Combined field: the summary followed by the review body, separated by one space.
df["text"] = df["Summary"].str.cat(df["Review"], sep=" ")

In [7]:
import re

# Emoji normalization uses emoji.demojize when available; falls back to no-op if package missing
try:
    import emoji  # type: ignore
except ImportError:
    emoji = None

# http(s) links and bare "www." links, up to the next whitespace; case-insensitive.
url_pattern = re.compile(r"https?://\S+|www\.\S+", flags=re.IGNORECASE)
# Anything from "<" through the next ">" (at least one character in between).
html_pattern = re.compile(r"<[^>]+>")
# Runs of newline, tab and carriage-return characters.
whitespace_pattern = re.compile(r"[\n\t\r]+")


def normalize_emojis(text: str) -> str:
    """Spell emojis out as plain words when the ``emoji`` package is available.

    When the module-level ``emoji`` is ``None`` (the import failed) the text is
    returned unchanged. Otherwise the result of ``emoji.demojize`` is
    post-processed by replacing every ":" with a space and every run of "_"
    with a single space. That replacement covers the whole string, so colons
    and underscores that were already in the text are replaced as well.
    """
    if emoji is not None:
        # Emojis become ":name:"-style shortcodes before the delimiters are removed.
        shortcoded = emoji.demojize(text, language="en")
        without_colons = shortcoded.replace(":", " ")
        return re.sub(r"_+", " ", without_colons)
    return text


def clean_text_value(text) -> str:
    """Return a lowercased, single-spaced copy of ``text`` with markup removed.

    Non-string input is converted first: values that ``pd.isna`` reports as
    missing become "", anything else goes through ``str()``. The steps then run
    in this order: lowercase; replace HTML-like tags, URLs and
    newline/tab/carriage-return runs with a space; normalize emojis; collapse
    every whitespace run to one space; strip both ends.
    """
    if isinstance(text, str):
        cleaned = text
    elif pd.isna(text):
        cleaned = ""
    else:
        cleaned = str(text)

    cleaned = cleaned.lower()
    # Each match is swapped for a space so neighbouring words do not fuse together.
    for pattern in (html_pattern, url_pattern, whitespace_pattern):
        cleaned = pattern.sub(" ", cleaned)
    cleaned = normalize_emojis(cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()

# Clean every text column that is actually present in the frame
text_cols = [name for name in ("Summary", "Review", "text") if name in df.columns]
for col in text_cols:
    df[col] = df[col].map(clean_text_value)



In [8]:
def map_sentiment(r):
    """Translate a numeric rating into a sentiment label.

    Ratings of 4 or more are "Positive", a rating equal to 3 is "Neutral",
    and every other value is "Negative".
    """
    if r >= 4:
        label = "Positive"
    elif r == 3:
        label = "Neutral"
    else:
        label = "Negative"
    return label

# Derive the sentiment label from each row's rating, then write the cleaned
# frame to disk without the index column.
df["sentiment"] = df["Rate"].map(map_sentiment)
df.to_csv(output_path, index=False)