In [None]:
# ---- Step 1: Install profiling library (only once per runtime) ----
!pip install ydata-profiling --quiet

# ---- Step 2: Upload file ----
from google.colab import files
import pandas as pd

uploaded = files.upload()
filename = list(uploaded.keys())[0]

# ---- Step 3: Load dataset ----
df = pd.read_csv(filename)
print("✅ Dataset loaded:", df.shape)

# ---- Step 4: Check and drop duplicates ----
duplicates_before = df.duplicated().sum()
df = df.drop_duplicates()
duplicates_after = df.duplicated().sum()

print(f"🟡 Duplicate rows before cleaning: {duplicates_before}")
print(f"🟢 Duplicate rows after cleaning: {duplicates_after}")

# ---- Step 5: Clean price columns ----
# Pull out the first well-formed amount (digits, optional thousands commas,
# optional decimals). Merely deleting non-numeric characters keeps stray dots,
# so "M.R.P: ₹120" would become "...120" and be coerced to NaN.
for col in ['mrp', 'price_whole']:
    if col in df.columns:
        amount_text = (
            df[col]
            .astype(str)
            .str.extract(r'(\d[\d,]*(?:\.\d+)?)', expand=False)
            .str.replace(',', '', regex=False)   # drop thousands separators
            .fillna('0')                         # no amount found -> 0, as before
        )
        df[col] = pd.to_numeric(amount_text, errors='coerce')

# ---- Step 6: Add price-related features ----
if 'mrp' in df.columns and 'price_whole' in df.columns:
    # Rows whose MRP is missing or zero have no meaningful discount; masking
    # them yields NaN instead of +/-inf from a division by zero.
    valid_mrp = df['mrp'].where(df['mrp'] > 0)
    df['discount_percent'] = ((valid_mrp - df['price_whole']) / valid_mrp * 100).round(2)
    df['is_discounted'] = (df['price_whole'] < df['mrp']).astype(int)
    print("✅ Added 'discount_percent' and 'is_discounted' columns")
else:
    print("⚠️ Columns 'mrp' and 'price_whole' not found, skipping feature engineering.")

# ---- Step 7: Save & Download cleaned dataset ----
# Write the cleaned frame to disk, then hand it to the browser for download.
cleaned_file = "Haldiram_cleaned_dataset.csv"
df.to_csv(path_or_buf=cleaned_file, index=False)
files.download(cleaned_file)

print("📥 Cleaned dataset ready for download:", cleaned_file)

# ---- Step 8: Generate Profiling Report ----
from ydata_profiling import ProfileReport

profile = ProfileReport(
    df,
    title="Haldiram Dataset Profiling Report",
    explorative=True,
)
profile.to_notebook_iframe()   # render inline in the notebook

# Persist the same report as a standalone HTML file and download it
report_file = "Haldiram_Profiling_Report.html"
profile.to_file(report_file)
files.download(report_file)

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.1/400.1 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.5/296.5 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m679.0/679.0 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.7/37.7 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.4/105.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.3/43.3 kB[0m [31m3.0 MB/s[0m eta [36

Saving Haldiram_Dataset_1000_entries.csv to Haldiram_Dataset_1000_entries.csv
✅ Dataset loaded: (1193, 12)
🟡 Duplicate rows before cleaning: 7
🟢 Duplicate rows after cleaning: 0
✅ Added 'discount_percent' and 'is_discounted' columns


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📥 Cleaned dataset ready for download: Haldiram_cleaned_dataset.csv


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/14 [00:00<?, ?it/s][A
  7%|▋         | 1/14 [00:00<00:01,  8.67it/s][A
 57%|█████▋    | 8/14 [00:00<00:00, 38.12it/s][A
100%|██████████| 14/14 [00:00<00:00, 37.39it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Step 1: Install Great Expectations with compatible versions
# pandas and numpy are pinned to exact versions here; great_expectations itself is not pinned.
!pip install great_expectations pandas==2.2.2 numpy==1.26.4 --quiet

# Step 2: Restart the runtime after this cell (Runtime > Restart runtime)
# NOTE(review): presumably required so the freshly installed pandas/numpy are the ones imported — confirm.

In [None]:
import pandas as pd
from great_expectations.dataset import PandasDataset

# Read the cleaned CSV and wrap it in the legacy Great Expectations dataset API
df = pd.read_csv(filepath_or_buffer="Haldiram_cleaned_dataset.csv")
gx_df = PandasDataset(df)

# ✅ Only the price-related columns are checked, and only for nulls
price_columns = [
    "price_whole",
    "per_count_price",
    "mrp",
    "discount",
]

# Register one not-null expectation per price column
for price_column in price_columns:
    gx_df.expect_column_values_to_not_be_null(price_column)

# Evaluate every registered expectation and show the outcome
results = gx_df.validate()
print(results)

{
  "success": true,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_not_be_null",
        "kwargs": {
          "column": "price_whole",
          "result_format": "BASIC"
        },
        "meta": {}
      },
      "result": {
        "element_count": 1186,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "unexpected_percent_total": 0.0,
        "partial_unexpected_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      }
    },
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_not_be_null",
        "kwargs": {
          "column": "per_count_price",
          "result_format": "BASIC"
        },
        "meta": {}
      },
      "result": {
        "element_count": 1186,
        "unexpected_count": 0,
  

In [None]:
# Install Great Expectations; no version is pinned on this line.
pip install great-expectations



In [None]:
import pandas as pd
import great_expectations as gx

# -------------------------
# 1. Load your dataset
# -------------------------
df = pd.read_csv("HaldiramCleanedDataset.csv")   # change path to your file

# -------------------------
# 2. Create Great Expectations context
# -------------------------
# Ephemeral mode = keeps everything in memory (no project folder needed)
context = gx.get_context(mode="ephemeral")

# Register the dataframe through the GX 1.x fluent API. This cell previously
# failed with "'EphemeralDataContext' object has no attribute 'sources'"
# because it mixed the 0.x entry point (`context.sources`) with 1.x calls.
datasource = context.data_sources.add_pandas(name="my_datasource")
asset = datasource.add_dataframe_asset(name="my_dataframe")
batch_definition = asset.add_batch_definition_whole_dataframe("whole_dataframe")
batch = batch_definition.get_batch(batch_parameters={"dataframe": df})

# -------------------------
# 3. Create an Expectation Suite
# -------------------------
suite = context.suites.add(gx.ExpectationSuite(name="product_schema_suite"))

# -------------------------
# 4. Define schema expectations
# -------------------------
# Column name -> expected dtype. Insertion order doubles as the expected
# column order, so the schema is declared exactly once.
expected_dtypes = {
    "product_name": "object",
    "category": "object",
    "rating": "float64",
    "sales": "object",
    "price_whole": "int64",
    "per_count_price": "int64",
    "mrp": "int64",
    "discount": "object",
    "review_text": "object",
    "number_of_global_ratings": "int64",
    "number_of_reviews": "int64",
    "product_weight": "object",
    "discount_percent": "float64",
    "is_discounted": "int64",
}

# Expect exact columns in this order
suite.add_expectation(
    gx.expectations.ExpectTableColumnsToMatchOrderedList(column_list=list(expected_dtypes))
)

# Expect data types
for column_name, dtype_name in expected_dtypes.items():
    suite.add_expectation(
        gx.expectations.ExpectColumnValuesToBeOfType(column=column_name, type_=dtype_name)
    )

# Additional constraints (optional, but good practice)
suite.add_expectation(gx.expectations.ExpectColumnValuesToNotBeNull(column="product_name"))
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(column="rating", min_value=0, max_value=5)
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="discount_percent", min_value=0, max_value=100
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeInSet(column="is_discounted", value_set=[0, 1])
)

# -------------------------
# 5. Validate the dataset
# -------------------------
results = batch.validate(suite)

# Print a summary of validation results
print("Validation Results:")
print(results)


INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmp_uhn5gse' for ephemeral docs site


AttributeError: 'EphemeralDataContext' object has no attribute 'sources'

In [None]:
import pandas as pd
import great_expectations as gx
from great_expectations.core.expectation_suite import ExpectationSuite
from great_expectations.validator.validator import Validator
from great_expectations.execution_engine import PandasExecutionEngine
from great_expectations.core.batch import Batch, BatchData, BatchSpec

# -------------------------
# 1. Load dataset
# -------------------------
df = pd.read_csv("HaldiramCleanedDataset.csv")   # <-- your uploaded dataset

# -------------------------
# 2. Setup Execution Engine & Expectation Suite
# -------------------------
# NOTE(review): recent GX releases appear to require an active Data Context
# before an ExpectationSuite can be constructed — confirm for the installed version.
gx.get_context(mode="ephemeral")

execution_engine = PandasExecutionEngine()
# The suite keyword is `name`; `expectation_suite_name=` raised
# "TypeError: ... unexpected keyword argument" when this cell was last run.
suite = ExpectationSuite(name="product_schema_suite")

# Hand the DataFrame to the Batch directly.
# NOTE(review): a bare BatchSpec is presumably not a spec type that
# PandasExecutionEngine.get_batch_data accepts — confirm before restoring that call.
batch = Batch(data=df)

# Create Validator
validator = Validator(
    execution_engine=execution_engine,
    batches=[batch],
    expectation_suite=suite,
)

# -------------------------
# 3. Define schema expectations
# -------------------------
# Column name -> expected dtype; insertion order is also the expected column order.
expected_dtypes = {
    "product_name": "object",
    "category": "object",
    "rating": "float64",
    "sales": "object",
    "price_whole": "int64",
    "per_count_price": "int64",
    "mrp": "int64",
    "discount": "object",
    "review_text": "object",
    "number_of_global_ratings": "int64",
    "number_of_reviews": "int64",
    "product_weight": "object",
    "discount_percent": "float64",
    "is_discounted": "int64",
}

validator.expect_table_columns_to_match_ordered_list(column_list=list(expected_dtypes))

for column_name, dtype_name in expected_dtypes.items():
    validator.expect_column_values_to_be_of_type(column_name, dtype_name)

# Extra constraints
validator.expect_column_values_to_not_be_null("product_name")
validator.expect_column_values_to_be_between("rating", min_value=0, max_value=5)
validator.expect_column_values_to_be_between("discount_percent", min_value=0, max_value=100)
validator.expect_column_values_to_be_in_set("is_discounted", [0, 1])

# -------------------------
# 4. Validate
# -------------------------
results = validator.validate()

print("Validation Results:")
print(results)


TypeError: ExpectationSuite.__init__() got an unexpected keyword argument 'expectation_suite_name'

In [None]:
import pandas as pd
import re

# --- 1. Load the original dataset ---
df = pd.read_csv('HaldiramCleanedDataset.csv')

# --- 2. product_name cleaning ---
# Keep word characters, whitespace, commas and hyphens; leave non-strings untouched.
df['product_name'] = df['product_name'].apply(lambda x: re.sub(r'[^\w\s,-]', '', x) if isinstance(x, str) else x)

# --- 3. sales cleaning and conversion ---
# Fixed labels and the numeric value each one stands for.
sales_mapping = {
    '100+ bought in past month': 100,
    '50+ bought in past month': 50,
    'Less than 50 bought in past month': 25,
    'Bestseller': 200,
    'New Arrival': 10
}


def _sales_label_to_count(label):
    """Translate a sales badge into a number.

    Known labels use ``sales_mapping``. Any other "<N>+ bought" / "<N>K+ bought"
    label is parsed from its number instead of silently collapsing to 0.
    Missing or unrecognised labels yield 0.
    """
    if isinstance(label, str):
        if label in sales_mapping:
            return sales_mapping[label]
        bought = re.search(r'(\d+(?:\.\d+)?)\s*(k)?\+\s*bought', label, flags=re.IGNORECASE)
        if bought:
            scale = 1000 if bought.group(2) else 1
            return int(float(bought.group(1)) * scale)
    return 0


df['sales_numeric'] = df['sales'].map(_sales_label_to_count).astype(int)

# --- 4. discount cleaning and conversion ---
# astype(str) keeps the .str accessor usable even when the column is entirely
# empty (float NaN); expand=False returns a Series rather than a 1-column frame.
df['discount_percentage_cleaned'] = (
    df['discount']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
    .fillna(0)
    .astype(int)
)

# --- 5. product_weight cleaning and conversion ---
def convert_weight_to_grams(weight):
    """Convert a free-text weight such as "1.5 kg" or "200g" into grams.

    Args:
        weight: Raw weight text. Non-string values (NaN, None, numbers) are
            treated as missing.

    Returns:
        The weight in grams as a float, or None when the value is not a
        string, contains no number, or mentions no gram/kilogram unit.
    """
    if not isinstance(weight, str):
        return None
    text = weight.lower()

    # Preferred: a number directly attached to a weight unit, so that
    # "Pack of 2, 400 g" yields 400 rather than the leading pack count.
    with_unit = re.search(r'(\d+(?:\.\d+)?)\s*(kilograms?|kgs?|grams?|gms?|g)\b', text)
    if with_unit:
        grams = float(with_unit.group(1))
        return grams * 1000 if with_unit.group(2).startswith('k') else grams

    # Fallback: the earlier heuristic (first number anywhere + unit letters
    # anywhere), but without crashing when the text holds no valid number.
    first_number = re.search(r'\d+(?:\.\d+)?', text)
    if first_number is None:
        return None
    if 'kg' in text:
        return float(first_number.group()) * 1000
    if 'g' in text:
        return float(first_number.group())
    return None  # no unit found

# Weights that could not be parsed count as 0 g; the result is stored as whole grams
weight_in_grams = df['product_weight'].apply(convert_weight_to_grams)
df['product_weight_grams'] = weight_in_grams.fillna(0).astype(int)

# --- 6. review_text cleaning ---
# Strip the literal "Read more" marker and any surrounding whitespace
review_without_marker = df['review_text'].str.replace('Read more', '', regex=False)
df['review_text_cleaned'] = review_without_marker.str.strip()


# --- 7. Drop old and unnecessary columns ---
columns_to_drop = [
    'sales',
    'discount',
    'product_weight',
    'review_text',
    'discount_percent',
    'is_discounted',
]
# Only columns actually present in the frame are dropped
columns_that_exist_to_drop = [name for name in columns_to_drop if name in df.columns]
df = df.drop(columns=columns_that_exist_to_drop)


# --- 8. Save the final cleaned dataset ---
df.to_csv(path_or_buf='Haldiram_end_to_end_cleaned.csv', index=False)

print("Full data cleaning and transformation complete!")
print("Cleaned data saved to 'Haldiram_end_to_end_cleaned.csv'")

Full data cleaning and transformation complete!
Cleaned data saved to 'Haldiram_end_to_end_cleaned.csv'


In [None]:
import pandas as pd
import re
from textblob import TextBlob

# --- 1. Read the input CSV into a DataFrame ---
df = pd.read_csv(
    filepath_or_buffer='Haldirams_Final_cleaned - Haldirams_Final_cleaned.csv',
)

# --- 2. Remove emojis from the review column ---
# Emoji-only code-point ranges, compiled once. A single span from U+24C2 to
# U+1F251 would also swallow CJK, Hangul, fullwidth forms and other real text,
# so each emoji block is listed on its own.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F0FF"  # mahjong tiles, dominoes, playing cards
    "\U0001F100-\U0001F2FF"  # enclosed alphanumerics / ideographs
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U00002B00-\U00002BFF"  # misc symbols & arrows (e.g. star)
    "\U000024C2"             # circled M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with most emoji removed.

    Args:
        text: Review text. Anything that is not a string (NaN, None, numbers)
            is treated as missing.

    Returns:
        The text without emoji characters, or '' for non-string input.
    """
    if not isinstance(text, str):
        return ''
    return _EMOJI_PATTERN.sub('', text)

# Build an emoji-free copy of the review text in its own column
df['review_text_no_emoji'] = df['review_text_cleaned'].map(remove_emojis)


# --- 3. Perform Sentiment Analysis with TextBlob ---
def get_sentiment_textblob(text):
    """Label a review as 'positive', 'negative' or 'neutral' via TextBlob polarity.

    Non-string input and blank/whitespace-only strings are labelled 'neutral'
    without invoking TextBlob. Otherwise the sign of the polarity score decides
    the label; a score of exactly 0 is 'neutral'.
    """
    has_content = isinstance(text, str) and text.strip() != ""
    if not has_content:
        return 'neutral'

    polarity = TextBlob(text).sentiment.polarity

    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'

# Score every emoji-free review and store the label
df['sentiment'] = df['review_text_no_emoji'].map(get_sentiment_textblob)


# --- 4. Save the new dataframe ---
# The emoji-free helper column was only needed for scoring, so it is not exported
df_final = df.drop(columns=['review_text_no_emoji'])
df_final.to_csv(path_or_buf='Haldirams_with_sentiment.csv', index=False)


for status_line in (
    "Emoji removal and sentiment analysis complete.",
    "New 'sentiment' column has been added.",
):
    print(status_line)

Emoji removal and sentiment analysis complete.
New 'sentiment' column has been added.


In [None]:
import pandas as pd
import re

# Read the reclassified dataset into a DataFrame
file_path = "Haldirams_reclassified_entire_dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Function to remove emojis
# Emoji-only code-point ranges, compiled once. A single span from U+24C2 to
# U+1F251 would also swallow CJK, Hangul, fullwidth forms and other real text,
# so each emoji block is listed on its own.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F0FF"  # mahjong tiles, dominoes, playing cards
    "\U0001F100-\U0001F2FF"  # enclosed alphanumerics / ideographs
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U00002B00-\U00002BFF"  # misc symbols & arrows (e.g. star)
    "\U000024C2"             # circled M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` (coerced to str) with most emoji removed.

    Args:
        text: Review text. Non-string values are converted with ``str()``,
            except missing values.

    Returns:
        The emoji-free string. None and float NaN yield "" so that missing
        reviews do not turn into the literal text "None"/"nan".
    """
    if text is None or (isinstance(text, float) and text != text):  # NaN != NaN
        return ""
    return _EMOJI_PATTERN.sub('', str(text))

# Overwrite review_text_cleaned with its emoji-free version
df['review_text_cleaned'] = df['review_text_cleaned'].map(remove_emojis)

# Write the result to a separate CSV, leaving the input file untouched
output_path = "Haldirams_cleaned_reviews.csv"
df.to_csv(path_or_buf=output_path, index=False)

print(f"Cleaned file saved as: {output_path}")

Cleaned file saved as: Haldirams_cleaned_reviews.csv


Website data cleaning - SMA experiment 3

In [None]:
import pandas as pd
import re

# --- 1. Load the original dataset ---
df = pd.read_csv('Website_dataset.csv')

# --- 2. product_name cleaning ---
# Keep word characters, whitespace, commas and hyphens; leave non-strings untouched.
df['product_name'] = df['product_name'].apply(lambda x: re.sub(r'[^\w\s,-]', '', x) if isinstance(x, str) else x)

# --- 3. sales cleaning and conversion ---
# Fixed labels and the numeric value each one stands for.
sales_mapping = {
    '100+ bought in past month': 100,
    '50+ bought in past month': 50,
    'Less than 50 bought in past month': 25,
    'Bestseller': 200,
    'New Arrival': 10
}


def _sales_label_to_count(label):
    """Translate a sales badge into a number.

    Known labels use ``sales_mapping``. Any other "<N>+ bought" / "<N>K+ bought"
    label is parsed from its number instead of silently collapsing to 0.
    Missing or unrecognised labels yield 0.
    """
    if isinstance(label, str):
        if label in sales_mapping:
            return sales_mapping[label]
        bought = re.search(r'(\d+(?:\.\d+)?)\s*(k)?\+\s*bought', label, flags=re.IGNORECASE)
        if bought:
            scale = 1000 if bought.group(2) else 1
            return int(float(bought.group(1)) * scale)
    return 0


df['sales_numeric'] = df['sales'].map(_sales_label_to_count).astype(int)

# --- 4. discount cleaning and conversion ---
# astype(str) keeps the .str accessor usable even when the column is entirely
# empty (float NaN); expand=False returns a Series rather than a 1-column frame.
df['discount_percentage_cleaned'] = (
    df['discount']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
    .fillna(0)
    .astype(int)
)

# --- 5. product_weight cleaning and conversion ---
def convert_weight_to_grams(weight):
    """Convert a free-text weight such as "1.5 kg" or "200g" into grams.

    Args:
        weight: Raw weight text. Non-string values (NaN, None, numbers) are
            treated as missing.

    Returns:
        The weight in grams as a float, or None when the value is not a
        string, contains no number, or mentions no gram/kilogram unit.
    """
    if not isinstance(weight, str):
        return None
    text = weight.lower()

    # Preferred: a number directly attached to a weight unit, so that
    # "Pack of 2, 400 g" yields 400 rather than the leading pack count.
    with_unit = re.search(r'(\d+(?:\.\d+)?)\s*(kilograms?|kgs?|grams?|gms?|g)\b', text)
    if with_unit:
        grams = float(with_unit.group(1))
        return grams * 1000 if with_unit.group(2).startswith('k') else grams

    # Fallback: the earlier heuristic (first number anywhere + unit letters
    # anywhere), but without crashing when the text holds no valid number.
    first_number = re.search(r'\d+(?:\.\d+)?', text)
    if first_number is None:
        return None
    if 'kg' in text:
        return float(first_number.group()) * 1000
    if 'g' in text:
        return float(first_number.group())
    return None  # no unit found

# Weights that could not be parsed count as 0 g; the result is stored as whole grams
weight_in_grams = df['product_weight'].apply(convert_weight_to_grams)
df['product_weight_grams'] = weight_in_grams.fillna(0).astype(int)

# --- 6. review_text cleaning ---
# Strip the literal "Read more" marker and any surrounding whitespace
review_without_marker = df['review_text'].str.replace('Read more', '', regex=False)
df['review_text_cleaned'] = review_without_marker.str.strip()


# --- 7. Drop old and unnecessary columns ---
columns_to_drop = [
    'sales',
    'discount',
    'product_weight',
    'review_text',
    'discount_percent',
    'is_discounted',
]
# Only columns actually present in the frame are dropped
columns_that_exist_to_drop = [name for name in columns_to_drop if name in df.columns]
df = df.drop(columns=columns_that_exist_to_drop)


# --- 8. Save the final cleaned dataset ---
df.to_csv(path_or_buf='Haldiram_end_to_end_cleaned.csv', index=False)

print("Full data cleaning and transformation complete!")
print("Cleaned data saved to 'Haldiram_end_to_end_cleaned.csv'")

Full data cleaning and transformation complete!
Cleaned data saved to 'Haldiram_end_to_end_cleaned.csv'


In [None]:
import pandas as pd
import re

# --- 1. Read the end-to-end cleaned CSV into a DataFrame ---
df = pd.read_csv(
    filepath_or_buffer='Haldiram_end_to_end_cleaned.csv',
)


# --- 2. Remove emojis from the review column ---
# Emoji-only code-point ranges, compiled once. A single span from U+24C2 to
# U+1F251 would also swallow CJK, Hangul, fullwidth forms and other real text,
# so each emoji block is listed on its own.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F0FF"  # mahjong tiles, dominoes, playing cards
    "\U0001F100-\U0001F2FF"  # enclosed alphanumerics / ideographs
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U00002B00-\U00002BFF"  # misc symbols & arrows (e.g. star)
    "\U000024C2"             # circled M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with most emoji removed.

    Args:
        text: Review text. Anything that is not a string (NaN, None, numbers)
            is treated as missing.

    Returns:
        The text without emoji characters, or '' for non-string input.
    """
    if not isinstance(text, str):
        return ''
    return _EMOJI_PATTERN.sub('', text)

# Apply the function to the review column
df['review_text_no_emoji'] = df['review_text_cleaned'].apply(remove_emojis)

# --- 3. Promote the emoji-free text, then drop the temporary column ---
# Dropping the helper column without copying it back would export the
# original, emoji-laden reviews.
df_final = (
    df.assign(review_text_cleaned=df['review_text_no_emoji'])
      .drop(columns=['review_text_no_emoji'])
)

# --- 4. Save the cleaned dataframe ---
df_final.to_csv('Haldirams_cleaned_no_sentiment.csv', index=False)

print("Emoji removal complete. No sentiment column added.")

Emoji removal complete. No sentiment column added.


In [None]:
import pandas as pd
import re

# Read the reclassified dataset into a DataFrame
file_path = "Haldirams_reclassified_entire_dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Function to remove emojis
# Emoji-only code-point ranges, compiled once. A single span from U+24C2 to
# U+1F251 would also swallow CJK, Hangul, fullwidth forms and other real text,
# so each emoji block is listed on its own.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F0FF"  # mahjong tiles, dominoes, playing cards
    "\U0001F100-\U0001F2FF"  # enclosed alphanumerics / ideographs
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U00002B00-\U00002BFF"  # misc symbols & arrows (e.g. star)
    "\U000024C2"             # circled M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` (coerced to str) with most emoji removed.

    Args:
        text: Review text. Non-string values are converted with ``str()``,
            except missing values.

    Returns:
        The emoji-free string. None and float NaN yield "" so that missing
        reviews do not turn into the literal text "None"/"nan".
    """
    if text is None or (isinstance(text, float) and text != text):  # NaN != NaN
        return ""
    return _EMOJI_PATTERN.sub('', str(text))

# Overwrite review_text_cleaned with its emoji-free version
df['review_text_cleaned'] = df['review_text_cleaned'].map(remove_emojis)

# Write the result to a separate CSV, leaving the input file untouched
output_path = "Haldirams_cleaned_reviews.csv"
df.to_csv(path_or_buf=output_path, index=False)

print(f"Cleaned file saved as: {output_path}")

In [None]:
import pandas as pd
import re

# --- 1. Read the emoji-cleaned (no sentiment) CSV into a DataFrame ---
file_path = "Haldirams_cleaned_no_sentiment.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# --- 2. Function to remove emojis ---
# Emoji-only code-point ranges, compiled once. A single span from U+24C2 to
# U+1F251 would also swallow CJK, Hangul, fullwidth forms and other real text,
# so each emoji block is listed on its own.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F0FF"  # mahjong tiles, dominoes, playing cards
    "\U0001F100-\U0001F2FF"  # enclosed alphanumerics / ideographs
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U00002B00-\U00002BFF"  # misc symbols & arrows (e.g. star)
    "\U000024C2"             # circled M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` (coerced to str) with most emoji removed.

    Args:
        text: Review text. Non-string values are converted with ``str()``,
            except missing values.

    Returns:
        The emoji-free string. None and float NaN yield "" so that missing
        reviews do not turn into the literal text "None"/"nan".
    """
    if text is None or (isinstance(text, float) and text != text):  # NaN != NaN
        return ""
    return _EMOJI_PATTERN.sub('', str(text))

# --- 3. Function to remove unwanted symbols from text ---
def remove_symbols(text):
    """Strip everything except ASCII letters, digits and whitespace.

    Returns "" for any non-string input.
    """
    if isinstance(text, str):
        # Anything outside A-Z, a-z, 0-9 and whitespace is deleted
        return re.sub(r'[^A-Za-z0-9\s]', '', text)
    return ""

# --- 4. Clean review_text_cleaned column ---
df['review_text_cleaned'] = df['review_text_cleaned'].apply(remove_emojis)
df['review_text_cleaned'] = df['review_text_cleaned'].apply(remove_symbols)

# --- 5. Clean per_count_price and mrp columns ---
# Take the first well-formed amount (digits, optional thousands commas,
# optional decimals). Keeping every digit and dot would fuse separate numbers,
# e.g. "₹45.00/100 g" -> "45.00100", and leave stray dots from prefixes such
# as "M.R.P: ₹" in front of the value.
for col in ['per_count_price', 'mrp']:
    if col in df.columns:
        amount_text = (
            df[col]
            .astype(str)
            .str.extract(r'(\d[\d,]*(?:\.\d+)?)', expand=False)
            .str.replace(',', '', regex=False)   # drop thousands separators
        )
        # Store real numbers; rows without an amount become NaN (an empty CSV cell)
        df[col] = pd.to_numeric(amount_text, errors='coerce')

# --- 6. Save the cleaned dataset ---
output_path = "Haldirams_cleaned_reviews.csv"
df.to_csv(output_path, index=False)

print(f"Cleaned file saved as: {output_path}")

Cleaned file saved as: Haldirams_cleaned_reviews.csv


In [None]:
import pandas as pd

# One path for both read and write, so the "save back" step really updates the
# file that was loaded. The previous output name differed by a single letter
# ("Haldiram_" vs "Haldirams_"), which silently created a second file.
reviews_csv = "Haldirams_cleaned_reviews.csv"

# Load the CSV
df = pd.read_csv(reviews_csv)

# Remove commas from 'price_whole' and convert to numeric
price_text = df['price_whole'].astype(str).str.replace(',', '', regex=False)
df['price_whole'] = pd.to_numeric(price_text, errors='coerce')  # converts to numbers, NaN if invalid

# Save back to CSV
df.to_csv(reviews_csv, index=False)