In [1]:
import pandas as pd
import numpy as np
import re

# Define the filename based on the previous script's output
FILE_NAME = "my_new_scrape_data.csv"

# --- 1. Load scraped data into pandas DataFrame ---
try:
    df = pd.read_csv(FILE_NAME)
    print(f"✅ Successfully loaded {len(df)} rows from {FILE_NAME}")
except FileNotFoundError:
    print(f"❌ Error: File '{FILE_NAME}' not found. Please run the scraping script successfully first.")
    exit()

print("\n--- Initial DataFrame Info ---")
df.info()

# --- 2. Clean price, rating, review counts, and handle missing values ---

## A. Price Cleaning
def clean_price(price):
    if pd.isna(price) or price == "N/A":
        return np.nan
    # Use regex to find and extract the first number (float or int)
    match = re.search(r'[\d\.,]+', str(price).replace(',', ''))
    if match:
        try:
            return float(match.group(0))
        except ValueError:
            return np.nan
    return np.nan

print("\n--- Cleaning Price Column ---")
# Apply cleaning function
df['Price_Clean'] = df['Price'].apply(clean_price)

# Handle cases where price might be a range (e.g., "$10.00 - $15.00").
# Since the regex grabs the first number, it keeps the lower bound, which is reasonable.

## B. Rating & Reviews Cleaning (Handling Placeholders)
# Since the original scraper used "N/A" for Rating and "0" for Reviews, 
# we'll convert these columns to numeric, treating 'N/A' as NaN.
print("--- Cleaning Rating and Reviews Columns ---")

# Replace the "N/A" placeholder for Rating with NaN for proper numeric conversion
df['Rating'] = df['Rating'].replace('N/A', np.nan)
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Convert Reviews to numeric. 'errors=coerce' turns non-numeric values (if any) into NaN.
df['Reviews'] = pd.to_numeric(df['Reviews'], errors='coerce').fillna(0).astype(int) 

## C. Handling Missing Values
print("--- Handling Missing Values ---")
# Drop rows where Name, Category, or the Cleaned Price are missing, as these are critical features.
initial_count = len(df)
df.dropna(subset=['Name', 'Category', 'Price_Clean'], inplace=True)
print(f"Dropped {initial_count - len(df)} rows with missing critical data.")

# Fill missing Ratings (NaN) with the average rating for that category
df['Rating'].fillna(df.groupby('Category')['Rating'].transform('mean'), inplace=True)

# For any remaining NaNs in 'Rating' (categories with no rated products), fill with 0
df['Rating'].fillna(0, inplace=True) 


# --- 3. Create at least two additional derived features ---

## Feature 1: Word Count in Product Name (Complexity/Specificity proxy)
# A longer name might suggest a more specific or complex product.
print("\n--- Creating Derived Feature 1: Name Word Count ---")
df['Name_Word_Count'] = df['Name'].apply(lambda x: len(str(x).split()))

## Feature 2: Price Rank within Category
# This shows how expensive a product is relative to others in the same category.
print("--- Creating Derived Feature 2: Price Rank within Category ---")
# Use 'rank' to get the relative rank (1 being the lowest price, ascending=True)
df['Price_Category_Rank'] = df.groupby('Category')['Price_Clean'].rank(method='min', ascending=False)


# --- Final Output ---
print("\n--- Final Cleaned & Transformed DataFrame Snapshot ---")
# Select the original columns and the new derived columns for a clean view
final_columns = ['Category', 'Name', 'Price_Clean', 'Rating', 'Reviews', 'Name_Word_Count', 'Price_Category_Rank', 'URL']
print(df[final_columns].head())

print(f"\nFinal row count after cleaning: {len(df)}")
df.info()

# Optional: Save the cleaned data
df[final_columns].to_csv("my_new_scrape_data_CLEANED.csv", index=False)
print("\nCleaned data saved to 'my_new_scrape_data_CLEANED.csv'")

✅ Successfully loaded 91 rows from my_new_scrape_data.csv

--- Initial DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91 entries, 0 to 90
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Category  91 non-null     object 
 1   Name      91 non-null     object 
 2   Price     91 non-null     object 
 3   Rating    0 non-null      float64
 4   Reviews   91 non-null     int64  
 5   URL       91 non-null     object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.4+ KB

--- Cleaning Price Column ---
--- Cleaning Rating and Reviews Columns ---
--- Handling Missing Values ---
Dropped 0 rows with missing critical data.

--- Creating Derived Feature 1: Name Word Count ---
--- Creating Derived Feature 2: Price Rank within Category ---

--- Final Cleaned & Transformed DataFrame Snapshot ---
      Category                                               Name  \
0  Home Garden  BULLCAPTAIN Genuine Leather

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Rating'].fillna(df.groupby('Category')['Rating'].transform('mean'), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Rating'].fillna(0, inplace=True)
