In [2]:
# Google Colab: Upload CSV file
from google.colab import files
import pandas as pd

# Step 0: Upload the CSV File
uploaded = files.upload()  # Choose 'customer_shopping_data.csv' from your system

# Step 1: Load the Dataset
df = pd.read_csv(next(iter(uploaded)))  # Automatically gets the uploaded file
print("Original Dataset:")
print(df.head())

# Track initial stats
initial_shape = df.shape
nulls_before = df.isnull().sum().sum()
duplicates_before = df.duplicated().sum()

# Step 2: Handle Missing Values
df.ffill(inplace=True)
nulls_after = df.isnull().sum().sum()
nulls_filled = nulls_before - nulls_after

# Step 3: Remove Duplicate Rows
df.drop_duplicates(inplace=True)
duplicates_removed = duplicates_before

# Step 4: Standardize Text Values (example: gender, category)
if 'Gender' in df.columns:
    df['Gender'] = df['Gender'].str.lower().str.strip()
if 'Category' in df.columns:
    df['Category'] = df['Category'].str.lower().str.strip()

# Step 5: Convert Date Formats
for col in df.columns:
    if 'date' in col.lower():
        df[col] = pd.to_datetime(df[col], errors='coerce')

# Step 6: Rename Column Headers
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Step 7: Check and Fix Data Types
for col in ['age', 'quantity', 'price']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

# Step 8: Save Cleaned Dataset
df.to_csv('cleaned_customer_shopping_data.csv', index=False)

# Optional: Download cleaned file
files.download('cleaned_customer_shopping_data.csv')

# Step 9: Final Summary
final_shape = df.shape

print("\nCleaned Dataset Preview:")
print(df.head())

print("\nSummary of Changes:")
print(f"- Original shape: {initial_shape}")
print(f"- Final shape: {final_shape}")
print(f"- Missing values filled: {nulls_filled}")
print(f"- Duplicate rows removed: {duplicates_removed}")
print(f"- Column names standardized (lowercase, no spaces)")

if 'gender' in df.columns:
    print(f"- 'gender' column standardized (lowercased & stripped)")
if 'category' in df.columns:
    print(f"- 'category' column standardized (lowercased & stripped)")
for col in df.columns:
    if 'date' in col:
        print(f"- '{col}' column converted to datetime format")
for col in ['age', 'quantity', 'price']:
    if col in df.columns:
        print(f"- '{col}' column converted to integer type")


Saving customer_shopping_data.csv to customer_shopping_data.csv
Original Dataset:
  invoice_no customer_id  gender  age  category  quantity    price  \
0    I138884     C241288  Female   28  Clothing         5  1500.40   
1    I317333     C111565    Male   21     Shoes         3  1800.51   
2    I127801     C266599    Male   20  Clothing         1   300.08   
3    I173702     C988172  Female   66     Shoes         5  3000.85   
4    I337046     C189076  Female   53     Books         4    60.60   

  payment_method invoice_date   shopping_mall  
0    Credit Card     5/8/2022          Kanyon  
1     Debit Card   12/12/2021  Forum Istanbul  
2           Cash    9/11/2021       Metrocity  
3    Credit Card   16/05/2021    Metropol AVM  
4           Cash   24/10/2021          Kanyon  


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Cleaned Dataset Preview:
  invoice_no customer_id  gender  age  category  quantity  price  \
0    I138884     C241288  Female   28  Clothing         5   1500   
1    I317333     C111565    Male   21     Shoes         3   1800   
2    I127801     C266599    Male   20  Clothing         1    300   
3    I173702     C988172  Female   66     Shoes         5   3000   
4    I337046     C189076  Female   53     Books         4     60   

  payment_method invoice_date   shopping_mall  
0    Credit Card   2022-05-08          Kanyon  
1     Debit Card   2021-12-12  Forum Istanbul  
2           Cash   2021-09-11       Metrocity  
3    Credit Card          NaT    Metropol AVM  
4           Cash          NaT          Kanyon  

Summary of Changes:
- Original shape: (99457, 10)
- Final shape: (99457, 10)
- Missing values filled: 0
- Duplicate rows removed: 0
- Column names standardized (lowercase, no spaces)
- 'gender' column standardized (lowercased & stripped)
- 'category' column standardized (lowe