# Project Setup: customer-segmentation
This notebook initializes the analysis environment and downloads required data.
Run all cells in order, from top to bottom, to set up the project. The download step requires valid Kaggle API credentials.

In [1]:
# Essential imports
import os
import sys
import shutil
import pandas as pd
import numpy as np
from pathlib import Path
import kagglehub
from IPython.display import display

print("✓ Imports completed successfully")

✓ Imports completed successfully


In [2]:
# Project Configuration - These variables are shared across all notebooks
SLUG = 'customer-segmentation'
DATASET_KEY = 'vjchoudhary7/customer-segmentation-tutorial-in-python'

# Directory paths
# The repository root defaults to the original checkout location, but can be
# overridden per machine by setting the EDA_PRACTICE_ROOT environment variable
# before starting the kernel, so the notebook is not tied to one absolute path.
GIT_ROOT = Path(
    os.environ.get('EDA_PRACTICE_ROOT', '/Users/ravisharma/workdir/eda_practice')
).expanduser()

# Every project directory hangs off GIT_ROOT as <kind>/<SLUG>.
DATA_DIR = GIT_ROOT / 'data' / SLUG
FIG_DIR = GIT_ROOT / 'figures' / SLUG
REP_DIR = GIT_ROOT / 'reports' / SLUG
NOTEBOOK_DIR = GIT_ROOT / 'notebooks' / SLUG

# Make variables available to other notebooks in this folder
# `%store NAME` is an IPython line magic that persists the variable so that a
# later `%store -r` can restore it; each call reports "Stored 'NAME' (type)".
# Note that GIT_ROOT itself is not stored, only the values listed below.
%store SLUG
%store DATA_DIR
%store FIG_DIR
%store REP_DIR
%store NOTEBOOK_DIR
%store DATASET_KEY

# Echo the resolved configuration so the paths can be checked by eye.
print(f"Project: {SLUG}")
print(f"Data Directory: {DATA_DIR}")
print(f"Figures Directory: {FIG_DIR}")
print(f"Reports Directory: {REP_DIR}")
print("\n✓ Configuration variables set and stored")

Stored 'SLUG' (str)
Stored 'DATA_DIR' (PosixPath)
Stored 'FIG_DIR' (PosixPath)
Stored 'REP_DIR' (PosixPath)
Stored 'NOTEBOOK_DIR' (PosixPath)
Stored 'DATASET_KEY' (str)
Project: customer-segmentation
Data Directory: /Users/ravisharma/workdir/eda_practice/data/customer-segmentation
Figures Directory: /Users/ravisharma/workdir/eda_practice/figures/customer-segmentation
Reports Directory: /Users/ravisharma/workdir/eda_practice/reports/customer-segmentation

✓ Configuration variables set and stored


In [5]:
# Load stored variables
# Each `%store -r NAME` restores one variable previously saved with `%store NAME`.
%store -r SLUG
%store -r DATA_DIR
%store -r FIG_DIR
%store -r REP_DIR
%store -r NOTEBOOK_DIR
%store -r DATASET_KEY

# Verify variables were loaded correctly
# (the values are only printed for visual inspection; nothing is asserted)
print("Retrieved variables:")
print(f"SLUG = {SLUG}")
print(f"DATA_DIR = {DATA_DIR}")
print(f"DATASET_KEY = {DATASET_KEY}")
print(f"FIG_DIR = {FIG_DIR}")
print(f"REP_DIR = {REP_DIR}")
print(f"NOTEBOOK_DIR = {NOTEBOOK_DIR}")

Retrieved variables:
SLUG = customer-segmentation
DATA_DIR = /Users/ravisharma/workdir/eda_practice/data/customer-segmentation
DATASET_KEY = vjchoudhary7/customer-segmentation-tutorial-in-python
FIG_DIR = /Users/ravisharma/workdir/eda_practice/figures/customer-segmentation
REP_DIR = /Users/ravisharma/workdir/eda_practice/reports/customer-segmentation
NOTEBOOK_DIR = /Users/ravisharma/workdir/eda_practice/notebooks/customer-segmentation


In [4]:
# Restore all variables previously saved with `%store` (no names are given).
%store -r

In [4]:
# Download dataset from Kaggle
print(f"Downloading dataset: {DATASET_KEY}")
print("This may take a few minutes depending on dataset size...")

try:
    # kagglehub returns the local location of the downloaded dataset files.
    download_path = kagglehub.dataset_download(DATASET_KEY)
    print(f"✓ Dataset downloaded successfully to: {download_path}")
    DOWNLOAD_PATH = Path(download_path)
    # Persist the download location so it can be restored later with `%store -r`.
    %store DOWNLOAD_PATH
except Exception as e:
    # Print a hint about the likely causes, then re-raise so the cell still
    # fails visibly instead of continuing without data.
    print(f"❌ Error downloading dataset: {e}")
    print("Please check your Kaggle API credentials and dataset key.")
    raise

Downloading dataset: vjchoudhary7/customer-segmentation-tutorial-in-python
This may take a few minutes depending on dataset size...
Downloading from https://www.kaggle.com/api/v1/datasets/download/vjchoudhary7/customer-segmentation-tutorial-in-python?dataset_version_number=1...
Downloading from https://www.kaggle.com/api/v1/datasets/download/vjchoudhary7/customer-segmentation-tutorial-in-python?dataset_version_number=1...


100%|██████████| 1.55k/1.55k [00:00<00:00, 1.40MB/s]

Extracting files...
✓ Dataset downloaded successfully to: /Users/ravisharma/.cache/kagglehub/datasets/vjchoudhary7/customer-segmentation-tutorial-in-python/versions/1
Stored 'DOWNLOAD_PATH' (PosixPath)
✓ Dataset downloaded successfully to: /Users/ravisharma/.cache/kagglehub/datasets/vjchoudhary7/customer-segmentation-tutorial-in-python/versions/1
Stored 'DOWNLOAD_PATH' (PosixPath)



  db[ 'autorestore/' + arg ] = obj


In [5]:
# Copy data to project directory
# Announce the source (download location) and destination before copying starts.
print(f"Copying data from {DOWNLOAD_PATH} to {DATA_DIR}")

def copy_data_files(source_dir, target_dir):
    """Copy every file under ``source_dir`` into ``target_dir``.

    The relative directory structure below ``source_dir`` is reproduced below
    ``target_dir``. One progress line is printed per copied file.

    Args:
        source_dir: Directory to copy from (string or path-like).
        target_dir: Directory to copy into; created, with parents, if missing.

    Returns:
        list[Path]: Destination path of each copied file, in sorted source order.

    Raises:
        FileNotFoundError: If ``source_dir`` is not an existing directory.
            Without this check a wrong path would silently "copy 0 files".
    """
    source_path = Path(source_dir)
    target_path = Path(target_dir)

    if not source_path.is_dir():
        raise FileNotFoundError(f"Source directory does not exist: {source_path}")

    # Ensure target directory exists
    target_path.mkdir(parents=True, exist_ok=True)

    # When the target lives inside the source tree, files copied earlier (in
    # this run or a previous one) must not be treated as source files again,
    # otherwise they would be re-copied into target/target/... on every run.
    target_resolved = target_path.resolve()
    target_inside_source = source_path.resolve() in target_resolved.parents

    # Snapshot the file list before writing anything so the directory walk
    # never observes files created by this copy; sorting makes the order stable.
    source_files = sorted(p for p in source_path.rglob('*') if p.is_file())

    copied_files = []

    for file_path in source_files:
        if target_inside_source and target_resolved in file_path.resolve().parents:
            continue

        # Maintain relative directory structure
        relative_path = file_path.relative_to(source_path)
        target_file_path = target_path / relative_path

        # Create parent directories if needed
        target_file_path.parent.mkdir(parents=True, exist_ok=True)

        # Copy file (copy2 also preserves file metadata where possible)
        shutil.copy2(file_path, target_file_path)
        copied_files.append(target_file_path)
        print(f"  ✓ Copied: {relative_path}")

    return copied_files

# Run the copy; on any failure report it and re-raise so the cell stops with
# the original error rather than continuing with missing data.
try:
    copied_files = copy_data_files(DOWNLOAD_PATH, DATA_DIR)
    print(f"\n✓ Successfully copied {len(copied_files)} files to {DATA_DIR}")
except Exception as e:
    print(f"❌ Error copying files: {e}")
    raise

Copying data from /Users/ravisharma/.cache/kagglehub/datasets/vjchoudhary7/customer-segmentation-tutorial-in-python/versions/1 to /Users/ravisharma/workdir/eda_practice/data/customer-segmentation
  ✓ Copied: Mall_Customers.csv

✓ Successfully copied 1 files to /Users/ravisharma/workdir/eda_practice/data/customer-segmentation


In [6]:
# Verify data integrity and provide summary
# Status line printed before the verification helper is defined and run.
print("Verifying data integrity...")

def verify_data_integrity(data_dir):
    """Summarise the files under ``data_dir`` and preview up to three CSVs.

    Prints the file count, total size, a per-extension breakdown and, for the
    first three CSV files, the columns and first rows of a 5-row sample.

    Args:
        data_dir: Directory to inspect (string or path-like).

    Returns:
        bool: ``False`` when the directory is missing or holds no files,
        ``True`` otherwise. A CSV that cannot be read only produces a warning.
    """
    root = Path(data_dir)

    # Guard clauses: nothing to summarise without a directory containing files.
    if not root.exists():
        print(f"❌ Data directory does not exist: {root}")
        return False

    found = [entry for entry in root.rglob('*') if entry.is_file()]
    if len(found) == 0:
        print(f"❌ No files found in data directory: {root}")
        return False

    print(f"\n📊 Data Summary for {SLUG}:")
    print("="*50)
    print(f"Total files: {len(found)}")

    # Accumulate file count and byte size per (lower-cased) extension.
    per_ext = {}
    grand_total = 0
    for entry in found:
        nbytes = entry.stat().st_size
        bucket = per_ext.setdefault(entry.suffix.lower(), {'count': 0, 'size': 0})
        bucket['count'] += 1
        bucket['size'] += nbytes
        grand_total += nbytes

    bytes_per_mb = 1024**2
    print(f"Total size: {grand_total / bytes_per_mb:.2f} MB")
    print("\nFile types:")
    for suffix, stats in per_ext.items():
        label = suffix or 'no extension'
        megabytes = stats['size'] / bytes_per_mb
        print(f"  {label}: {stats['count']} files ({megabytes:.2f} MB)")

    # Basic validation: try to parse a small sample of each CSV.
    csv_paths = [entry for entry in found if entry.suffix.lower() == '.csv']
    if csv_paths:
        print("\n📋 CSV File Preview:")
    for csv_path in csv_paths[:3]:  # at most three previews
        try:
            sample = pd.read_csv(csv_path, nrows=5)  # only the first 5 rows
            print(f"\n{csv_path.name}:")
            print(f"  Shape: {sample.shape} (showing first 5 rows)")
            print(f"  Columns: {list(sample.columns)}")
            display(sample.head())
        except Exception as err:
            print(f"  ⚠️ Could not preview {csv_path.name}: {err}")

    print("\n✅ Data verification completed successfully!")
    return True

# Run verification
# verify_data_integrity returns True only when the data directory exists and
# contains at least one file; the result selects which closing message is shown.
verification_success = verify_data_integrity(DATA_DIR)

if verification_success:
    print(f"\n🎉 Setup completed successfully for project: {SLUG}")
    print("\nNext steps:")
    print(f"1. Create new notebooks in: {NOTEBOOK_DIR}")
    print("2. Load shared variables with: %store -r")
    print("3. Start your analysis!")
else:
    print("\n❌ Setup verification failed. Please check the errors above.")

Verifying data integrity...

📊 Data Summary for customer-segmentation:
Total files: 1
Total size: 0.00 MB

File types:
  .csv: 1 files (0.00 MB)

📋 CSV File Preview:

Mall_Customers.csv:
  Shape: (5, 5) (showing first 5 rows)
  Columns: ['CustomerID', 'Gender', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)']


Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40



✅ Data verification completed successfully!

🎉 Setup completed successfully for project: customer-segmentation

Next steps:
1. Create new notebooks in: /Users/ravisharma/workdir/eda_practice/notebooks/customer-segmentation
2. Load shared variables with: %store -r
3. Start your analysis!
