# Workspace Setup Notebook

This notebook helps set up a new machine learning workspace environment with proper configuration and dependencies.

## Table of Contents
1. [Import Required Libraries](#import-libraries)
2. [Set Working Directory](#working-directory)
3. [Create Project Folder Structure](#folder-structure)
4. [Install Required Packages](#install-packages)
5. [Configure Environment Variables](#environment-variables)
6. [Set Up Version Control](#version-control)
7. [Create Configuration Files](#config-files)
8. [Verify Installation and Setup](#verify-setup)

<a id="import-libraries"></a>
## 1. Import Required Libraries

Import essential libraries for file operations, environment management, and system configuration.

In [None]:
import os
import sys
import subprocess
import platform
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults: the stock matplotlib style is applied first and the seaborn
# "husl" palette second -- keep this order so the palette is applied on top.
plt.style.use('default')
sns.set_palette("husl")

# Report the runtime environment so problems can be traced to a specific setup.
environment_report = [
    "✅ Libraries imported successfully!",
    "Python version: " + sys.version,
    "Operating System: " + platform.system() + " " + platform.release(),
    "Current working directory: " + os.getcwd(),
]
print("\n".join(environment_report))

<a id="working-directory"></a>
## 2. Set Working Directory

Configure and verify the current working directory for the project workspace.

In [None]:
# Resolve the project root: when the kernel was started inside a folder named
# "notebooks" the root is that folder's parent; otherwise it is the current
# directory. The process then switches into the root so that the relative
# paths used by later cells resolve against it.
launch_dir = Path.cwd()
if launch_dir.name == 'notebooks':
    project_root = launch_dir.parent
else:
    project_root = launch_dir
os.chdir(project_root)

print(f"✅ Working directory set to: {project_root}")
print("Project structure:")
# One line per top-level entry, sorted by path; directories get a trailing slash.
for entry in sorted(project_root.iterdir()):
    label = f"📁 {entry.name}/" if entry.is_dir() else f"📄 {entry.name}"
    print(label)

<a id="folder-structure"></a>
## 3. Create Project Folder Structure

Create a standardized directory structure for organizing project files, data, notebooks, and outputs.

In [None]:
# Standard project layout. All paths are relative to the current working
# directory, so this cell expects the working-directory cell to have run first.
directories = [
    'data/raw',
    'data/processed',
    'data/external',
    'models',
    'notebooks/exploratory',
    'notebooks/modeling',
    'notebooks/evaluation',
    'src/data',
    'src/models',
    'src/features',
    'src/utils',
    'tests',
    'docs'
]

# Create directories. The cell is safe to re-run: existing directories are
# left untouched and reported as such rather than being announced as "Created".
for directory in directories:
    dir_path = Path(directory)
    already_present = dir_path.is_dir()
    dir_path.mkdir(parents=True, exist_ok=True)
    if already_present:
        print(f"ℹ️ Already exists: {directory}")
    else:
        print(f"✅ Created: {directory}")

# Empty __init__.py files mark src/, its sub-folders and tests/ as Python packages.
init_files = [
    'src/__init__.py',
    'src/data/__init__.py',
    'src/models/__init__.py',
    'src/features/__init__.py',
    'src/utils/__init__.py',
    'tests/__init__.py'
]

for init_file in init_files:
    init_path = Path(init_file)
    if init_path.exists():
        # Never touch an existing file: it may already contain package code.
        print(f"ℹ️ Already exists: {init_file}")
    else:
        init_path.touch()
        print(f"✅ Created: {init_file}")

print("\n🎉 Project structure created successfully!")

<a id="install-packages"></a>
## 4. Install Required Packages

Install and update the necessary Python packages using the pip package manager.

In [None]:
# Install project dependencies with the kernel's own interpreter
# (`sys.executable -m pip`) so packages land in the environment this notebook
# is actually running in.
requirements_file = Path('requirements.txt')

# Number of `pip list` output lines shown in the preview at the end of the cell.
PIP_LIST_PREVIEW_LINES = 10

if requirements_file.exists():
    print("📦 Installing packages from requirements.txt...")
    try:
        result = subprocess.run(
            [sys.executable, '-m', 'pip', 'install', '-r', str(requirements_file)],
            capture_output=True, text=True, check=True)
        print("✅ Packages installed successfully!")
        print(result.stdout)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error installing packages: {e}")
        print(f"Error output: {e.stderr}")
else:
    print("⚠️ requirements.txt not found. Installing essential packages manually...")

    essential_packages = [
        'pandas>=2.0.0',
        'numpy>=1.24.0',
        'scikit-learn>=1.3.0',
        'matplotlib>=3.7.0',
        'seaborn>=0.12.0',
        'jupyter>=1.0.0'
    ]

    # Install one package at a time so a single failure does not block the rest.
    for package in essential_packages:
        try:
            subprocess.run([sys.executable, '-m', 'pip', 'install', package],
                           capture_output=True, text=True, check=True)
            print(f"✅ Installed: {package}")
        except subprocess.CalledProcessError as e:
            print(f"❌ Failed to install: {package}")
            # Surface pip's own diagnostics instead of discarding them.
            print(f"   Error output: {(e.stderr or '').strip()}")

print("\n🔍 Checking installed packages...")
try:
    result = subprocess.run([sys.executable, '-m', 'pip', 'list'],
                            capture_output=True, text=True, check=True)
    print("Installed packages:")
    # splitlines() splits on real line breaks; the previous split('\\n') looked
    # for a literal backslash followed by "n" and therefore never split at all.
    listing = [line for line in result.stdout.splitlines() if line.strip()]
    for line in listing[:PIP_LIST_PREVIEW_LINES]:
        print(f"  {line}")
    if len(listing) > PIP_LIST_PREVIEW_LINES:
        print("  ... (and more)")
except subprocess.CalledProcessError:
    print("❌ Could not list installed packages")

<a id="environment-variables"></a>
## 5. Configure Environment Variables

Set up environment variables for API keys, paths, and configuration settings.

In [None]:
# Template listing the environment variables the project understands. It is
# written to .env.template (not .env) so that no real secrets are created here;
# every credential entry is commented out and holds a placeholder only.
env_template = """# Environment Variables for ML Project
# Copy this file to .env and fill in your actual values

# Project Configuration
PROJECT_NAME=ml-project
PROJECT_VERSION=0.1.0
ENVIRONMENT=development

# Data Paths
DATA_PATH=./data
MODEL_PATH=./models
NOTEBOOK_PATH=./notebooks

# API Keys (replace with actual keys)
# OPENAI_API_KEY=your_openai_key_here
# HUGGINGFACE_API_KEY=your_huggingface_key_here
# WANDB_API_KEY=your_wandb_key_here

# Database Configuration (if needed)
# DATABASE_URL=postgresql://user:password@localhost:5432/mlproject

# Model Configuration
RANDOM_SEED=42
TEST_SIZE=0.2
CROSS_VALIDATION_FOLDS=5

# Logging
LOG_LEVEL=INFO
LOG_FILE=./logs/ml_project.log
"""

# Write the template with an explicit encoding so the result does not depend
# on the platform's default locale.
env_template_path = Path('.env.template')
env_template_path.write_text(env_template, encoding='utf-8')

print(f"✅ Created environment template: {env_template_path}")

# Set some basic environment variables for this session
workspace_dir = Path.cwd()
src_dir = str(workspace_dir / 'src')
os.environ['PROJECT_ROOT'] = str(workspace_dir)

# Put src/ at the front of PYTHONPATH while keeping whatever entries were
# already there (dropping an earlier copy of src/ so re-running the cell does
# not accumulate duplicates). Previously any existing PYTHONPATH was overwritten.
# NOTE: this is inherited by child processes started from the notebook; it does
# not change the import path of the kernel that is already running.
inherited_entries = [
    entry
    for entry in os.environ.get('PYTHONPATH', '').split(os.pathsep)
    if entry and entry != src_dir
]
os.environ['PYTHONPATH'] = os.pathsep.join([src_dir] + inherited_entries)
os.environ['RANDOM_SEED'] = '42'

print("✅ Set basic environment variables:")
print(f"  PROJECT_ROOT: {os.environ.get('PROJECT_ROOT')}")
print(f"  PYTHONPATH: {os.environ.get('PYTHONPATH')}")
print(f"  RANDOM_SEED: {os.environ.get('RANDOM_SEED')}")

print("\n📝 Next steps:")
print("1. Copy .env.template to .env")
print("2. Fill in your actual API keys and configuration values")
print("3. Load the .env file in your scripts using python-dotenv")

<a id="version-control"></a>
## 6. Set Up Version Control

Initialize a Git repository (if one does not already exist), check that a .gitignore file is present, and show the repository status.

In [None]:
# Errors that mean "git could not be run successfully":
#   CalledProcessError - git started but exited with a non-zero status
#   OSError            - git could not be started at all (e.g. FileNotFoundError
#                        when no git executable is on PATH)
# Catching only CalledProcessError let a missing git crash the cell before the
# "Make sure Git is installed" hint could be shown.
GIT_ERRORS = (subprocess.CalledProcessError, OSError)

# Fallback identity, applied per key only when the user has none configured.
GIT_PLACEHOLDER_IDENTITY = {
    'user.name': 'ML Developer',
    'user.email': 'developer@example.com',
}


def git_config_is_set(key):
    """Return True when `git config --get <key>` succeeds with a non-empty value."""
    probe = subprocess.run(['git', 'config', '--get', key],
                           capture_output=True, text=True)
    return probe.returncode == 0 and bool(probe.stdout.strip())


# Check if Git is already initialized
git_dir = Path('.git')

if not git_dir.exists():
    try:
        # Initialize Git repository
        result = subprocess.run(['git', 'init'], capture_output=True, text=True, check=True)
        print("✅ Git repository initialized!")
        print(result.stdout)

        # Set up basic Git configuration (optional). An identity the user has
        # already configured is left alone so commits are not silently
        # attributed to the placeholder name/e-mail.
        try:
            missing_identity = {
                key: value
                for key, value in GIT_PLACEHOLDER_IDENTITY.items()
                if not git_config_is_set(key)
            }
            for key, value in missing_identity.items():
                subprocess.run(['git', 'config', key, value],
                               capture_output=True, text=True, check=True)
            if missing_identity:
                print("✅ Basic Git configuration set!")
                print(f"⚠️ Placeholder value used for: {', '.join(missing_identity)} "
                      "(replace with your own via `git config`)")
            else:
                print("✅ Existing Git identity found; left unchanged")
        except GIT_ERRORS:
            print("⚠️ Could not set Git configuration (you can do this manually)")

    except GIT_ERRORS as e:
        print(f"❌ Error initializing Git: {e}")
        print("Make sure Git is installed on your system")
else:
    print("✅ Git repository already exists!")

# Check whether a .gitignore exists (this cell only reports; it does not create one)
gitignore_path = Path('.gitignore')
if gitignore_path.exists():
    print("✅ .gitignore file already exists!")
else:
    print("⚠️ .gitignore file not found")
    print("Create one manually or it should have been created during setup")

# Show Git status
try:
    result = subprocess.run(['git', 'status'], capture_output=True, text=True, check=True)
    print("\n📊 Git Status:")
    print(result.stdout)
except GIT_ERRORS:
    print("⚠️ Could not check Git status")

<a id="config-files"></a>
## 7. Create Configuration Files

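Check which standard configuration files (requirements.txt, setup.py, README.md, .gitignore) are present, then generate a project `config.json` and a `Makefile` with common development commands.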

In [None]:
import json  # kept in this cell so it runs on its own; only used to write config.json

# Check existing configuration files
config_files = ['requirements.txt', 'setup.py', 'README.md', '.gitignore']

print("📋 Configuration files status:")
for file in config_files:
    file_path = Path(file)
    if file_path.exists():
        print(f"✅ {file} - exists")
        if file == 'requirements.txt':
            # Show a short preview: the non-blank entries among the first 5 lines.
            with open(file_path, 'r', encoding='utf-8') as f:
                lines = f.readlines()[:5]
            preview = ', '.join(line.strip() for line in lines if line.strip())
            print(f"   Preview: {preview}")
    else:
        print(f"❌ {file} - missing")

# Project-level settings, written to config.json below.
config_content = {
    "project": {
        "name": "ml-project",
        "version": "0.1.0",
        "description": "A machine learning project template"
    },
    "paths": {
        "data": "./data",
        "models": "./models",
        "notebooks": "./notebooks",
        "src": "./src"
    },
    "settings": {
        "random_seed": 42,
        "test_size": 0.2,
        "cv_folds": 5
    }
}

# NOTE: an existing config.json is overwritten on every run of this cell.
config_path = Path('config.json')
with open(config_path, 'w', encoding='utf-8') as f:
    json.dump(config_content, f, indent=2)

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
print(f"\n✅ Created project configuration: {config_path}")

# Create a simple Makefile for common tasks.
# - Recipe lines must start with a real TAB, hence the "\t" escapes.
# - Every target is a command rather than a file, so all of them are .PHONY.
# - __pycache__ directories are removed with `rm -rf`: `find -delete` refuses
#   to remove directories that are not empty.
makefile_content = """# Makefile for ML Project

.PHONY: install test clean lint format setup notebook

# Install dependencies
install:
\tpip install -r requirements.txt
\tpip install -e .

# Run tests
test:
\tpytest tests/ -v

# Clean cache and temporary files
clean:
\tfind . -type d -name "__pycache__" -prune -exec rm -rf {} +
\tfind . -type f -name "*.pyc" -delete
\tfind . -type d -name "*.egg-info" -exec rm -rf {} +

# Lint code
lint:
\tflake8 src/ tests/

# Format code
format:
\tblack src/ tests/

# Setup development environment
setup: install
\tpre-commit install

# Run jupyter lab
notebook:
\tjupyter lab
"""

# NOTE: an existing Makefile is overwritten on every run of this cell.
makefile_path = Path('Makefile')
with open(makefile_path, 'w', encoding='utf-8') as f:
    f.write(makefile_content)

print(f"✅ Created Makefile: {makefile_path}")
print("\n🔧 Available commands:")
print("  make install  - Install dependencies")
print("  make test     - Run tests")
print("  make clean    - Clean cache files")
print("  make notebook - Start Jupyter Lab")

<a id="verify-setup"></a>
## 8. Verify Installation and Setup

Run diagnostic checks to ensure all components are properly installed and configured.

In [None]:
# Comprehensive setup verification.
# Section headers use "\n" (not "\\n") so they are preceded by a blank line
# instead of a literal backslash-n.
print("🔍 WORKSPACE SETUP VERIFICATION")
print("=" * 50)

# Short description of every failed check, so the closing message reflects the
# real outcome instead of always announcing a ready workspace.
problems = []

# 1. Check Python environment
print("\n1. Python Environment:")
print(f"   ✅ Python version: {sys.version.split()[0]}")
print(f"   ✅ Python executable: {sys.executable}")

# 2. Check essential libraries (import names, e.g. scikit-learn imports as "sklearn")
essential_libs = ['pandas', 'numpy', 'sklearn', 'matplotlib', 'seaborn']
print("\n2. Essential Libraries:")

for lib in essential_libs:
    try:
        __import__(lib)
        print(f"   ✅ {lib} - installed")
    except ImportError:
        print(f"   ❌ {lib} - missing")
        problems.append(f"library {lib}")

# 3. Check project structure (paths are relative to the current working directory)
print("\n3. Project Structure:")
required_dirs = ['src', 'data', 'models', 'notebooks', 'tests']
for directory in required_dirs:
    dir_path = Path(directory)
    if dir_path.exists():
        print(f"   ✅ {directory}/ - exists")
    else:
        print(f"   ❌ {directory}/ - missing")
        problems.append(f"directory {directory}/")

# 4. Check configuration files
print("\n4. Configuration Files:")
config_files = ['requirements.txt', 'setup.py', 'README.md', '.gitignore']
for file in config_files:
    file_path = Path(file)
    if file_path.exists():
        print(f"   ✅ {file} - exists")
    else:
        print(f"   ❌ {file} - missing")
        problems.append(f"file {file}")

# 5. Check Git setup
print("\n5. Version Control:")
try:
    result = subprocess.run(['git', '--version'], capture_output=True, text=True, check=True)
    print(f"   ✅ Git installed: {result.stdout.strip()}")

    if Path('.git').exists():
        print("   ✅ Git repository initialized")
    else:
        print("   ❌ Git repository not initialized")
        problems.append("git repository")
except (subprocess.CalledProcessError, OSError):
    # Non-zero exit status, or git could not be started at all (not on PATH).
    # A bare `except:` here would also swallow KeyboardInterrupt/SystemExit.
    print("   ❌ Git not available")
    problems.append("git")

# 6. Test basic functionality
print("\n6. Basic Functionality Test:")
try:
    # Test pandas
    test_df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    print("   ✅ Pandas DataFrame creation - working")

    # Test numpy
    test_array = np.array([1, 2, 3])
    print("   ✅ NumPy array creation - working")

    # Test matplotlib (the figure is closed immediately; nothing is displayed)
    fig, ax = plt.subplots(1, 1, figsize=(1, 1))
    plt.close(fig)
    print("   ✅ Matplotlib plotting - working")

    # Test scikit-learn (imported here on purpose: the import itself is the check)
    from sklearn.linear_model import LinearRegression
    model = LinearRegression()
    print("   ✅ Scikit-learn model import - working")

except Exception as e:
    # Deliberately broad: any failure in the smoke test is reported, not raised.
    print(f"   ❌ Error in functionality test: {e}")
    problems.append("functionality test")

print("\n🎉 SETUP VERIFICATION COMPLETE!")
print("=" * 50)
print("\n📝 Next Steps:")
print("1. Start exploring data in notebooks/exploratory/")
print("2. Create your first model in notebooks/modeling/")
print("3. Add your source code to src/")
print("4. Write tests in tests/")
print("5. Update README.md with project details")

# Report when the verification finished and its overall outcome
verification_time = pd.Timestamp.now()
print(f"\n✅ Verification completed at: {verification_time}")
if problems:
    print(f"⚠️ {len(problems)} check(s) need attention: {', '.join(problems)}")
else:
    print("✅ Workspace ready for machine learning development!")