In [None]:
# Install required packages (needs to be run every time in new environments)
!pip install pandas scikit-learn matplotlib seaborn

import pandas as pd
import pickle
import json
import os
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress every warning category for the rest of the kernel session.
warnings.simplefilter('ignore')
print("‚úÖ Libraries imported successfully!")

# Shared global: the directory every later cell reads and writes its artifacts in.
DATA_DIR = "pipeline_data"

In [None]:
import shutil

# Start each run from an empty artifact directory: anything already stored
# under DATA_DIR is removed before the directory is recreated.
if os.path.exists(DATA_DIR):
    shutil.rmtree(DATA_DIR)  # Destructive - deletes the whole tree, including unrelated files placed there

os.makedirs(DATA_DIR, exist_ok=True)
print(f"üìÅ Created directory: {DATA_DIR}")

# Manual cleanup function (often forgotten)
def cleanup_temp_files():
    """Remove the DATA_DIR tree if it exists and print the outcome.

    Exceptions raised while checking or deleting are caught and printed
    instead of being propagated, so the caller never sees a failure from
    this function. Returns None.
    """
    try:
        artifacts_present = os.path.exists(DATA_DIR)
        if artifacts_present:
            shutil.rmtree(DATA_DIR)
        # Printed whether or not there was anything to delete.
        print("üßπ Temp files cleaned")
    except Exception as err:
        print(f"‚ùå Cleanup failed: {err}")

In [None]:
# Data Preparation - typically hardcoded or inconsistent
def prepare_data():
    """Build the hardcoded review dataset and pickle it under DATA_DIR.

    Returns:
        tuple: ``(df, output_path)`` - the DataFrame with a ``review`` and a
        ``sentiment`` column, and the path of the ``raw_data.pkl`` file the
        frame was written to.
    """
    print("=== Step 1: Data Preparation ===")

    # (review text, sentiment label) pairs; labels are 1 or 0.
    labelled_reviews = [
        ('This product is amazing!', 1),
        ('Terrible quality, waste of money', 0),
        ('Good value for money', 1),
        ('Poor customer service', 0),
        ('Excellent product, highly recommend', 1),
        ('Not worth the price', 0),
        ('Great experience overall', 1),
        ('Disappointed with purchase', 0),
        ('Outstanding quality and service', 1),
        ('Worst purchase ever made', 0),
        ('Decent product for the price', 1),
        ('Highly satisfied with quality', 1),
    ]
    review_texts = [text for text, _ in labelled_reviews]
    sentiment_labels = [label for _, label in labelled_reviews]
    df = pd.DataFrame({'review': review_texts, 'sentiment': sentiment_labels})

    # Persist the frame so the next step can load it from disk.
    output_path = os.path.join(DATA_DIR, 'raw_data.pkl')
    with open(output_path, 'wb') as sink:
        pickle.dump(df, sink)

    print(f"üìä Processed {len(df)} reviews")
    print("Sample data:")
    display(df.head(3))

    return df, output_path

# Execute: binds the globals df_raw and raw_data_path. The feature-engineering
# cell reads raw_data_path, so this cell has to run before it.
df_raw, raw_data_path = prepare_data()

In [None]:
# Feature Engineering - tightly coupled to previous cell
def extract_features(input_path):
    """Turn the pickled review DataFrame into TF-IDF features and persist them.

    Args:
        input_path: Path of a pickle file holding a DataFrame with a
            ``review`` and a ``sentiment`` column.

    Returns:
        ``(X, y, vectorizer, X_path, y_path, vectorizer_path)`` on success,
        or ``None`` (after printing a hint) when ``input_path`` does not exist.
    """
    print("\n=== Step 2: Feature Engineering ===")

    try:
        with open(input_path, 'rb') as source:
            reviews_df = pickle.load(source)
    except FileNotFoundError:
        print("‚ùå Error: Run the data preparation cell first!")
        return None

    # Fixed TF-IDF settings: at most 100 terms, English stop words, unigrams only.
    tfidf_settings = {
        'max_features': 100,
        'stop_words': 'english',
        'ngram_range': (1, 1),
    }
    vectorizer = TfidfVectorizer(**tfidf_settings)

    X = vectorizer.fit_transform(reviews_df['review'])
    y = reviews_df['sentiment'].values

    X_path = os.path.join(DATA_DIR, 'features_X.pkl')
    y_path = os.path.join(DATA_DIR, 'features_y.pkl')
    vectorizer_path = os.path.join(DATA_DIR, 'vectorizer.pkl')

    # Write the three artifacts in the order X, y, vectorizer.
    for target, payload in ((X_path, X), (y_path, y), (vectorizer_path, vectorizer)):
        with open(target, 'wb') as sink:
            pickle.dump(payload, sink)

    vocabulary = vectorizer.get_feature_names_out()
    print(f"üîß Feature matrix shape: {X.shape}")
    print(f"üìù Vocabulary size: {len(vocabulary)}")

    return X, y, vectorizer, X_path, y_path, vectorizer_path

# Execute (order dependency!): needs raw_data_path from the data preparation cell.
# extract_features returns None when the input file is missing (it prints its
# own hint in that case), so the result is only unpacked when it is a real
# tuple; unpacking None directly would raise an uncaught TypeError.
try:
    feature_outputs = extract_features(raw_data_path)
except NameError:
    print("‚ùå Error: raw_data_path not defined. Run previous cells first!")
else:
    if feature_outputs is not None:
        X, y, vectorizer, X_path, y_path, vectorizer_path = feature_outputs

In [None]:
# Model Training - no resource limits or monitoring
def train_model(X_path, y_path):
    """Train a logistic-regression classifier on pickled features and save it.

    Args:
        X_path: Path of the pickled feature matrix.
        y_path: Path of the pickled label array.

    Returns:
        ``(model, X_test, y_test, model_path, test_X_path, test_y_path)`` on
        success, or ``None`` (after printing the error) when either input
        cannot be loaded.
    """
    print("\n=== Step 3: Model Training ===")

    def _row_count(samples):
        # len() raises TypeError on scipy sparse matrices (which is what the
        # TF-IDF step produces), so prefer shape[0] whenever it is available.
        shape = getattr(samples, 'shape', None)
        return shape[0] if shape is not None else len(samples)

    try:
        with open(X_path, 'rb') as f:
            X = pickle.load(f)
        with open(y_path, 'rb') as f:
            y = pickle.load(f)
    except Exception as e:
        print(f"‚ùå Loading failed: {e}")
        return None

    # Fixed 70/30 split, seeded and stratified on the labels.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.3,
        random_state=42,
        stratify=y
    )

    # Model with fixed hyperparameters.
    model = LogisticRegression(
        random_state=42,
        max_iter=100,
        solver='lbfgs'
    )
    model.fit(X_train, y_train)

    # Persist the model and the held-out split for the evaluation step.
    model_path = os.path.join(DATA_DIR, 'model.pkl')
    test_X_path = os.path.join(DATA_DIR, 'X_test.pkl')
    test_y_path = os.path.join(DATA_DIR, 'y_test.pkl')

    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
    with open(test_X_path, 'wb') as f:
        pickle.dump(X_test, f)
    with open(test_y_path, 'wb') as f:
        pickle.dump(y_test, f)

    print("ü§ñ Model trained successfully")
    print(f"üìä Training samples: {_row_count(X_train)}")
    print(f"üß™ Test samples: {_row_count(X_test)}")

    return model, X_test, y_test, model_path, test_X_path, test_y_path

# Execute: relies on X_path and y_path from the feature-engineering cell.
# train_model returns None when its inputs cannot be loaded; unpacking that
# None raises TypeError, which the handler below also reports as a failure.
try:
    model, X_test, y_test, model_path, test_X_path, test_y_path = train_model(X_path, y_path)
except Exception as e:
    print(f"‚ùå Training failed: {e}")
    print("Make sure all previous cells ran successfully!")

In [None]:
# Model Evaluation - manual metric calculation
def evaluate_model(model_path, test_X_path, test_y_path):
    """Score the saved model on the saved test split and write results.json.

    Args:
        model_path: Path of the pickled model.
        test_X_path: Path of the pickled test features.
        test_y_path: Path of the pickled test labels.

    Returns:
        ``(results, y_pred, y_proba)`` on success, where ``results`` is the
        dict that was written to ``results.json`` under DATA_DIR; ``None``
        (after printing the error) when any artifact fails to load.
    """
    print("\n=== Step 4: Model Evaluation ===")

    # Load model, features and labels, in that order.
    loaded_artifacts = []
    try:
        for artifact_path in (model_path, test_X_path, test_y_path):
            with open(artifact_path, 'rb') as source:
                loaded_artifacts.append(pickle.load(source))
    except Exception as e:
        print(f"‚ùå Failed to load artifacts: {e}")
        return None
    model, X_test, y_test = loaded_artifacts

    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    report_text = classification_report(
        y_test, y_pred, target_names=['Negative', 'Positive']
    )

    print(f"üìà Accuracy: {accuracy:.3f}")
    print("\nüìã Classification Report:")
    print(report_text)

    # Convert numpy values to plain Python types so they serialise as JSON.
    results = {
        'accuracy': float(accuracy),
        'predictions': y_pred.tolist(),
        'actuals': y_test.tolist(),
        'timestamp': pd.Timestamp.now().isoformat()
    }

    results_path = os.path.join(DATA_DIR, 'results.json')
    with open(results_path, 'w') as sink:
        json.dump(results, sink, indent=2)

    return results, y_pred, y_proba

# Execute evaluation: relies on model_path, test_X_path and test_y_path from
# the training cell. The "saved" message is printed only when evaluate_model
# returned its 3-tuple; a None return fails the unpacking and lands in the handler.
try:
    results, y_pred, y_proba = evaluate_model(model_path, test_X_path, test_y_path)
    print(f"\nüíæ Results saved to: {os.path.join(DATA_DIR, 'results.json')}")
except Exception as e:
    print(f"‚ùå Evaluation failed: {e}")

In [None]:
# Visualization - manual plotting with potential display issues
def create_visualizations():
    """Draw a 2x2 performance figure, save it under DATA_DIR and display it.

    Reads the notebook globals ``y_test``, ``y_pred``, ``y_proba``,
    ``vectorizer``, ``model`` and ``results``. The four panels are a
    confusion matrix, a histogram of the positive-class probabilities, the
    ten coefficients with the largest absolute value, and an accuracy bar.

    Any exception raised while plotting or saving is caught and printed, and
    the partially built figure is closed. Returns None.
    """
    print("\n=== Step 5: Visualization ===")

    plt.style.use('default')
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    fig.suptitle('Model Performance Analysis', fontsize=16)

    try:
        # Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['Negative', 'Positive'],
                    yticklabels=['Negative', 'Positive'],
                    ax=axes[0, 0])
        axes[0, 0].set_title('Confusion Matrix')
        axes[0, 0].set_ylabel('True Label')
        axes[0, 0].set_xlabel('Predicted Label')

        # Probability Distribution (column 1 = probability of the positive class)
        positive_probs = y_proba[:, 1]
        axes[0, 1].hist(positive_probs, bins=min(10, len(positive_probs)),
                        alpha=0.7, edgecolor='black', color='skyblue')
        axes[0, 1].set_title('Positive Class Probabilities')
        axes[0, 1].set_xlabel('Probability')
        axes[0, 1].set_ylabel('Count')

        # Feature weights: rank by |coefficient|, keep the ten largest.
        feature_names = vectorizer.get_feature_names_out()
        coefficients = model.coef_[0]
        feature_importance = list(zip(abs(coefficients), coefficients, feature_names))
        feature_importance.sort(reverse=True)
        top_features = feature_importance[:10]

        features = [item[2] for item in top_features]
        coefs = [item[1] for item in top_features]
        # Negative weights are drawn red, all others green.
        colors = ['red' if c < 0 else 'green' for c in coefs]

        y_pos = range(len(features))
        axes[1, 0].barh(y_pos, coefs, color=colors, alpha=0.7)
        axes[1, 0].set_yticks(y_pos)
        axes[1, 0].set_yticklabels(features, fontsize=8)
        axes[1, 0].set_title('Top 10 Feature Weights')
        axes[1, 0].set_xlabel('Coefficient Value')

        # Simple accuracy bar
        axes[1, 1].bar(['Accuracy'], [results['accuracy']], color='lightgreen', alpha=0.7)
        axes[1, 1].set_ylim(0, 1)
        axes[1, 1].set_title('Model Accuracy')
        axes[1, 1].set_ylabel('Score')

        plt.tight_layout()

        # Save before showing: what is left of a figure after plt.show()
        # depends on the backend, so the file is written while the figure
        # is guaranteed to be intact.
        plot_path = os.path.join(DATA_DIR, 'performance_plots.png')
        fig.savefig(plot_path, dpi=300, bbox_inches='tight')

        plt.show()
        print(f"üìä Plots saved to: {plot_path}")

    except Exception as e:
        print(f"‚ùå Visualization failed: {e}")
        # Do not leave a half-drawn figure open (it would still be rendered
        # and would keep its memory until the kernel is restarted).
        plt.close(fig)

# Execute visualization. create_visualizations reports exceptions raised inside
# its own try block itself, so this handler only sees a NameError raised
# before that block is entered (e.g. plt is undefined because the import cell
# was not run).
try:
    create_visualizations()
except NameError as e:
    print(f"‚ùå Missing variables: {e}")
    print("Run all previous cells in order!")

In [None]:
# Manual testing - no systematic validation
def test_new_samples():
    """Run four hardcoded reviews through the fitted vectorizer and model.

    Uses the notebook globals ``vectorizer`` and ``model``. For each review
    it prints the text (cut to 50 characters, with an ellipsis when longer),
    the predicted sentiment and the larger of the class probabilities as a
    percentage. Any exception is caught and printed. Returns None.
    """
    print("\n=== Step 6: Manual Testing ===")

    test_reviews = [
        "This is absolutely fantastic!",
        "Complete garbage, avoid at all costs",
        "It's okay, nothing special",
        "Exceeded my expectations completely!"
    ]
    divider = "-" * 60

    try:
        X_new = vectorizer.transform(test_reviews)
        predictions = model.predict(X_new)
        probabilities = model.predict_proba(X_new)

        print("üîç Manual Test Results:")
        print(divider)

        for review, label, class_probs in zip(test_reviews, predictions, probabilities):
            if label == 1:
                sentiment = "üòä Positive"
            else:
                sentiment = "üòû Negative"
            confidence = max(class_probs) * 100

            ellipsis = '...' if len(review) > 50 else ''
            shown_text = review[:50] + ellipsis

            print(f"Review: '{shown_text}'")
            print(f"Prediction: {sentiment} (Confidence: {confidence:.1f}%)")
            print(divider)

    except Exception as e:
        print(f"‚ùå Testing failed: {e}")
        print("Ensure all previous cells have been executed!")

# Execute testing: uses the globals vectorizer and model from earlier cells;
# test_new_samples prints a failure message instead of raising when they are missing.
test_new_samples()


In [None]:
def list_artifacts():
    """Print every file in DATA_DIR with its size and modification time.

    Also prints the combined size of the listed files and whether the
    required artifacts (``raw_data.pkl``, ``model.pkl``, ``vectorizer.pkl``,
    ``results.json``) are present. Prints a notice and returns early when
    DATA_DIR does not exist. Returns None.
    """
    print("\n=== Step 7: Artifact Management ===")

    if not os.path.exists(DATA_DIR):
        print("‚ùå No artifacts directory found!")
        return

    print(f"üìÇ Artifacts in '{DATA_DIR}':")

    # Read the directory once and sort it: the listing is stable from run to
    # run, and the same snapshot is used for the required-files check below.
    entries = sorted(os.listdir(DATA_DIR))
    total_size = 0

    for name in entries:
        file_path = os.path.join(DATA_DIR, name)
        if not os.path.isfile(file_path):
            continue  # sub-directories are not listed or counted

        size = os.path.getsize(file_path)
        total_size += size
        modified = os.path.getmtime(file_path)
        mod_time = pd.Timestamp.fromtimestamp(modified).strftime('%Y-%m-%d %H:%M:%S')

        print(f"  üìÑ {name}")
        print(f"     Size: {size:,} bytes")
        print(f"     Modified: {mod_time}")
        print()

    print(f"üíæ Total size: {total_size:,} bytes")

    required_files = ['raw_data.pkl', 'model.pkl', 'vectorizer.pkl', 'results.json']
    present = set(entries)
    missing_files = [f for f in required_files if f not in present]

    if missing_files:
        print(f"‚ö†Ô∏è  Missing files: {missing_files}")
    else:
        print("‚úÖ All required artifacts present")

# Inspect what the previous steps left in DATA_DIR.
list_artifacts()

In [None]:
# Pipeline completion summary - manual tracking
def pipeline_summary():
    """Print a recap of the pipeline steps, its problems and the Kubeflow pitch.

    The step list is a fixed list of strings; it is not derived from what
    actually ran. The accuracy line reads the notebook global ``results``;
    when that global is missing or does not hold a formattable ``accuracy``
    entry, a fallback message is printed instead. Returns None.
    """
    print("\n" + "="*60)
    print("üéØ PIPELINE EXECUTION SUMMARY")
    print("="*60)

    steps = [
        "‚úÖ Data Preparation",
        "‚úÖ Feature Engineering",
        "‚úÖ Model Training",
        "‚úÖ Model Evaluation",
        "‚úÖ Visualization",
        "‚úÖ Manual Testing",
        "‚úÖ Artifact Management"
    ]

    for step in steps:
        print(step)

    try:
        final_accuracy = results['accuracy']
        print(f"\nüéØ Final Model Accuracy: {final_accuracy:.3f}")
    except (NameError, KeyError, TypeError, ValueError):
        # Only the expected "no usable results" cases are handled here
        # (global missing, key missing, value not subscriptable/formattable);
        # KeyboardInterrupt and SystemExit are deliberately left to propagate.
        print("\n‚ùå Could not retrieve final accuracy")

    print("\n" + "="*60)
    print("üö® PROBLEMS WITH THIS APPROACH:")
    print("="*60)

    problems = [
        "‚ùå Manual cell execution order dependency",
        "‚ùå No automatic error handling or rollback",
        "‚ùå Hard to reproduce across different environments",
        "‚ùå No parameter management or experimentation tracking",
        "‚ùå Manual file and state management",
        "‚ùå No scalability or parallel execution",
        "‚ùå No automated testing or validation",
        "‚ùå Difficult to deploy to production",
        "‚ùå No monitoring or logging infrastructure",
        "‚ùå Version control challenges with notebooks",
        "‚ùå No resource management or optimization",
        "‚ùå Manual cleanup and artifact management"
    ]

    for problem in problems:
        print(problem)

    print("\n" + "="*60)
    print("üí° KUBEFLOW ADVANTAGES:")
    print("="*60)

    advantages = [
        "‚úÖ Automatic orchestration and dependency management",
        "‚úÖ Containerized, reproducible environments",
        "‚úÖ Automatic resource management and scaling",
        "‚úÖ Built-in experiment tracking and versioning",
        "‚úÖ Easy deployment and serving capabilities",
        "‚úÖ Automated monitoring and logging",
        "‚úÖ Parameter management and hyperparameter tuning",
        "‚úÖ Parallel execution and optimization",
        "‚úÖ Production-ready infrastructure",
        "‚úÖ Team collaboration and sharing",
        "‚úÖ Automated testing and validation frameworks"
    ]

    for advantage in advantages:
        print(advantage)

    print("\nüöÄ Ready to move to Kubeflow? Your pipeline will be:")
    print("   ‚Ä¢ More reliable and reproducible")
    print("   ‚Ä¢ Easier to scale and deploy")
    print("   ‚Ä¢ Better for team collaboration")
    print("   ‚Ä¢ Production-ready from day one")

# Print the recap; the accuracy line uses the global `results` when it is available.
pipeline_summary()

In [None]:
# Manual cleanup - often forgotten or incomplete
print("üßπ MANUAL CLEANUP")
print("=" * 40)

# Optional cleanup (user might skip this).
# When the notebook runs without a stdin channel (e.g. a headless "Run All"),
# input() raises instead of prompting; fall back to the safe default of
# keeping the files rather than aborting the cell.
try:
    cleanup_choice = input("Delete temporary files? (y/N): ").lower().strip()
except (EOFError, NotImplementedError):
    cleanup_choice = 'n'

if cleanup_choice == 'y':
    try:
        cleanup_temp_files()
    except Exception as e:
        print(f"‚ùå Cleanup failed: {e}")
        print("You may need to manually delete the files later")
    else:
        # cleanup_temp_files reports its own errors without raising, so
        # success is confirmed by checking that the directory is really gone.
        if os.path.exists(DATA_DIR):
            print("You may need to manually delete the files later")
        else:
            print("‚úÖ Cleanup completed")
else:
    print("‚ö†Ô∏è  Temporary files kept - remember to clean up later!")
    print(f"   Directory: {DATA_DIR}")
    print("   This can accumulate and use disk space over time")

print("\nüí≠ In Kubeflow: Automatic cleanup and resource management!")


In [None]:
## 📝 Notebook Execution Notes:

### Common Issues When Running This Notebook:
1. **Cell Execution Order**: Must run cells in exact sequence
2. **Variable Dependencies**: Later cells fail if earlier ones haven't run
3. **Environment Issues**: Package installations may differ across systems
4. **File Path Problems**: Manual path management leads to errors
5. **Memory Management**: No automatic cleanup of large objects
6. **State Confusion**: Easy to lose track of what's been executed

### Why Kubeflow is Better:
- **Automatic Orchestration**: No manual cell execution order
- **Reproducible Environments**: Containerized components
- **Parameter Management**: Easy configuration without code changes
- **Scalability**: Automatic resource allocation and parallel execution
- **Production Ready**: Built-in monitoring, logging, and deployment
- **Team Collaboration**: Shareable, version-controlled pipelines