# Model Training and Validation

This notebook demonstrates training and validation of various machine learning models for soil health prediction.

## Objectives:
- Train SVM, ANN, and clustering models
- Compare model performance
- Optimize hyperparameters
- Evaluate results comprehensively

In [None]:
# Import required libraries
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Import custom modules
from models.svm_model import SoilHealthSVM
from models.ann_model import SoilHealthANN
from models.clustering_model import SoilHealthClustering, compare_clustering_algorithms
from evaluation import ModelEvaluator, ModelComparison, feature_importance_analysis
from data_preprocessing import SoilDataPreprocessor

# Set random seed for reproducibility
np.random.seed(42)

# Configure plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline

## 1. Data Loading and Preprocessing

In [None]:
# Read the raw soil-health CSV into a DataFrame.
data_path = '../data/soil_health_dataset.csv'
df = pd.read_csv(data_path)

# Report the frame's dimensions and column names in a single write, then let the
# notebook render the first rows as the cell's output value.
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
df.head()

In [None]:
# Build the project preprocessor, then fit and apply it to the raw frame in one step.
# NOTE(review): the fit sees every row, including rows that the later split assigns
# to validation/test. If SoilDataPreprocessor learns statistics from the data, that
# leaks hold-out information into training - confirm against its implementation.
preprocessor = SoilDataPreprocessor()
df_processed = preprocessor.fit_transform(df)

print(
    f"Processed dataset shape: {df_processed.shape}",
    "\nFeature columns after preprocessing:",
    list(df_processed.columns),
    sep="\n",
)

In [None]:
# Identifier, date, target and free-text columns are kept out of the model inputs;
# every other processed column is used as a feature.
exclude_cols = ['field_id', 'measurement_date', 'health_category', 'soil_health_score', 'recommendations']
feature_cols = [name for name in df_processed.columns if name not in exclude_cols]

X = df_processed[feature_cols]
# DataFrame.get returns None for an absent column; the later cells test for None
# to decide whether to run the classification / regression tasks.
y_classification = df_processed.get('health_category')
y_regression = df_processed.get('soil_health_score')

print(f"Features shape: {X.shape}")
print(f"Feature columns: {list(X.columns)}")

if y_classification is not None:
    # Class counts for the categorical target.
    print("\nClassification target distribution:", y_classification.value_counts(), sep="\n")

if y_regression is not None:
    # Summary statistics for the numeric target.
    print("\nRegression target statistics:", y_regression.describe(), sep="\n")

## 2. Data Splitting

In [None]:
# Split data for classification and regression tasks
test_size = 0.2
val_size = 0.2  # 20% of training data for validation


def _split_train_val_test(features, target, stratified):
    """Split features/target into train, validation and test partitions.

    The test partition (fraction `test_size`) is taken from the full data first;
    the validation partition (fraction `val_size`) is then taken from the remainder.
    Both splits use random_state=42. When `stratified` is true each split is
    stratified on the target being split.

    Returns:
        (X_train, X_val, X_test, y_train, y_val, y_test)
    """
    X_fit, X_test, y_fit, y_test = train_test_split(
        features, target, test_size=test_size, random_state=42,
        stratify=target if stratified else None,
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_fit, y_fit, test_size=val_size, random_state=42,
        stratify=y_fit if stratified else None,
    )
    return X_train, X_val, X_test, y_train, y_val, y_test


if y_classification is not None:
    # Classification split (stratified on the class label)
    (X_train_class, X_val_class, X_test_class,
     y_train_class, y_val_class, y_test_class) = _split_train_val_test(
        X, y_classification, stratified=True
    )

    print("Classification splits:")
    print(f"Train: {X_train_class.shape}, Validation: {X_val_class.shape}, Test: {X_test_class.shape}")

if y_regression is not None:
    # Regression split (plain random split)
    (X_train_reg, X_val_reg, X_test_reg,
     y_train_reg, y_val_reg, y_test_reg) = _split_train_val_test(
        X, y_regression, stratified=False
    )

    print("\nRegression splits:")
    print(f"Train: {X_train_reg.shape}, Validation: {X_val_reg.shape}, Test: {X_test_reg.shape}")

## 3. SVM Model Training

In [None]:
# SVM Classification
# Workflow: fit a baseline model, score it on the validation split, tune
# hyperparameters on the training split, refit with the tuned values, then score
# once on the test split. Skipped when no classification target is available.
if y_classification is not None:
    print("=== SVM Classification ===")
    
    # Initialize SVM classifier
    svm_classifier = SoilHealthSVM(task_type='classification')
    
    # Create and train model
    # Baseline configuration: RBF kernel, C=1.0, gamma='scale'.
    svm_classifier.create_model(kernel='rbf', C=1.0, gamma='scale')
    svm_classifier.train(X_train_class, y_train_class)
    
    # Evaluate on validation set
    # Baseline result, stored for reference; the model is rebuilt below.
    val_results_svm_class = svm_classifier.evaluate(X_val_class, y_val_class)
    
    # Hyperparameter tuning
    # Uses 3-fold cross-validation on the training split only.
    # NOTE(review): the search space is defined inside SoilHealthSVM - not visible here.
    print("\nPerforming hyperparameter tuning...")
    best_params_svm_class = svm_classifier.tune_hyperparameters(
        X_train_class, y_train_class, cv=3
    )
    print(f"Best parameters: {best_params_svm_class}")
    
    # Train with best parameters
    # The tuning result is unpacked as keyword arguments, so it must be a mapping
    # whose keys create_model accepts.
    svm_classifier.create_model(**best_params_svm_class)
    svm_classifier.train(X_train_class, y_train_class)
    
    # Final evaluation
    # The result is reused by the summary cell at the end of the notebook.
    print("\nFinal evaluation on test set:")
    test_results_svm_class = svm_classifier.evaluate(X_test_class, y_test_class)

In [None]:
# SVM Regression
# Workflow: fit a baseline model, score it on the validation split, tune
# hyperparameters on the training split, refit with the tuned values, then score
# once on the test split. Skipped when no regression target is available.
if y_regression is not None:
    print("=== SVM Regression ===")
    
    # Initialize SVM regressor
    svm_regressor = SoilHealthSVM(task_type='regression')
    
    # Create and train model
    # Baseline configuration: RBF kernel, C=1.0, gamma='scale', epsilon=0.1.
    svm_regressor.create_model(kernel='rbf', C=1.0, gamma='scale', epsilon=0.1)
    svm_regressor.train(X_train_reg, y_train_reg)
    
    # Evaluate on validation set
    # Baseline result, stored for reference; the model is rebuilt below.
    val_results_svm_reg = svm_regressor.evaluate(X_val_reg, y_val_reg)
    
    # Hyperparameter tuning
    # Uses 3-fold cross-validation on the training split only.
    # NOTE(review): the search space is defined inside SoilHealthSVM - not visible here.
    print("\nPerforming hyperparameter tuning...")
    best_params_svm_reg = svm_regressor.tune_hyperparameters(
        X_train_reg, y_train_reg, cv=3
    )
    print(f"Best parameters: {best_params_svm_reg}")
    
    # Train with best parameters
    # The tuning result is unpacked as keyword arguments, so it must be a mapping
    # whose keys create_model accepts.
    svm_regressor.create_model(**best_params_svm_reg)
    svm_regressor.train(X_train_reg, y_train_reg)
    
    # Final evaluation
    # The result is reused by the summary cell at the end of the notebook.
    print("\nFinal evaluation on test set:")
    test_results_svm_reg = svm_regressor.evaluate(X_test_reg, y_test_reg)

## 4. ANN Model Training

In [None]:
# ANN Classification
# Trains the project's ANN wrapper on the classification splits, plots its training
# history, then evaluates on the test split. Skipped when no classification target
# is available.
if y_classification is not None:
    print("=== ANN Classification ===")
    
    # Initialize ANN classifier
    ann_classifier = SoilHealthANN(task_type='classification')
    
    # Train model
    # The validation split is passed alongside the training split; up to 100 epochs
    # with mini-batches of 32.
    # NOTE(review): how the wrapper uses the validation data (monitoring, early
    # stopping, ...) is defined in SoilHealthANN - not visible here.
    history_class = ann_classifier.train(
        X_train_class, y_train_class,
        X_val_class, y_val_class,
        epochs=100,
        batch_size=32,
        verbose=1
    )
    
    # Plot training history
    ann_classifier.plot_training_history()
    
    # Evaluate on test set
    # The result is reused by the summary cell at the end of the notebook.
    print("\nFinal evaluation on test set:")
    test_results_ann_class = ann_classifier.evaluate(X_test_class, y_test_class)
    
    # Plot confusion matrix
    ann_classifier.plot_confusion_matrix(X_test_class, y_test_class)

In [None]:
# ANN Regression
# Trains the project's ANN wrapper on the regression splits, plots its training
# history, then evaluates on the test split. Skipped when no regression target is
# available.
if y_regression is not None:
    print("=== ANN Regression ===")
    
    # Initialize ANN regressor
    ann_regressor = SoilHealthANN(task_type='regression')
    
    # Train model
    # The validation split is passed alongside the training split; up to 100 epochs
    # with mini-batches of 32.
    # NOTE(review): how the wrapper uses the validation data (monitoring, early
    # stopping, ...) is defined in SoilHealthANN - not visible here.
    history_reg = ann_regressor.train(
        X_train_reg, y_train_reg,
        X_val_reg, y_val_reg,
        epochs=100,
        batch_size=32,
        verbose=1
    )
    
    # Plot training history
    ann_regressor.plot_training_history()
    
    # Evaluate on test set
    # The result is reused by the summary cell at the end of the notebook.
    print("\nFinal evaluation on test set:")
    test_results_ann_reg = ann_regressor.evaluate(X_test_reg, y_test_reg)

## 5. Clustering Analysis

In [None]:
# Compare different clustering algorithms
print("=== Clustering Analysis ===")

# Use full dataset for clustering
# Clustering is unsupervised, so all rows of X (not just a training split) are passed,
# together with the feature names and a fixed request for 4 clusters.
# The summary cell iterates over the result with .items(), so it is expected to be a
# mapping from algorithm name to a metrics mapping.
clustering_results = compare_clustering_algorithms(
    X.values, list(X.columns), n_clusters=4
)

In [None]:
# Detailed K-Means analysis
print("\n=== Detailed K-Means Analysis ===")

kmeans_clusterer = SoilHealthClustering(algorithm='kmeans')

# Find optimal number of clusters
# Searches up to 8 clusters using the silhouette method.
# NOTE(review): the return value is discarded, so this step only has an effect
# through whatever the method prints or plots - confirm.
kmeans_clusterer.find_optimal_clusters(X.values, max_clusters=8, method='silhouette')

# Fit final model with optimal clusters
# NOTE(review): n_clusters is hard-coded to 4 and is not taken from the search above;
# confirm 4 is the value the search actually suggests for this dataset.
kmeans_clusterer.create_model(n_clusters=4)
cluster_labels = kmeans_clusterer.fit(X.values, list(X.columns))

# Analyze clusters
cluster_stats = kmeans_clusterer.analyze_clusters(X.values, list(X.columns))

# 3D visualization
kmeans_clusterer.plot_clusters_3d()

# Add cluster labels to original data
# NOTE(review): labels are computed from X (derived from df_processed) but attached
# to a copy of the raw df; this assumes preprocessing kept the row count and row
# order unchanged - verify against SoilDataPreprocessor.
df_with_clusters = df.copy()
df_with_clusters['Cluster'] = cluster_labels

## 6. Model Comparison

In [None]:
# Side-by-side comparison of the trained classifiers on the classification test split.
if y_classification is not None:
    print("=== Classification Model Comparison ===")

    comparison_class = ModelComparison()
    # Register each wrapper's underlying `.model` object under a short display name.
    for model_label, wrapper in (('SVM', svm_classifier), ('ANN', ann_classifier)):
        comparison_class.add_model(model_label, wrapper.model, 'classification')

    class_comparison_results = comparison_class.compare_models(X_test_class, y_test_class)
    print("\nClassification Comparison Results:", class_comparison_results, sep="\n")

In [None]:
# Side-by-side comparison of the trained regressors on the regression test split.
if y_regression is not None:
    print("=== Regression Model Comparison ===")

    comparison_reg = ModelComparison()
    # Register each wrapper's underlying `.model` object under a short display name.
    for model_label, wrapper in (('SVM', svm_regressor), ('ANN', ann_regressor)):
        comparison_reg.add_model(model_label, wrapper.model, 'regression')

    reg_comparison_results = comparison_reg.compare_models(X_test_reg, y_test_reg)
    print("\nRegression Comparison Results:", reg_comparison_results, sep="\n")

## 7. Feature Importance Analysis

In [None]:
# Feature importance analysis
print("=== Feature Importance Analysis ===")

# Random forests are fitted purely as reference models for the importance analysis.
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

feature_names = list(X.columns)
forest_settings = {'n_estimators': 100, 'random_state': 42}

if y_classification is not None:
    # Fit on the classification training split, analyse on its test split.
    rf_class = RandomForestClassifier(**forest_settings)
    rf_class.fit(X_train_class, y_train_class)

    print("\nClassification Feature Importance:")
    class_importance = feature_importance_analysis(
        rf_class, feature_names, X_test_class, y_test_class, method='auto'
    )

if y_regression is not None:
    # Fit on the regression training split, analyse on its test split.
    rf_reg = RandomForestRegressor(**forest_settings)
    rf_reg.fit(X_train_reg, y_train_reg)

    print("\nRegression Feature Importance:")
    reg_importance = feature_importance_analysis(
        rf_reg, feature_names, X_test_reg, y_test_reg, method='auto'
    )

## 8. Cross-Validation Analysis

In [None]:
# Cross-validation for best models
# Imported unconditionally so the regression branch still works when the dataset has
# no classification target (previously cross_val_score was only imported inside the
# classification branch).
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier, MLPRegressor

print("=== Cross-Validation Analysis ===")

if y_classification is not None:
    print("\nSVM Classification Cross-Validation:")
    svm_class_evaluator = ModelEvaluator(svm_classifier.model, "SVM Classifier")
    svm_class_cv = svm_class_evaluator.cross_validate(X_train_class, y_train_class, cv=5)

    print("\nANN Classification Cross-Validation:")
    # Note: for neural networks a scikit-learn MLP with fixed hidden layers stands in
    # for the project ANN so that cross_val_score can be used directly.
    mlp_class = MLPClassifier(hidden_layer_sizes=(128, 64, 32), max_iter=500, random_state=42)
    ann_class_cv_scores = cross_val_score(mlp_class, X_train_class, y_train_class, cv=5, scoring='accuracy')
    print(f"ANN CV Accuracy: {ann_class_cv_scores.mean():.4f} (+/- {ann_class_cv_scores.std() * 2:.4f})")

if y_regression is not None:
    print("\nSVM Regression Cross-Validation:")
    svm_reg_evaluator = ModelEvaluator(svm_regressor.model, "SVM Regressor")
    svm_reg_cv = svm_reg_evaluator.cross_validate(X_train_reg, y_train_reg, cv=5)

    print("\nANN Regression Cross-Validation:")
    mlp_reg = MLPRegressor(hidden_layer_sizes=(128, 64, 32), max_iter=500, random_state=42)
    ann_reg_cv_scores = cross_val_score(mlp_reg, X_train_reg, y_train_reg, cv=5, scoring='neg_mean_squared_error')
    # The scorer returns negated MSE per fold. Convert each fold to RMSE first so that
    # both the mean and the +/- 2*std spread are expressed in the target's units
    # (taking sqrt of the std of MSE values does not yield an RMSE spread).
    ann_reg_cv_rmse = np.sqrt(-ann_reg_cv_scores)
    print(f"ANN CV RMSE: {ann_reg_cv_rmse.mean():.4f} (+/- {ann_reg_cv_rmse.std() * 2:.4f})")

## 9. Learning Curves

In [None]:
# Plot learning curves for best models
# Reuses the ModelEvaluator objects created in the cross-validation cell, so that
# cell must have been run first.
print("=== Learning Curves ===")

if y_classification is not None:
    print("\nSVM Classification Learning Curve:")
    # Curve is computed from the classification training split.
    svm_class_evaluator.plot_learning_curve(X_train_class, y_train_class)

if y_regression is not None:
    print("\nSVM Regression Learning Curve:")
    # Curve is computed from the regression training split.
    svm_reg_evaluator.plot_learning_curve(X_train_reg, y_train_reg)

## 10. Model Saving

In [None]:
# Persist every trained model plus the fitted preprocessor under ../models.
import os
os.makedirs('../models', exist_ok=True)

print("=== Saving Models ===")

if y_classification is not None:
    # Each wrapper writes itself to its own path prefix.
    for trained_model, target_path in (
        (svm_classifier, '../models/svm_classifier'),
        (ann_classifier, '../models/ann_classifier'),
    ):
        trained_model.save_model(target_path)
    print("Classification models saved.")

if y_regression is not None:
    for trained_model, target_path in (
        (svm_regressor, '../models/svm_regressor'),
        (ann_regressor, '../models/ann_regressor'),
    ):
        trained_model.save_model(target_path)
    print("Regression models saved.")

# The preprocessor is serialized with joblib so that the same transformation can be
# re-applied to new data later.
import joblib
joblib.dump(preprocessor, '../models/preprocessor.pkl')
print("Preprocessor saved.")

print("\nAll models saved successfully!")

## 11. Summary and Conclusions

In [None]:
# Summary of results
def _format_metric(results, key):
    """Return results[key] formatted to four decimals, or 'N/A'.

    'N/A' is returned when the results object has no `.get`, the key is missing,
    or the stored value cannot be converted to a float. The previous inline
    `f"{results.get(key, 'N/A'):.4f}"` raised ValueError in the missing-key case
    because the 'N/A' string cannot take a float format spec.
    """
    try:
        return f"{float(results.get(key)):.4f}"
    except (AttributeError, TypeError, ValueError):
        return 'N/A'


print("=== TRAINING SUMMARY ===")
print()

if y_classification is not None:
    print("CLASSIFICATION RESULTS:")
    print(f"SVM Test Accuracy: {_format_metric(test_results_svm_class, 'accuracy')}")
    print(f"ANN Test Accuracy: {_format_metric(test_results_ann_class, 'accuracy')}")
    print()

if y_regression is not None:
    print("REGRESSION RESULTS:")
    print(f"SVM Test RMSE: {_format_metric(test_results_svm_reg, 'rmse')}")
    print(f"SVM Test R²: {_format_metric(test_results_svm_reg, 'r2_score')}")
    print(f"ANN Test RMSE: {_format_metric(test_results_ann_reg, 'rmse')}")
    print(f"ANN Test R²: {_format_metric(test_results_ann_reg, 'r2_score')}")
    print()

print("CLUSTERING RESULTS:")
# One line per clustering algorithm returned by the comparison cell.
for algorithm, metrics in clustering_results.items():
    print(f"{algorithm.upper()}: Silhouette Score = {_format_metric(metrics, 'silhouette_score')}")

print("\n" + "="*50)
print("TRAINING COMPLETED SUCCESSFULLY!")
print("All models have been trained, evaluated, and saved.")
print("="*50)