In [None]:
# Wine Quality Prediction - Model Training
# This notebook trains a model to predict wine quality (1-10 scale)

import os
import pickle
import time
from datetime import datetime

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# For this example, we'll create synthetic wine data since we need to avoid dataset conflicts
# In real implementation, you would use: wine_data = pd.read_csv('winequality.csv')

# Build a reproducible synthetic stand-in for the wine-quality table.
np.random.seed(42)
n_samples = 1000

# (column name, mean, standard deviation) of each normally distributed
# feature. The order is significant: columns are drawn one after another
# from the seeded global generator, so reordering changes every value.
feature_distributions = [
    ('fixed_acidity', 7.5, 1.5),
    ('volatile_acidity', 0.4, 0.2),
    ('citric_acid', 0.3, 0.15),
    ('residual_sugar', 5, 3),
    ('chlorides', 0.05, 0.02),
    ('free_sulfur_dioxide', 30, 15),
    ('total_sulfur_dioxide', 120, 40),
    ('density', 0.995, 0.003),
    ('pH', 3.2, 0.3),
    ('sulphates', 0.6, 0.2),
    ('alcohol', 10.5, 1.5),
]
data = {
    column: np.random.normal(mean, std, n_samples)
    for column, mean, std in feature_distributions
}

# Latent quality score: a weighted sum of four features plus Gaussian
# noise. The noise is the twelfth draw from the generator, taken after
# all eleven feature columns.
quality_noise = np.random.normal(0, 0.5, n_samples)
quality_signal = (
    0.3 * data['alcohol']
    + 0.2 * (10 - data['volatile_acidity'])
    + 0.1 * data['citric_acid']
    + 0.1 * data['sulphates']
)
quality_base = quality_signal + quality_noise

# Min-max scale the score to [0, 1], then stretch it onto the integer
# grades 3..9 (the smallest score becomes 3, the largest becomes 9).
lowest_score = quality_base.min()
score_span = quality_base.max() - lowest_score
quality_normalized = (quality_base - lowest_score) / score_span
wine_quality = np.round(3 + quality_normalized * 6).astype(int)
data['quality'] = wine_quality

# Assemble the table and print a short summary of it.
df = pd.DataFrame(data)
quality_counts = df['quality'].value_counts().sort_index()

print("Wine Quality Dataset Created")
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print(f"\nQuality distribution:")
print(quality_counts)

# Prepare features and target
X = df.drop(columns='quality')
y = df['quality']

# Split the data, stratifying on quality so the train and test sets keep
# the same class mix. train_test_split raises ValueError when a
# stratification class has fewer than two members, so if any quality
# level is that rare, fall back to a plain shuffled split instead of
# aborting the whole run.
rarest_class_size = y.value_counts().min()
stratify_labels = y if rarest_class_size >= 2 else None
if stratify_labels is None:
    print(
        "Warning: a quality level has fewer than 2 samples; "
        "using an unstratified split."
    )

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Scale the features. The scaler is fitted on the training rows only and
# then applied unchanged to the test rows, so no test-set statistics leak
# into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Start MLflow experiment
mlflow.set_experiment("wine_quality_prediction")

# Everything inside the run context is recorded against a single MLflow run:
# hyperparameters, evaluation metrics and the fitted model itself.
with mlflow.start_run():
    # Model parameters
    n_estimators = 100
    max_depth = 10
    random_state = 42

    # Log all run parameters in one batched call.
    mlflow.log_params({
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "random_state": random_state,
        "model_type": "RandomForestRegressor",
        "feature_scaling": "StandardScaler",
    })

    # Train the model
    print("Training Random Forest model...")
    # perf_counter() is a monotonic interval clock. Subtracting two naive
    # datetime.now() values gives a wrong duration whenever the wall clock
    # is adjusted (NTP step, DST change) while the fit is running.
    start_time = time.perf_counter()

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    model.fit(X_train_scaled, y_train)

    training_time = time.perf_counter() - start_time
    print(f"Training completed in {training_time:.2f} seconds")

    # Make predictions on the held-out (scaled) test features
    y_pred = model.predict(X_test_scaled)

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Log all metrics in one batched call; float() normalises NumPy scalars.
    mlflow.log_metrics({
        "mse": float(mse),
        "rmse": float(rmse),
        "mae": float(mae),
        "r2_score": float(r2),
        "training_time": training_time,
    })

    print("\nModel Performance:")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"Root Mean Squared Error: {rmse:.4f}")
    print(f"Mean Absolute Error: {mae:.4f}")
    print(f"R² Score: {r2:.4f}")

    # Log the model and register it under a stable name.
    # NOTE(review): the estimator was fitted on StandardScaler output, but
    # only the bare estimator is logged here. Anything loading
    # "WineQualityPredictor" from the registry must apply the same fitted
    # scaler first -- confirm consumers do, or consider logging a pipeline.
    mlflow.sklearn.log_model(
        model,
        "wine_quality_model",
        registered_model_name="WineQualityPredictor"
    )

    # Feature importance, most influential first
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nTop 5 Most Important Features:")
    print(feature_importance.head())

# Save the model and scaler locally
print("\nSaving model and scaler...")

# The output directory is relative to the current working directory.
# open(..., 'wb') does not create missing parent directories, so make sure
# the directory exists before writing into it.
output_dir = '../ml'
os.makedirs(output_dir, exist_ok=True)

# Feature names are saved so the API can present inputs in the column
# order used for training.
feature_names = list(X.columns)

# File name -> object to pickle; written (and reported) in this order.
artifacts = {
    'model.pkl': model,
    'scaler.pkl': scaler,
    'feature_names.pkl': feature_names,
}

saved_paths = []
for artifact_file, artifact_obj in artifacts.items():
    artifact_path = f"{output_dir}/{artifact_file}"
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact_obj, f)
    saved_paths.append(artifact_path)

print("Model training completed successfully!")
print("Files saved:")
for artifact_path in saved_paths:
    print(f"- {artifact_path}")

# Show how the first few held-out labels compare with the model's
# predictions; slicing simply yields fewer pairs if the test set is small.
print(f"\nSample Predictions (first 10 test samples):")
preview_count = 10
for actual, predicted in zip(y_test.iloc[:preview_count], y_pred[:preview_count]):
    print(f"Actual: {actual}, Predicted: {predicted:.2f}")