In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load and prepare data
df = pd.read_csv('Consumption of meat per capita.csv')

# Create features and target
features = ['Year', 'Poultry', 'Beef', 'Sheep and goat', 'Other meats', 'Fish and seafood']
target = 'Pork'

# Remove rows with missing values
df_clean = df.dropna(subset=features + [target])

X = df_clean[features]
y = df_clean[target]

In [3]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train basic XGBoost model
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)

# Train model
xgb_model.fit(
    X_train_scaled, 
    y_train,
    eval_set=[(X_test_scaled, y_test)],
    verbose=False
)

# Make predictions
y_pred = xgb_model.predict(X_test_scaled)

In [4]:
# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

display(print("Model Performance Metrics:"))
display(print(f"MSE: {mse:.4f}"))
display(print(f"RMSE: {rmse:.4f}"))
display(print(f"MAE: {mae:.4f}"))
display(print(f"R2 Score: {r2:.4f}"))

Model Performance Metrics:


None

MSE: 59.7389


None

RMSE: 7.7291


None

MAE: 4.8322


None

R2 Score: 0.7309


None

In [7]:
# Hyperparameter tuning
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 200, 300],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 0.9, 1.0]
}

grid_search = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

display(X_train_scaled)
display(y_train.head())

grid_search.fit(X_train_scaled, y_train)

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [None]:
# Print best parameters and score
display(print("\nBest Parameters:", grid_search.best_params_))
display(print("Best Score:", np.sqrt(-grid_search.best_score_)))

In [None]:
# Train final model with best parameters
best_xgb = grid_search.best_estimator_
best_xgb.fit(X_train_scaled, y_train)
best_pred = best_xgb.predict(X_test_scaled)

# Calculate final metrics
final_mse = mean_squared_error(y_test, best_pred)
final_r2 = r2_score(y_test, best_pred)