In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import xgboost as xgb
# Suppress warnings for a cleaner notebook
warnings.filterwarnings('ignore')

In [None]:
# --- 1. Load Data ---
# All three CSVs live in one folder; change DATA_DIR to run the notebook
# outside of this Google Drive mount.
DATA_DIR = "/content/drive/MyDrive/ML PROJECT"

print("Loading data...")
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
sample_submission_df = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")


print("Data loaded successfully.")
print("\n--- Training Data Info ---")
train_df.info()

print("\n--- Training Data Head ---")
print(train_df.head())



# --- 2. Data Preprocessing ---
print("\nStarting preprocessing...")

# Store test IDs for final submission
test_ids = test_df['id']

# Separate target variable (y) from training features (X)
y = train_df['WeightCategory']
X = train_df.drop(columns=['id', 'WeightCategory'])

# Store test features (X_test)
X_test = test_df.drop(columns=['id'])

# Train and test must expose exactly the same feature columns. pd.concat would
# otherwise silently create all-NaN columns for whichever side lacks a feature,
# and the model would be trained/scored on garbage without any error.
mismatched_cols = set(X.columns).symmetric_difference(X_test.columns)
if mismatched_cols:
    raise ValueError(
        f"Train and test feature columns differ: {sorted(mismatched_cols)}"
    )

# --- 2a. Encode the Target Variable (y) ---
# We convert text labels (e.g., 'Normal_Weight') into numbers (0, 1, 2...)
le = LabelEncoder()
y_encoded = le.fit_transform(y)
num_classes = len(le.classes_)
print(f"\nTarget variable encoded into {num_classes} classes.")
print(f"Classes: {le.classes_}")

# --- 2b. One-Hot Encoding for Categorical Features ---
# Get list of categorical columns (those with 'object' dtype)
categorical_cols = X.select_dtypes(include=['object']).columns
print(f"Categorical features to encode: {list(categorical_cols)}")

# We combine train and test to ensure they have the exact same dummy columns.
# Note: the combined frame keeps the original row labels, so labels repeat
# between the train and test halves; it is only ever split positionally below.
combined_df = pd.concat([X, X_test], axis=0)
combined_df_processed = pd.get_dummies(combined_df, columns=categorical_cols, drop_first=True)

# Separate back into X and X_test (positional split: train rows come first)
X_processed = combined_df_processed.iloc[:len(X)]
X_test_processed = combined_df_processed.iloc[len(X):]

print(f"Original feature count: {len(X.columns)}")
print(f"Processed feature count after one-hot encoding: {len(X_processed.columns)}")

In [None]:
# --- 3. Model Training & Validation ---
print("\nSplitting data for training and validation...")

# Hold out a stratified 20% of the processed training rows so the model can be
# scored on rows it never saw; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_processed, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")

print("\n--- Starting XGBoost Model Training ---")

# Hyperparameters shared by the validation model and the final model below.
# Candidates for tuning: n_estimators, learning_rate, max_depth, ...
xgb_params = dict(
    objective='multi:softmax',   # multi-class classification
    num_class=num_classes,       # number of target classes
    n_estimators=100,            # boosting rounds
    learning_rate=0.1,           # step size shrinkage
    max_depth=5,                 # maximum tree depth
    random_state=42,             # reproducible results
    use_label_encoder=False,     # labels are already integer-encoded
    eval_metric='mlogloss',      # evaluation metric
)

# Fit on the training split only
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

print("--- Model Training Complete ---")

# --- 4. Model Evaluation ---
print("\n--- Model Evaluation on Validation Set ---")
y_pred_val = model.predict(X_val)

# Overall accuracy, then the per-class report using the original text labels
accuracy = accuracy_score(y_val, y_pred_val)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report (Validation):")
validation_report = classification_report(y_val, y_pred_val, target_names=le.classes_)
print(validation_report)

# --- 5. Final Prediction & Submission ---
print("\n--- Generating Final Submission File ---")

# Refit a fresh model with the same settings on *all* labelled rows, so the
# submission model also learns from the rows that were held out above.
print("Training final model on all data...")
final_model = xgb.XGBClassifier(**xgb_params)
final_model.fit(X_processed, y_encoded)
print("Final model trained.")

# Predict the encoded class for every test row, then map back to text labels
test_predictions_encoded = final_model.predict(X_test_processed)
test_predictions = le.inverse_transform(test_predictions_encoded)

# Assemble the submission table: one row per test id
submission_columns = {'id': test_ids, 'WeightCategory': test_predictions}
submission_df = pd.DataFrame(submission_columns)

# Write it next to the notebook
submission_df.to_csv("submission.csv", index=False)

print("\n--- Submission File Created! ---")
print("File 'submission.csv' is ready.")
print(submission_df.head())

In [None]:
# from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import accuracy_score, classification_report

# # --- 6. K-Fold Cross-Validation ---
# print("\n--- Starting K-Fold Cross-Validation ---")

# # Define the number of folds (k)
# n_splits = 5 # You can change this number

# # Initialize StratifiedKFold
# skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# # Lists to store evaluation results from each fold
# fold_accuracies = []
# fold_reports = []

# # Iterate over each fold
# for fold, (train_index, val_index) in enumerate(skf.split(X_processed, y_encoded)):
#     print(f"\n--- Fold {fold+1}/{n_splits} ---")

#     # Split data into training and validation sets for the current fold
#     X_train_fold, X_val_fold = X_processed.iloc[train_index], X_processed.iloc[val_index]
#     y_train_fold, y_val_fold = y_encoded[train_index], y_encoded[val_index]

#     # Initialize a new XGBoost model for each fold
#     # Using the same hyperparameters as before
#     fold_model = xgb.XGBClassifier(objective='multi:softmax',
#                                    num_class=num_classes,
#                                    n_estimators=100,
#                                    learning_rate=0.1,
#                                    max_depth=5,
#                                    random_state=42, # Keep random_state for reproducibility within each fold
#                                    use_label_encoder=False,
#                                    eval_metric='mlogloss')

#     # Train the model on the training data for the current fold
#     fold_model.fit(X_train_fold, y_train_fold)

#     # Predict on the validation data for the current fold
#     y_pred_fold = fold_model.predict(X_val_fold)

#     # Evaluate the model on the validation data for the current fold
#     fold_accuracy = accuracy_score(y_val_fold, y_pred_fold)
#     fold_report = classification_report(y_val_fold, y_pred_fold, target_names=le.classes_)

#     print(f"Fold {fold+1} Validation Accuracy: {fold_accuracy * 100:.2f}%")
#     # print(f"Fold {fold+1} Classification Report:\n{fold_report}") # Uncomment to see report for each fold

#     # Store the results
#     fold_accuracies.append(fold_accuracy)
#     fold_reports.append(fold_report)

# # --- Summarize Cross-Validation Results ---
# print("\n--- Cross-Validation Summary ---")
# print(f"Average Validation Accuracy across {n_splits} folds: {np.mean(fold_accuracies) * 100:.2f}%")
# print(f"Standard Deviation of Validation Accuracy: {np.std(fold_accuracies) * 100:.2f}%")

# # Note: For a final model to use for prediction on the test set,
# # you would typically train on the entire training dataset (X_processed, y_encoded)
# # after determining the best hyperparameters (potentially using cross-validation).
# # The code for the final model training and submission (Section 5) remains the same
# # as it uses the entire training data.

# print("\nK-Fold Cross-Validation Complete.")

# Task
Perform hyperparameter tuning for the XGBoost model using cross-validation to find the parameters that give the best validation accuracy, and then use the tuned model to generate predictions for the (unlabelled) test set.

## Define hyperparameter search space

### Subtask:
Determine the range of values for the XGBoost hyperparameters that you want to explore (e.g., `n_estimators`, `learning_rate`, `max_depth`, etc.).


**Reasoning**:
Define the hyperparameter search space for XGBoost based on common tuning practices and the problem context.



In [None]:
# --- 7. Hyperparameter Tuning Setup ---
print("\n--- Setting up Hyperparameter Tuning ---")

# Candidate values for the key XGBoost hyperparameters explored during tuning.
param_grid = dict(
    n_estimators=[100, 200, 300],     # number of boosting rounds
    learning_rate=[0.01, 0.1, 0.2],   # step size shrinkage
    max_depth=[3, 5, 7],              # maximum depth of trees
    subsample=[0.8, 1.0],             # fraction of samples used per tree
    colsample_bytree=[0.8, 1.0],      # fraction of features used per tree
    gamma=[0, 0.1, 0.2],              # minimum loss reduction required for a further split
)

print("Hyperparameter search space defined:")
print(param_grid)

## Choose tuning method

### Subtask:
Decide whether to use Grid Search (exhaustive search) or Randomized Search (random sampling) for exploring the hyperparameter space. Randomized Search is often preferred for larger search spaces as it's computationally less expensive.


**Reasoning**:
Based on the defined hyperparameter search space and considering computational efficiency, I will decide whether to use GridSearchCV or RandomizedSearchCV and document the choice.



In [None]:
# --- 8. Choose Tuning Method ---
print("\n--- Choosing Hyperparameter Tuning Method ---")

# Size of the search space, one entry per hyperparameter in the grid:
# n_estimators (3), learning_rate (3), max_depth (3),
# subsample (2), colsample_bytree (2), gamma (3).
# The total is computed rather than typed by hand: the hand-written figure
# used previously (162) was wrong -- 3 * 3 * 3 * 2 * 2 * 3 = 324.
search_space_sizes = (3, 3, 3, 2, 2, 3)
n_combinations = 1
for n_values in search_space_sizes:
    n_combinations *= n_values

# With 5-fold cross-validation an exhaustive GridSearchCV would have to fit
# n_combinations * 5 = 1620 models. RandomizedSearchCV instead samples a fixed
# number of combinations, which usually finds a good configuration at a
# fraction of that cost, so it is the method used here.
n_cv_folds = 5
n_grid_search_fits = n_combinations * n_cv_folds

chosen_method = "RandomizedSearchCV"
justification = (
    f"The hyperparameter search space, while not massive ({n_combinations} combinations), "
    "is large enough that exploring every combination with GridSearchCV "
    "across multiple cross-validation folds would be computationally intensive. "
    "RandomizedSearchCV allows for efficient exploration of the space by sampling "
    "a fixed number of combinations, which is likely to find a good set of parameters "
    "more quickly than an exhaustive search."
)

print(f"Chosen Hyperparameter Tuning Method: {chosen_method}")
print(f"Justification: {justification}")

# This step is for documentation and planning, no code execution required for tuning yet.

**Reasoning**:
The decision to use RandomizedSearchCV has been made. The next step is to implement the hyperparameter tuning using RandomizedSearchCV with cross-validation on the training data.



In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold # Import StratifiedKFold

print("\n--- Starting Randomized Search (with Cross-Validation) for Hyperparameter Tuning ---")

# Initialize RandomizedSearchCV
# n_iter: number of parameter settings that are sampled.
# Increase this for a more exhaustive search, decrease for faster execution.
n_iter_search = 50 # Sampling 50 of the 3*3*3*2*2*3 = 324 combinations in param_grid

# Define the number of folds for cross-validation
n_splits = 5 # You can change this number

# Initialize StratifiedKFold for cross-validation
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    estimator=xgb.XGBClassifier(objective='multi:softmax',
                                num_class=num_classes,
                                random_state=42,
                                use_label_encoder=False,
                                eval_metric='mlogloss'), # Pass base estimator without tuned params
    param_distributions=param_grid, # The grid defined earlier
    n_iter=n_iter_search,           # Number of combinations to sample
    scoring='accuracy',             # Metric to optimize
    cv=skf,                         # Use StratifiedKFold for cross-validation
    refit=True,                     # After the search, retrain the best configuration on ALL rows
    verbose=1,                      # Show progress
    random_state=42,                # For reproducible sampling
    n_jobs=-1                       # Use all available cores
)

# Fit RandomizedSearchCV on the *entire* processed training data.
# Cross-validation handles the train/validation splitting internally.
# NOTE: X_processed contains the engineered BMI column only if the BMI
# feature-engineering cell has been executed before this one; in top-to-bottom
# notebook order that cell comes later, so BMI is NOT part of this search.
random_search.fit(X_processed, y_encoded)

print("\n--- Randomized Search Complete ---")

# --- Get the best parameters and best score ---
print("\nBest parameters found:")
print(random_search.best_params_)

# The best_score_ here is the average cross-validation score
print(f"\nAverage cross-validation accuracy with best parameters: {random_search.best_score_ * 100:.2f}%")

# Because refit=True, best_estimator_ is already a model with the best
# parameters trained on the whole of (X_processed, y_encoded).
best_model = random_search.best_estimator_

# --- 5. Final Prediction & Submission (using the best parameters and entire data) ---
print("\n--- Generating Final Submission File (with tuned parameters) ---")

# The final model is the refit best estimator. Building a second XGBClassifier
# with the same base settings, the same best_params_ and the same seed and
# fitting it on the same rows would only repeat the training RandomizedSearchCV
# has just done, so the already-trained model is reused instead.
print("Training final model on all data with best parameters...")
final_model_tuned = best_model
print("Final model trained with best parameters.")

# Make predictions on the processed test data using the final tuned model
test_predictions_encoded_tuned = final_model_tuned.predict(X_test_processed)

# Convert the numeric predictions back to their original text labels
test_predictions_tuned = le.inverse_transform(test_predictions_encoded_tuned)

# Create the submission DataFrame with tuned predictions
submission_df_tuned = pd.DataFrame({
    'id': test_ids,
    'WeightCategory': test_predictions_tuned
})

# Save the submission file with a clear name indicating tuning and CV
submission_df_tuned.to_csv("submission_tuned_cv_final.csv", index=False)

print("\n--- Tuned Submission File Created! (with Cross-Validation) ---")
print("File 'submission_tuned_cv_final.csv' is ready.")
print(submission_df_tuned.head())

In [None]:
# --- Feature Engineering: Calculate BMI ---
print("\n--- Calculating BMI ---")

# BMI = Weight (kg) / [Height (m)]^2
# Assuming Height is in meters and Weight in kg as per common standards and
# column names -- TODO confirm against the dataset description.
#
# A zero height would make the division produce +/-inf, and XGBoost refuses
# infinite feature values by default. Non-positive heights are therefore masked
# to NaN first, so the resulting BMI is NaN, which XGBoost treats as a missing
# value. Rows with a valid height are unaffected.
height = combined_df_processed['Height']
valid_height = height.where(height > 0)
combined_df_processed['BMI'] = combined_df_processed['Weight'] / (valid_height ** 2)

print("BMI calculated and added as a new feature.")

# Re-split the combined frame so X_processed / X_test_processed carry the BMI
# column. Any model fitted before this cell ran was trained WITHOUT BMI and has
# to be refitted to make use of the new feature.
X_processed = combined_df_processed.iloc[:len(X)]
X_test_processed = combined_df_processed.iloc[len(X):]

print(f"Processed feature count after adding BMI: {len(X_processed.columns)}")

## Interpret Feature Importance and Summarize Findings

### Subtask:
Analyze the generated feature importance plot and the sorted feature importance list to identify the most influential features in the model's predictions. Summarize the key findings from the hyperparameter tuning process and the feature importance analysis. Discuss the performance of the final tuned model.

**Reasoning**:
Interpreting the feature importance helps in understanding the model's decision-making process and identifying which input variables have the strongest relationship with the target variable (Weight Category). Summarizing the findings provides a clear overview of the project's results, including the impact of hyperparameter tuning and the insights gained from feature analysis.

## Summary of Findings

Based on the feature importance analysis, the most important features for predicting Weight Category using the tuned XGBoost model are:

1.  **BMI**: As expected, Body Mass Index is the most significant predictor of Weight Category. This aligns with the definition of weight categories often being based on BMI ranges.
2.  **Gender_Male**: The gender of an individual is the second most important feature, suggesting a notable difference in weight category distribution between males and females in this dataset.
3.  **Weight**: While BMI is a combined measure, the individual weight of a person is also a strong predictor.
4.  **FCVC (Frequency of consumption of vegetables)**: This feature related to dietary habits shows significant importance, indicating that vegetable consumption frequency is a key factor in determining weight category.
5.  **FAVC_yes (Frequent consumption of high caloric food - yes)**: This also highlights the importance of dietary choices, specifically the consumption of high-calorie foods.

Other features like `CALC_no` (Consumption of alcohol - no), `CH2O` (Consumption of water daily), `CAEC_no` (Consumption of food between meals - no), and `CAEC_Frequently` (Consumption of food between meals - frequently) also show notable importance, emphasizing the role of various lifestyle and dietary factors.

**Hyperparameter Tuning and Model Performance:**

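The Randomized Search with 5-fold cross-validation sampled 50 hyperparameter combinations from the search space. The best parameters found are the ones printed by the tuning cell (`random_search.best_params_`), together with the mean cross-validation accuracy they achieved (`random_search.best_score_`).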

In [None]:
# --- Analyze Feature Importance of the Final Tuned Model ---
print("\n--- Analyzing Feature Importance of the Final Tuned Model ---")

# Get feature importances from the final tuned model
feature_importances = final_model_tuned.feature_importances_

# Take the feature names from the fitted model itself rather than from the
# current X_processed. X_processed can gain columns after the model was fitted
# (the BMI cell re-assigns it), in which case its columns no longer line up
# with feature_importances_ and building the Series below would fail with a
# length mismatch -- or, worse, mislabel the bars.
feature_names = final_model_tuned.get_booster().feature_names
if feature_names is None:
    # Model was fitted on unnamed data; fall back to the current frame.
    feature_names = list(X_processed.columns)
if len(feature_names) != len(feature_importances):
    raise ValueError(
        f"Model has {len(feature_importances)} features but {len(feature_names)} "
        "feature names are available; refit the model on the current X_processed."
    )

# Create a pandas Series for easier handling and sorting
importance_series = pd.Series(feature_importances, index=feature_names)

# Sort the features by importance in descending order
sorted_importance_series = importance_series.sort_values(ascending=False)

# Print the sorted feature importances
print("Feature Importances (Sorted):")
print(sorted_importance_series)

# Optional: Visualize the feature importances
fig, ax = plt.subplots(figsize=(12, 7))
sorted_importance_series.plot(kind='bar', ax=ax)
ax.set_title('Feature Importance from Final Tuned XGBoost Model')
ax.set_ylabel('Importance')
ax.set_xlabel('Features')
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
plt.show()

print("\n--- Feature Importance Analysis Complete ---")

In [None]:
# --- 9. Analyze Feature Importance ---
print("\n--- Analyzing Feature Importance of the Best Model ---")

# Get feature importances from the best model
feature_importances = best_model.feature_importances_

# Get the names of the features from the fitted model, not from the current
# X_processed: that frame may have gained columns (e.g. BMI) after best_model
# was trained, and a stale column list would either raise a length mismatch
# when building the Series below or attach importances to the wrong names.
feature_names = best_model.get_booster().feature_names
if feature_names is None:
    # Model was fitted on unnamed data; fall back to the current frame.
    feature_names = list(X_processed.columns)
if len(feature_names) != len(feature_importances):
    raise ValueError(
        f"Model has {len(feature_importances)} features but {len(feature_names)} "
        "feature names are available; refit the model on the current X_processed."
    )

# Create a pandas Series for easier handling and sorting
importance_series = pd.Series(feature_importances, index=feature_names)

# Sort the features by importance in descending order
sorted_importance_series = importance_series.sort_values(ascending=False)

# Print the sorted feature importances
print("Feature Importances (Sorted):")
print(sorted_importance_series)

# Optional: Visualize the feature importances
fig, ax = plt.subplots(figsize=(10, 6))
sorted_importance_series.plot(kind='bar', ax=ax)
ax.set_title('Feature Importance from XGBoost Model')
ax.set_ylabel('Importance')
ax.set_xlabel('Features')
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
plt.show()

print("\n--- Feature Importance Analysis Complete ---")