In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

try:
    # Step 1: Load the dataset
    dataset_path = "ObesityDataSet_raw_and_data_sinthetic.csv"
    df = pd.read_csv(dataset_path)

    # Step 2: Preprocess the data
    # Drop irrelevant columns
    df = df.drop(['Gender', 'family_history_with_overweight','FAVC','CAEC','SMOKE','SCC','CALC','MTRANS'], axis=1)

    # Convert the target variable to numerical labels
    class_mapping = {
        'Insufficient_Weight': 0,
        'Normal_Weight': 1,
        'Overweight_Level_I': 2,
        'Overweight_Level_II': 3,
        'Obesity_Type_I': 4,
        'Obesity_Type_II': 5,
        'Obesity_Type_III': 6
    }
    df['NObeyesdad'] = df['NObeyesdad'].map(class_mapping)

    # Separate features (X) and target variable (y)
    X = df.drop(['NObeyesdad'], axis=1)
    y = df['NObeyesdad']

    # Step 3: Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Step 4: Apply feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Step 5: Train the classification models
    models = [
        RandomForestClassifier(random_state=42),
        SVC(random_state=42),
        KNeighborsClassifier(),
        DecisionTreeClassifier(random_state=42)
    ]

    best_model = None
    best_accuracy = 0

    for model in models:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)

        print("Model:", type(model).__name__)
        print("Accuracy:", accuracy)
        print("Classification Report:\n", report)
        print("----------------------")

        # Step 6: Fine-tune the models (if necessary)
        if type(model).__name__ == 'RandomForestClassifier':
            param_grid = {
                'n_estimators': [50, 100, 150],
                'max_depth': [None, 5, 10],
                'min_samples_split': [2, 5, 10]
            }
            grid_search = GridSearchCV(model, param_grid, cv=5)
            grid_search.fit(X_train_scaled, y_train)

            best_params = grid_search.best_params_
            print("Best parameters:", best_params)

            # Re-train the model with the best parameters
            model = RandomForestClassifier(random_state=42, **best_params)
            model.fit(X_train_scaled, y_train)

        elif type(model).__name__ == 'SVC':
            param_grid = {
                'C': [0.1, 1, 10],
                'kernel': ['linear', 'rbf', 'sigmoid'],
                'gamma': ['scale', 'auto']
            }
            grid_search = GridSearchCV(model, param_grid, cv=5)
            grid_search.fit(X_train_scaled, y_train)

            best_params = grid_search.best_params_
            print("Best parameters:", best_params)

            # Re-train the model with the best parameters
            model = SVC(random_state=42, **best_params)
            model.fit(X_train_scaled, y_train)

        elif type(model).__name__ == 'KNeighborsClassifier':
            param_grid = {
                'n_neighbors': [3, 5, 7],
                'weights': ['uniform', 'distance'],
                'metric': ['euclidean', 'manhattan']
            }
            grid_search = GridSearchCV(model, param_grid, cv=5)
            grid_search.fit(X_train_scaled, y_train)

            best_params = grid_search.best_params_
            print("Best parameters:", best_params)

            # Re-train the model with the best parameters
            model = KNeighborsClassifier(**best_params)
            model.fit(X_train_scaled, y_train)

        elif type(model).__name__ == 'DecisionTreeClassifier':
            param_grid = {
                'max_depth': [None, 5, 10],
                'min_samples_split': [2, 5, 10]
            }
            grid_search = GridSearchCV(model, param_grid, cv=5)
            grid_search.fit(X_train_scaled, y_train)

            best_params = grid_search.best_params_
            print("Best parameters:", best_params)

            # Re-train the model with the best parameters
            model = DecisionTreeClassifier(random_state=42, **best_params)
            model.fit(X_train_scaled, y_train)

        if accuracy > best_accuracy:
            best_model = model
            best_accuracy = accuracy

        print("----------------------")

    # Step 8: Predict on new data
    new_data = pd.DataFrame({
        'Age': [21],
        'Height': [1.75],
        'Weight': [88],
        'FCVC': [2],
        'NCP': [3],
        'CH2O': [3],
        'FAF': [3],
        'TUE': [0]
    })

    new_data_scaled = scaler.transform(new_data)
    prediction = model.predict(new_data_scaled)
    print("Prediction:", prediction)

except Exception as e:
    print("An error occurred:", str(e))

Model: RandomForestClassifier
Accuracy: 0.950354609929078
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99        56
           1       0.91      0.94      0.92        62
           2       0.88      0.88      0.88        56
           3       0.96      0.92      0.94        50
           4       0.96      0.96      0.96        78
           5       0.97      0.97      0.97        58
           6       0.98      1.00      0.99        63

    accuracy                           0.95       423
   macro avg       0.95      0.95      0.95       423
weighted avg       0.95      0.95      0.95       423

----------------------
Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 150}
----------------------
Model: SVC
Accuracy: 0.8652482269503546
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.89      0.90        56
           1       0