In [3]:
# Data manipulation and preprocessing
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Model selection and cross-validation
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

# Machine learning models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

# Evaluation metrics
from sklearn.metrics import make_scorer, accuracy_score, recall_score, f1_score

# Utilities
import numpy as np


# Load your dataset (replace 'dataset.csv' with the actual file path)
df = pd.read_csv('train.csv')

# Step 1: Encode 'Gender' (Binary encoding)
df['Gender'] = df['Gender'].apply(lambda x: 1 if x == 'Male' else 0)

# Step 2: Encode 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC' (Binary encoding)
binary_columns = ['family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
for col in binary_columns:
    df[col] = df[col].apply(lambda x: 1 if x == 'yes' else 0)

# Step 3: Ordinal encoding for 'FCVC', 'NCP', 'FAF', 'TUE'
# These are already numeric, so no transformation is necessary, but ensure they are integers
ordinal_columns = ['FCVC', 'NCP', 'FAF', 'TUE']
df[ordinal_columns] = df[ordinal_columns].astype(int)

# Step 4: One-hot encode 'CAEC', 'CALC', 'MTRANS' (One-hot encoding for categorical with multiple levels)
categorical_columns = ['CAEC', 'CALC', 'MTRANS']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Step 5: Feature Engineering - Calculate BMI
df['BMI'] = df['Weight'] / (df['Height'] ** 2)

# Step 6: Normalize or standardize continuous features (Age, Height, Weight, CH2O, BMI)
scaler = StandardScaler()
continuous_columns = ['Age', 'Height', 'Weight', 'CH2O', 'BMI']
df[continuous_columns] = scaler.fit_transform(df[continuous_columns])

# Step 7: Encode target variable 'NObeyesdad'
# Use LabelEncoder if it’s ordinal, or OneHotEncoder if not. Here we use LabelEncoder for simplicity.
label_encoder = LabelEncoder()
df['NObeyesdad'] = label_encoder.fit_transform(df['NObeyesdad'])

# Step 8: Split into features and target variable
X = df.drop('NObeyesdad', axis=1)
y = df['NObeyesdad']

# Step 9: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Final result
print("Training features shape:", X_train.shape)
print("Testing features shape:", X_test.shape)
print("Training labels shape:", y_train.shape)
print("Testing labels shape:", y_test.shape)

scoring_metrics = {
    'accuracy': make_scorer(accuracy_score),
    'recall': make_scorer(recall_score, average='weighted'),  # weighted for class imbalance
    'f1': make_scorer(f1_score, average='weighted')
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize models
models = {
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

# Dictionary to store results
model_results = {}

# Evaluate each model using cross-validation
for model_name, model in models.items():
    model_metrics = {}
    for metric_name, scorer in scoring_metrics.items():
        scores = cross_val_score(model, X, y, scoring=scorer, cv=cv)
        model_metrics[metric_name] = (np.mean(scores), np.std(scores))
    model_results[model_name] = model_metrics

# Display the results
for model_name, metrics in model_results.items():
    print(f"\nModel: {model_name}")
    for metric, (mean, std) in metrics.items():
        print(f"{metric.capitalize()}: {mean:.3f} ± {std:.3f}")


Training features shape: (1688, 24)
Testing features shape: (423, 24)
Training labels shape: (1688,)
Testing labels shape: (423,)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




Model: SVM
Accuracy: 0.946 ± 0.005
Recall: 0.946 ± 0.005
F1: 0.946 ± 0.004

Model: Random Forest
Accuracy: 0.984 ± 0.005
Recall: 0.984 ± 0.005
F1: 0.984 ± 0.005

Model: XGBoost
Accuracy: 0.985 ± 0.006
Recall: 0.985 ± 0.006
F1: 0.985 ± 0.005

Model: Decision Tree
Accuracy: 0.969 ± 0.010
Recall: 0.969 ± 0.010
F1: 0.969 ± 0.010


Preprocessing of the data - data loading , data parsing

Feature selection and engineering - BMI need to add a new column and calculate the BMI as stated in proposal

Choose algo XGBoost algo , SVM, Random Forest , Gradient Boosting Classifier , Decision Tree Classifier

data Split - 80 -20, Main evaluation metric for this problem - F1 score , recall and accuracy