In [177]:
!pip install xgboost



In [217]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


## **1. Load Data**

In [218]:
# 1. Load Data
# Read the training and test workbooks. On failure the problem is reported and
# the exception is re-raised: calling exit() inside a notebook cell asks the
# kernel to shut down instead of surfacing a normal, traceable error, whereas
# re-raising stops execution at this cell and keeps the traceback.
try:
    df_train = pd.read_excel('/content/train_data.xlsx')
    df_test = pd.read_excel('/content/test_data.xlsx')
except FileNotFoundError:
    print("File train_data.xlsx atau test_data.xlsx tidak ditemukan.")
    raise
except Exception as e:
    print(f"Terjadi kesalahan saat memuat data: {e}")
    raise
else:
    # Only reached when both files were read successfully.
    print("Data berhasil dimuat!")

Data berhasil dimuat!


## **2. Preprocessing Data**

In [233]:
# Keep an untouched deep copy of the test data (taken before any preprocessing)
# so the final output can show the original values.
df_test_original = df_test.copy(deep=True)

In [219]:
# Handle NaN: discard every training row that contains at least one missing value.
df_train = df_train.dropna(axis=0, how='any')

In [220]:
# Encoding variabel kategorikal
categorical_features = df_train.select_dtypes(include=['object']).columns
for col in categorical_features:
    df_train[col] = df_train[col].astype('category')
    df_train[col] = df_train[col].cat.codes

In [221]:
# Pemisahan fitur dan target
X = df_train.drop('Status BMI', axis=1)
y = df_train['Status BMI']

In [222]:
# Identifikasi kolom kategorikal dan numerikal
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['number']).columns

In [223]:
# Buat Preprocessing Pipelines
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('numerical', numerical_pipeline, numerical_features),
    ('categorical', categorical_pipeline, categorical_features)],
    remainder='passthrough'
)

In [224]:
# Fit preprocessor pada data latih
preprocessor.fit(X)

In [225]:
# Transform data latih
X_train = preprocessor.transform(X)

In [226]:
# Encoding variabel kategorikal pada data test
categorical_features_test = df_test.select_dtypes(include=['object']).columns
for col in categorical_features_test:
    df_test[col] = df_test[col].astype('category')
    df_test[col] = df_test[col].cat.codes

In [227]:
X_test_processed = preprocessor.transform(df_test)

## **3.  Pembuatan Model & Evaluasi Model**

In [188]:
# The test data was already transformed in the preceding cell; check the
# result instead of running the identical transform a second time.
assert X_test_processed.shape[0] == len(df_test)

In [202]:
# Model evaluation
def evaluate_model(model, X, y, model_name):
    """Estimate a model's accuracy with shuffled 10-fold cross-validation.

    Prints the mean and standard deviation of the per-fold accuracies.

    Args:
        model: Estimator passed to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.
        model_name: Label used in the printed report.

    Returns:
        The mean of the per-fold accuracy scores.
    """
    cv = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    mean_score = np.mean(scores)
    print(f"Akurasi Cross-Validation {model_name}: {mean_score:.4f} +/- {np.std(scores):.4f}")
    # cross_val_score works on clones of `model`, so handing `model` back would
    # return an unfitted estimator; return the score that callers compare.
    return mean_score

In [208]:
# Candidate classifiers, each wrapped in a single-step pipeline whose step is
# named 'model'.

# Gaussian Naive Bayes
pipeline_nb = Pipeline(steps=[('model', GaussianNB())])

# Decision tree (fixed random seed)
pipeline_dt = Pipeline(steps=[('model', DecisionTreeClassifier(random_state=42))])

# XGBoost classifier (fixed random seed)
pipeline_xgb = Pipeline(steps=[('model', XGBClassifier(random_state=42))])

In [206]:
# Model evaluation with cross-validation
def evaluate_model(pipeline, X, y, model_name):
    """Score a pipeline with shuffled 10-fold cross-validation.

    Prints the mean and standard deviation of the fold accuracies, labelled
    with ``model_name``, and returns the mean accuracy.
    """
    splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    fold_accuracies = cross_val_score(pipeline, X, y, cv=splitter, scoring='accuracy')
    mean_accuracy = np.mean(fold_accuracies)
    accuracy_spread = np.std(fold_accuracies)
    print(f"Akurasi Cross-Validation {model_name}: {mean_accuracy:.4f} +/- {accuracy_spread:.4f}")
    return mean_accuracy

## **4.  Pilih Model**

In [209]:
# Cross-validate every candidate on the preprocessed training data and keep
# each returned value for the comparison in the next step.
accuracy_nb = evaluate_model(pipeline_nb, X=X_train, y=y, model_name="Naive Bayes")
accuracy_dt = evaluate_model(pipeline_dt, X=X_train, y=y, model_name="Decision Tree")
accuracy_xgb = evaluate_model(pipeline_xgb, X=X_train, y=y, model_name="XGBoost")

Akurasi Cross-Validation Naive Bayes: 0.5352 +/- 0.0360
Akurasi Cross-Validation Decision Tree: 0.9320 +/- 0.0221
Akurasi Cross-Validation XGBoost: 0.9726 +/- 0.0085


In [235]:
# Pick the candidate with the highest cross-validation accuracy.
# Ties are resolved in the order Naive Bayes, Decision Tree, XGBoost.
print("\n=== Pemilihan Model Terbaik ===")
nb_is_best = accuracy_nb >= accuracy_dt and accuracy_nb >= accuracy_xgb
dt_is_best = accuracy_dt >= accuracy_nb and accuracy_dt >= accuracy_xgb
if nb_is_best:
    best_model = pipeline_nb
    winner_message = "Model terbaik: Naive Bayes"
elif dt_is_best:
    best_model = pipeline_dt
    winner_message = "Model terbaik: Decision Tree"
else:
    best_model = pipeline_xgb
    winner_message = "Model terbaik: XGBoost"
print(winner_message)


=== Pemilihan Model Terbaik ===
Model terbaik: XGBoost


Jadi, model terbaiknya adalah XGBoost karena memiliki rata-rata akurasi cross-validation tertinggi (0.9726) dengan standar deviasi terkecil (±0.0085).

## **5.  Prediksi pada Data Test**

In [236]:
# Train the selected model on the complete training set.
best_model.fit(X=X_train, y=y)

In [237]:
# Predict the target for the preprocessed test data.
predictions = best_model.predict(X=X_test_processed)

## **6. Menyimpan Hasil**

In [238]:
# Show the predictions: a heading line followed by the predicted values.
print("\nHasil Prediksi pada Data Test:", predictions, sep="\n")


Hasil Prediksi pada Data Test:
[2 2 4 0 2 6 1 3 4 1 3 4 5 4 2 2 1 4 1 4 3 5 1 3 2 4 4 3 0 0 2 1 4 6 1 5 0
 4 4 2 6 4 5 6 1 6 6 0 3 1 5 1 4 2 2 6 2 3 5 5 4 1 5 2 5 4 5 5 5 4 6 5 2 3
 4 5 4 4 4 5 6 6 1 2 6 1 5 0 4 0 2 2 1 1 5 1 1 6 6 4 2 1 0 1 3 0 4 3 2 3 3
 3 4 5 5 5 4 2 2 0 4 5 2 3 6 5 0 5 2 4 2 4 1 1 0 2 6 3 6 1 6 2 4 6 6 4 2 2
 3 5 0 5 1 2 0 6 0 1 0 6 5 4 4 2 0 3 5 0 0 5 1 2 5 6 4 5 6 5 3 5 4 6 4 2 3
 0 2 2 6 0 2 5 1 4 5 6 2 0 2 1 1 4 3 1 3 2 3 6 6 2 1 5 5 3 5 4 0 3 2 2 0 5
 0 5 4 3 2 4 6 2 4 4 1 5 3 5 6 3 0 3 4 2 0 3 6 4 0 2 3 2 2 6 2 3 6 1 4 0 6
 0 1 5 0 1 5 1 0 1 0 1 1 2 0 4 3 5 1 0 0 4 4 1 6 6 4 5 4 1 5 6 3 4 2 6 5 3
 2 4 6 4 6 6 2 2 5 1 2 1 5 3 5 5 1 4 2 1 6 3 4 6 0 1 1 5 6 6 0 6 6 3 2 1 0
 6 1 6 5 4 1 6 5 2 0 2 6 1 3 2 6 3 4 5 4 2 5 5 4 1 4 5 0 0 5 0 3 3 3 0 0 2
 2 5 2 2 3 2 5 3 5 3 4 2 1 0 1 5 4 2 6 5 3 1 3 2 2 1 0 6 2 4 4 5 2 3 2 4 3
 2 3 4 3 3 5 3 5 3 5 2 4 4]


In [239]:
# Add the predictions as a new column to the saved copy of the original test data.
# NOTE(review): `predictions` appears to hold integer class codes rather than
# the original label text — confirm this is the desired output format.
df_test_original['Status BMI (Prediksi)'] = predictions

In [240]:
# Write the original test rows together with their predictions to CSV
# (without the index column) and report where the file was saved.
df_test_original.to_csv(path_or_buf='hasil_prediksi_lengkap.csv', index=False)
print(
    "\nHasil prediksi telah disimpan dalam file hasil_prediksi_lengkap.csv",
    "File berisi data X asli beserta hasil prediksi Y",
    sep="\n",
)


Hasil prediksi telah disimpan dalam file hasil_prediksi_lengkap.csv
File berisi data X asli beserta hasil prediksi Y


In [242]:
# Also export the predictions alone as a single-column CSV without the index.
predictions_df = pd.DataFrame(data={'Status BMI': predictions})
predictions_df.to_csv(path_or_buf='hasil_prediksi.csv', index=False)
print("File prediksi Y saja juga telah disimpan dalam hasil_prediksi.csv")

File prediksi Y saja juga telah disimpan dalam hasil_prediksi.csv
