PKL dan JSON

latih Model

In [55]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from joblib import dump
import json
import numpy as np
import pickle

# Load data dengan pengecekan error sederhana
try:
    df = pd.read_csv("gabungan_dataset_lengkap.csv")
except FileNotFoundError:
    print("File gabungan_dataset_lengkap.csv tidak ditemukan.")
    exit()

# Drop baris dengan target kosong
df.dropna(subset=['prognosis'], inplace=True)

# Pisahkan fitur dan label
X = df.drop(columns=['prognosis'])
y = df['prognosis']

# Encode fitur kategorikal (pastikan fitur kategorikal memang perlu encode)
for col in X.columns:
    if X[col].dtype == 'object':
        le_feat = LabelEncoder()
        X[col] = le_feat.fit_transform(X[col].astype(str))

# Encode label prognosis
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Simpan daftar fitur ke JSON
feature_columns = list(X.columns)
with open('feature_columns.json', 'w') as f:
    json.dump(feature_columns, f)

# Simpan mapping label ke JSON (mapping index ke label string)
label_mapping = {int(i): label for i, label in enumerate(le.classes_)}
with open('label_mapping.json', 'w') as f:
    json.dump(label_mapping, f)

# Split data train-test
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Latih model RandomForest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluasi model
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# Tampilkan classification report hanya untuk label yang muncul
unique_labels = np.unique(y_test)
target_names = [le.classes_[i] for i in unique_labels]
print(classification_report(y_test, y_pred, labels=unique_labels, target_names=target_names, zero_division=0))

# Simpan model dan label encoder juga sebagai .pkl dan .joblib (optional)
with open('randomforest_model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

# Simpan juga model dan encoder dengan joblib, kalau mau
dump(model, 'randomforest_model.joblib')
dump(le, 'label_encoder.joblib')

print("Model, label encoder, dan mapping label berhasil disimpan dalam file .pkl, .joblib, dan .json.")


Accuracy: 0.9763
                                             precision    recall  f1-score   support

                          Cedera ke pinggul       1.00      1.00      1.00         3
               Gangguan kelenjar salivarium       1.00      1.00      1.00         8
                         Gangguan metabolis       1.00      1.00      1.00         7
                      Gangguan nyeri kronis       0.97      0.95      0.96       134
                           Pankreatsis akut       0.97      0.98      0.98       111
                                atelektasis       1.00      1.00      1.00        22
                         atrophic vaginitis       0.93      0.99      0.96        67
                            cedera di lutut       0.93      1.00      0.96        13
                           cedera di tangan       0.97      0.97      0.97        31
                               cedera perut       1.00      1.00      1.00        10
                                   cirhosis    

In [72]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter
import json
import pickle

# Load data
try:
    df = pd.read_csv("gabungan_dataset_lengkap.csv")
except FileNotFoundError:
    print("File gabungan_dataset_lengkap.csv tidak ditemukan.")
    exit()

# Drop baris dengan target kosong
df.dropna(subset=['prognosis'], inplace=True)

# Hapus kelas dengan kurang dari 2 data supaya stratify bisa jalan
counts = df['prognosis'].value_counts()
rare_classes = counts[counts < 2].index
if len(rare_classes) > 0:
    print(f"Menghapus kelas dengan sangat sedikit data: {list(rare_classes)}")
    df = df[~df['prognosis'].isin(rare_classes)]

# Pisahkan fitur dan label
X = df.drop(columns=['prognosis'])
y = df['prognosis']

# Pastikan fitur numerik, encode fitur kategorikal jika ada
for col in X.columns:
    if X[col].dtype == 'object':
        le_feat = LabelEncoder()
        X[col] = le_feat.fit_transform(X[col].astype(str))

# Encode label prognosis
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Pastikan data numpy array
X_np = X.values

# Cek distribusi kelas
class_counts = Counter(y_encoded)
min_class_count = min(class_counts.values())
print(f"Jumlah data terkecil pada kelas: {min_class_count}")

if min_class_count <= 1:
    print("Tidak cukup data pada kelas minoritas setelah filter. Tambah data atau hapus kelas minoritas.")
    exit()

# Set k_neighbors SMOTE adaptif
k_neighbors = min(5, min_class_count - 1)
print(f"SMOTE akan menggunakan k_neighbors={k_neighbors}")

# Terapkan SMOTE oversampling
smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_resampled, y_resampled = smote.fit_resample(X_np, y_encoded)

print(f"Data setelah SMOTE, jumlah contoh: {len(y_resampled)}")
print(pd.Series(y_resampled).value_counts())

# Latih model sementara untuk feature importance
model_rf_temp = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf_temp.fit(X_resampled, y_resampled)

importances = model_rf_temp.feature_importances_

# Pilih fitur penting yang importancenya >= median (atau ganti threshold sesuai kebutuhan)
threshold = np.median(importances)
important_indices = np.where(importances >= threshold)[0]
selected_features = X.columns[important_indices]

print(f"Memilih fitur penting sebanyak: {len(selected_features)}")

X_resampled_selected = X_resampled[:, important_indices]

# Split train-test dengan stratify
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled_selected, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

# Hyperparameter tuning RandomForest dengan RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=42, n_jobs=-1)
random_search = RandomizedSearchCV(
    rf, param_distributions=param_dist, n_iter=30, cv=5,
    scoring='accuracy', random_state=42, n_jobs=-1, verbose=1
)
random_search.fit(X_train, y_train)

print(f"Best parameters: {random_search.best_params_}")
print(f"Best CV score: {random_search.best_score_:.4f}")

best_model = random_search.best_estimator_

# Evaluasi model pada test set
y_pred = best_model.predict(X_test)
print(f"Accuracy test set: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0))

# Cross-validation score untuk stabilitas model
cv_scores = cross_val_score(best_model, X_resampled_selected, y_resampled, cv=5, n_jobs=-1)
print(f"Cross-validation accuracy scores: {cv_scores}")
print(f"Mean CV accuracy: {cv_scores.mean():.4f}")

# Simpan model, label encoder, dan fitur terpilih
with open('randomforest_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

with open('selected_features.json', 'w') as f:
    json.dump(list(selected_features), f)

# Simpan metadata training
info = {
    "k_neighbors_smote": k_neighbors,
    "num_features_before": X.shape[1],
    "num_features_after": len(selected_features),
    "num_samples_before": len(y_encoded),
    "num_samples_after": len(y_resampled),
    "best_params": random_search.best_params_
}
with open('training_info.json', 'w') as f:
    json.dump(info, f)

print("Model, label encoder, fitur penting, dan metadata berhasil disimpan.")


Menghapus kelas dengan sangat sedikit data: ['sindrom turner']
Jumlah data terkecil pada kelas: 5
SMOTE akan menggunakan k_neighbors=4
Data setelah SMOTE, jumlah contoh: 35607
16    913
28    913
11    913
21    913
6     913
14    913
31    913
17    913
30    913
35    913
1     913
25    913
7     913
2     913
38    913
34    913
36    913
18    913
15    913
32    913
29    913
23    913
3     913
24    913
20    913
5     913
8     913
22    913
0     913
10    913
37    913
26    913
12    913
13    913
19    913
33    913
27    913
9     913
4     913
Name: count, dtype: int64
Memilih fitur penting sebanyak: 500
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best parameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 40, 'bootstrap': False}
Best CV score: 0.9929
Accuracy test set: 0.9914
                                             precision    recall  f1-score   support

                          Cedera ke pinggul       1.

In [80]:
import json
import pandas as pd
import pickle

# Load model dan label encoder dari pkl
with open('randomforest_model.pkl', 'rb') as f:
    model = pickle.load(f)

with open('label_encoder.pkl', 'rb') as f:
    le = pickle.load(f)

# Load fitur kolom penting yang dipakai model (hasil feature selection)
with open('selected_features.json', 'r') as f:
    feature_columns = json.load(f)

answer = input("Apakah Anda ingin memasukkan gejala? (Ya/Tidak): ").strip().lower()

if answer == 'ya':
    print("\nPilih gejala yang Anda alami:")
    for i, feat in enumerate(feature_columns, 1):
        print(f"{i}. {feat}")

    input_str = input("\nMasukkan nomor gejala (pisahkan dengan koma): ")
    selected_indices = [int(x.strip()) - 1 for x in input_str.split(",") if x.strip().isdigit()]

    # Buat dict input dengan default 0
    input_dict = {feat: 0 for feat in feature_columns}
    for idx in selected_indices:
        if 0 <= idx < len(feature_columns):
            input_dict[feature_columns[idx]] = 1

    input_vector = [input_dict[col] for col in feature_columns]
    input_df = pd.DataFrame([input_vector], columns=feature_columns)

    # Prediksi
    pred_encoded = model.predict(input_df)[0]
    pred_label = le.inverse_transform([pred_encoded])[0]

    print("\n=== Hasil Prediksi Penyakit ===")
    print(f"Penyakit yang diprediksi: {pred_label.upper()}")

else:
    print("Baik, tidak ada input gejala diberikan. Program selesai.")


KeyboardInterrupt: Interrupted by user

Prediksi

In [81]:
import json
import pandas as pd
import pickle

# Load model dan label encoder dari pkl
with open('randomforest_model.pkl', 'rb') as f:
    model = pickle.load(f)

with open('label_encoder.pkl', 'rb') as f:
    le = pickle.load(f)

# Load fitur kolom dari JSON
with open('feature_columns.json', 'r') as f:
    feature_columns = json.load(f)

# Tanya user mau input gejala atau tidak
answer = input("Apakah Anda ingin memasukkan gejala? (Ya/Tidak): ").strip().lower()

if answer == 'ya':
    print("\nPilih gejala yang Anda alami:")
    for i, feat in enumerate(feature_columns, 1):
        print(f"{i}. {feat}")

    input_str = input("\nMasukkan nomor gejala (pisahkan dengan koma): ")
    selected_indices = [int(x.strip()) - 1 for x in input_str.split(",") if x.strip().isdigit()]

    # Buat dictionary fitur dengan nilai default 0
    input_dict = {feat: 0 for feat in feature_columns}
    for idx in selected_indices:
        if 0 <= idx < len(feature_columns):
            input_dict[feature_columns[idx]] = 1

    # Buat dataframe dari input
    input_vector = [input_dict[col] for col in feature_columns]
    input_df = pd.DataFrame([input_vector], columns=feature_columns)

    # Prediksi
    predicted_class_index = model.predict(input_df)[0]
    predicted_label = le.inverse_transform([predicted_class_index])[0]

    print("\n=== Hasil Prediksi Penyakit ===")
    print(f"Penyakit yang diprediksi: {predicted_label.upper()}")

else:
    print("Baik, tidak ada input gejala diberikan. Program selesai.")



Pilih gejala yang Anda alami:
1. gelisah dan gugup
2. depresi
3. Kekeringan napas
4. depresif atau gejala psikotik
5. nyeri dada tajam
6. pusing
7. insomnia
8. gerakan abnormal tidak disengaja
9. dada tilf
10. papitasi
11. detak jantung tidak teratur
12. bernafas dengan cepat
13. suara serak
14. sakit tenggorokan
15. kesulitan berbicara
16. batuk
17. kemacetan hidung
18. tenggorokan pembengkakan
19. pendengaran berkurang
20. benjolan di tenggorokan
21. Tenggorokan terasa ketat
22. kesulitan menelan
23. pembengkakan kulit
24. retensi urin
25. groin massa
26. sakit kaki
27. hip pain
28. nyeri suprapubic
29. darah di bangku
30. kekurangan pertumbuhan
31. gejala emosional
32. kelemahan siku
33. kembali kelemahan
34. pus dalam sputum
35. gejala skrotum dan testis
36. pembengkakan skrotum
37. nyeri di testis
38. datar
39. nanah dari telinga
40. kuning
41. massa dalam skrotum
42. putih debit dari mata
43. bayi yang mudah marah
44. penyalahgunaan alkohol
45. pingsan
46. perilaku bermusuhan
47

