In [1]:
# 1. Import Library
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter
import json
import joblib

# 2. Load Data
# Read the combined data set from the working directory. When the file is
# missing, print the hint and re-raise the original error: calling exit() in a
# notebook shuts the kernel down (losing all state) and is not meant for use
# in programs, whereas a raised exception simply stops the cell.
try:
    df = pd.read_csv("gabungan_dataset_lengkap.csv")
except FileNotFoundError:
    print("File gabungan_dataset_lengkap.csv tidak ditemukan.")
    raise
print("File loaded successfully.")

# 3. Data Understanding
# Quick overview of the raw frame: schema, summary statistics, missing values,
# a few sample rows, and the class balance of the target column.
print("\n--- Info Dataset ---")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line after the report.
df.info()
print("\n--- Statistik Deskriptif ---")
print(df.describe(include='all'))
print("\n--- Nilai Null per Kolom ---")
print(df.isnull().sum())
print("\n--- Contoh Data ---")
print(df.head())

# Name of the label column; the rest of the notebook reads this variable.
target_col = 'prognosis'

print("\n--- Distribusi Kelas (sebelum pembersihan) ---")
# dropna=False keeps missing targets visible in the count.
print(df[target_col].value_counts(dropna=False))

# 4. Data Preparation
# Discard every row that has a missing value in any column.
df = df.dropna()
print(f"\nData setelah hapus baris NaN, shape: {df.shape}")

# Explicitly require a non-null target as well.
df = df.dropna(subset=[target_col])

# Classes represented by fewer than two rows are removed from the data set.
counts = df[target_col].value_counts()
rare_classes = counts.index[(counts < 2).to_numpy()]
if not rare_classes.empty:
    print(f"Menghapus kelas dengan sangat sedikit data: {list(rare_classes)}")
    df = df.loc[~df[target_col].isin(rare_classes)]

print("\n--- Distribusi Kelas (setelah pembersihan) ---")
print(df[target_col].value_counts())

# Separate the target column from the feature columns.
y = df[target_col]
X = df.drop(columns=[target_col])

# Replace every object-dtype feature column with integer codes.
# NOTE(review): the per-column encoders are not kept after this loop, so the
# same string-to-code mapping cannot be re-applied at prediction time —
# confirm that no object-dtype feature actually needs it.
object_columns = [name for name in X.columns if X[name].dtype == 'object']
for name in object_columns:
    X[name] = LabelEncoder().fit_transform(X[name].astype(str))

# Encode the target labels as integers with a dedicated encoder named `le`.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Hold-out split, oversampling, feature selection, tuning and evaluation.
#
# The test set is carved out BEFORE SMOTE and before feature selection.
# Oversampling the full data set first would put synthetic points interpolated
# from test rows into the training set (and vice versa), which inflates the
# reported test accuracy. Only the training portion is resampled and used to
# rank features; the test rows stay untouched, real observations.
X_np = X.values

class_counts = Counter(y_encoded)
min_class_count = min(class_counts.values())
print(f"Jumlah data terkecil pada kelas: {min_class_count}")

if min_class_count <= 1:
    # A stratified split needs at least two rows per class. Raise instead of
    # calling exit(), which would shut down the notebook kernel.
    raise ValueError(
        "Tidak cukup data pada kelas minoritas setelah filter. Tambah data atau hapus kelas minoritas."
    )

X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X_np, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
print(f"Data latih (asli): {len(y_train_raw)}, data uji (asli): {len(y_test)}")

# Oversample the training rows only. SMOTE interpolates between neighbours of
# the same class, so a class needs at least two training rows to take part;
# classes left with a single training row are kept as they are.
train_counts = Counter(y_train_raw)
majority_count = max(train_counts.values())
smote_targets = {
    cls: majority_count
    for cls, n_rows in train_counts.items()
    if 2 <= n_rows < majority_count
}
singleton_classes = [cls for cls, n_rows in train_counts.items() if n_rows < 2]
if singleton_classes:
    print(
        "Kelas dengan satu sampel latih tidak di-oversample: "
        f"{list(le.inverse_transform(singleton_classes))}"
    )

if smote_targets:
    # k_neighbors must be smaller than the smallest class being oversampled.
    k_neighbors = min(5, min(train_counts[cls] for cls in smote_targets) - 1)
    print(f"SMOTE akan menggunakan k_neighbors={k_neighbors}")

    smote = SMOTE(random_state=42, k_neighbors=k_neighbors, sampling_strategy=smote_targets)
    X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)
else:
    # Nothing to balance (or nothing that can be balanced): skip SMOTE.
    k_neighbors = None
    print("SMOTE dilewati: tidak ada kelas yang perlu atau dapat di-oversample.")
    X_resampled, y_resampled = X_train_raw, y_train_raw

print(f"Data setelah SMOTE, jumlah contoh: {len(y_resampled)}")
print(pd.Series(y_resampled).value_counts())

# Rank features with a quick forest fitted on the (resampled) training rows
# and keep those whose importance is at least the median importance.
model_rf_temp = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf_temp.fit(X_resampled, y_resampled)

importances = model_rf_temp.feature_importances_
threshold = np.median(importances)
important_indices = np.where(importances >= threshold)[0]
selected_features = X.columns[important_indices]

print(f"Memilih fitur penting sebanyak: {len(selected_features)} dari total {X.shape[1]} fitur.")

# Apply the same column selection to the training and the held-out rows.
X_resampled_selected = X_resampled[:, important_indices]
X_train, y_train = X_resampled_selected, y_resampled
X_test = X_test_raw[:, important_indices]

# 5. Modelling
param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=42, n_jobs=-1)
random_search = RandomizedSearchCV(
    rf, param_distributions=param_dist, n_iter=30, cv=5,
    scoring='accuracy', random_state=42, n_jobs=-1, verbose=1
)
random_search.fit(X_train, y_train)

print(f"\nBest parameters: {random_search.best_params_}")
# The CV folds are cut from already-oversampled rows, so this score is
# optimistic; the hold-out accuracy below is the unbiased estimate.
print(f"Best CV score: {random_search.best_score_:.4f}")

best_model = random_search.best_estimator_

# 6. Evaluation
y_pred = best_model.predict(X_test)

print(f"\nAccuracy test set: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
# Pass the full label list explicitly: a small class may be absent from the
# held-out rows, and without `labels` the report would raise because the
# number of observed classes would not match len(target_names).
all_labels = np.arange(len(le.classes_))
print(classification_report(
    y_test, y_pred, labels=all_labels, target_names=le.classes_, zero_division=0
))

# Cross-validation on the resampled training rows only (the test rows are not
# included). Like the search score above, it is optimistic because synthetic
# samples appear in the validation folds.
cv_scores = cross_val_score(best_model, X_resampled_selected, y_resampled, cv=5, n_jobs=-1)
print(f"Cross-validation accuracy scores: {cv_scores}")
print(f"Mean CV accuracy: {cv_scores.mean():.4f}")

# 7. Save model, label encoder, and selected features using joblib
# Binary artifacts: the tuned model and the target label encoder.
for artifact, filename in (
    (best_model, 'randomforest_model.joblib'),
    (le, 'label_encoder.joblib'),
):
    joblib.dump(artifact, filename)

# Names of the selected feature columns, stored as a JSON list.
with open('selected_features.json', 'w') as f:
    f.write(json.dumps(list(selected_features)))

# Run metadata: SMOTE setting, feature/sample counts, and tuned parameters.
info = dict(
    k_neighbors_smote=k_neighbors,
    num_features_before=X.shape[1],
    num_features_after=len(selected_features),
    num_samples_before=len(y_encoded),
    num_samples_after=len(y_resampled),
    best_params=random_search.best_params_,
)

with open('training_info.json', 'w') as f:
    f.write(json.dumps(info))

print("\nModel, label encoder, fitur penting, dan metadata berhasil disimpan dengan joblib.")


File loaded successfully.

--- Info Dataset ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11828 entries, 0 to 11827
Columns: 501 entries, gelisah dan gugup to sensasi_berputar
dtypes: float64(1), int64(499), object(1)
memory usage: 45.2+ MB
None

--- Statistik Deskriptif ---
        gelisah dan gugup       depresi  Kekeringan napas  \
count        11828.000000  11828.000000      11828.000000   
unique                NaN           NaN               NaN   
top                   NaN           NaN               NaN   
freq                  NaN           NaN               NaN   
mean             0.059689      0.062817          0.098664   
std              0.236920      0.242644          0.298223   
min              0.000000      0.000000          0.000000   
25%              0.000000      0.000000          0.000000   
50%              0.000000      0.000000          0.000000   
75%              0.000000      0.000000          0.000000   
max              1.000000      1.000000      

In [5]:
import json
import pandas as pd
import joblib

# Load the trained model and the target label encoder from their joblib files.
# NOTE: joblib files are pickle-based and can execute code when loaded; only
# load artifacts that you produced yourself or otherwise trust.
model = joblib.load('randomforest_model.joblib')
le = joblib.load('label_encoder.joblib')

# Load the ordered list of feature (symptom) columns from JSON.
with open('selected_features.json', 'r') as f:
    feature_columns = json.load(f)


def parse_symptom_selection(raw_text, n_features):
    """Parse a comma-separated list of 1-based symptom numbers.

    Args:
        raw_text: Text typed by the user, e.g. "1, 5, 12".
        n_features: Number of selectable symptoms; valid numbers are
            1..n_features.

    Returns:
        A tuple ``(indices, rejected)``: ``indices`` holds the unique 0-based
        positions in input order, ``rejected`` holds the tokens that were not
        plain decimal numbers or were out of range.
    """
    indices, rejected = [], []
    for token in (part.strip() for part in raw_text.split(",")):
        if not token:
            continue
        # isdecimal() (unlike isdigit()) only accepts characters that int()
        # can convert, so inputs such as "²" are rejected instead of crashing.
        if not token.isdecimal():
            rejected.append(token)
            continue
        position = int(token) - 1
        if 0 <= position < n_features:
            if position not in indices:
                indices.append(position)
        else:
            rejected.append(token)
    return indices, rejected


# Ask whether the user wants to enter symptoms at all.
answer = input("Apakah Anda ingin memasukkan gejala? (Ya/Tidak): ").strip().lower()

if answer == 'ya':
    print("\nPilih gejala yang Anda alami:")
    for i, feat in enumerate(feature_columns, 1):
        print(f"{i}. {feat}")

    input_str = input("\nMasukkan nomor gejala (pisahkan dengan koma): ")
    selected_indices, rejected_tokens = parse_symptom_selection(input_str, len(feature_columns))

    # Tell the user which entries were ignored instead of dropping them silently.
    if rejected_tokens:
        print(f"Input berikut diabaikan karena tidak valid: {rejected_tokens}")

    if not selected_indices:
        # An all-zero vector carries no information; do not present a
        # prediction for it as if it were a diagnosis.
        print("Tidak ada nomor gejala yang valid. Prediksi tidak dijalankan.")
    else:
        # Start with every feature at 0 and switch the chosen symptoms to 1.
        input_dict = {feat: 0 for feat in feature_columns}
        for idx in selected_indices:
            input_dict[feature_columns[idx]] = 1

        # Single-row frame with the columns in the training order.
        input_vector = [input_dict[col] for col in feature_columns]
        input_df = pd.DataFrame([input_vector], columns=feature_columns)

        # The model saved by the training cell was fitted on a plain NumPy
        # array, so passing a DataFrame triggers scikit-learn's feature-name
        # warning. Only keep the DataFrame if the model recorded feature names.
        if hasattr(model, "feature_names_in_"):
            model_input = input_df
        else:
            model_input = input_df.to_numpy()

        # Predict and map the encoded class back to its original label.
        predicted_class_index = model.predict(model_input)[0]
        predicted_label = le.inverse_transform([predicted_class_index])[0]

        print("\n=== Hasil Prediksi Penyakit ===")
        # str() guards against non-string class labels.
        print(f"Penyakit yang diprediksi: {str(predicted_label).upper()}")

else:
    print("Baik, tidak ada input gejala diberikan. Program selesai.")



Pilih gejala yang Anda alami:
1. gelisah dan gugup
2. depresi
3. Kekeringan napas
4. depresif atau gejala psikotik
5. nyeri dada tajam
6. pusing
7. insomnia
8. gerakan abnormal tidak disengaja
9. dada tilf
10. papitasi
11. detak jantung tidak teratur
12. bernafas dengan cepat
13. suara serak
14. sakit tenggorokan
15. kesulitan berbicara
16. batuk
17. kemacetan hidung
18. tenggorokan pembengkakan
19. pendengaran berkurang
20. benjolan di tenggorokan
21. Tenggorokan terasa ketat
22. kesulitan menelan
23. pembengkakan kulit
24. retensi urin
25. groin massa
26. sakit kaki
27. hip pain
28. nyeri suprapubic
29. darah di bangku
30. kekurangan pertumbuhan
31. gejala emosional
32. kelemahan siku
33. kembali kelemahan
34. pus dalam sputum
35. gejala skrotum dan testis
36. pembengkakan skrotum
37. nyeri di testis
38. datar
39. nanah dari telinga
40. kuning
41. massa dalam skrotum
42. putih debit dari mata
43. bayi yang mudah marah
44. penyalahgunaan alkohol
45. pingsan
46. perilaku bermusuhan
47

