In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, log_loss


In [17]:
train = pd.read_csv("/kaggle/input/mock-test-2-mse-2/train.csv")
test  = pd.read_csv("/kaggle/input/mock-test-2-mse-2/test.csv")

In [18]:
print(train.isna().sum())
print(test.isna().sum())

id                  0
N_Days              0
Drug             6541
Age                 0
Sex                 0
Ascites          6540
Hepatomegaly     6543
Spiders          6552
Edema               0
Bilirubin           0
Cholesterol      8375
Albumin             0
Copper           6657
Alk_Phos         6547
SGOT             6551
Tryglicerides    8422
Platelets         582
Prothrombin        19
Stage               0
Status              0
dtype: int64
id                  0
N_Days              0
Drug             4300
Age                 0
Sex                 0
Ascites          4296
Hepatomegaly     4300
Spiders          4303
Edema               0
Bilirubin           0
Cholesterol      5558
Albumin             0
Copper           4372
Alk_Phos         4304
SGOT             4304
Tryglicerides    5578
Platelets         357
Prothrombin        14
Stage               0
dtype: int64


In [19]:
cat_cols = train.select_dtypes(include=['object']).columns.tolist()
if "Status" in cat_cols:
    cat_cols.remove("Status")

In [20]:
num_cols = train.select_dtypes(include=['int64', 'float64']).columns

In [21]:
for col in cat_cols:
    train[col] = train[col].fillna(train[col].mode()[0])
    if col in test.columns:
        test[col]  = test[col].fillna(test[col].mode()[0])

In [22]:
for col in num_cols:
    train[col] = train[col].fillna(train[col].mean())
    if col in test.columns:
        test[col]  = test[col].fillna(test[col].mean())

In [23]:
le_dict = {}

for col in cat_cols:
    le = LabelEncoder()

    combined = pd.concat([train[col].astype(str),
                          test[col].astype(str)], axis=0)

    le.fit(combined)

    train[col] = le.transform(train[col].astype(str))
    test[col]  = le.transform(test[col].astype(str))

    le_dict[col] = le

In [24]:
X = train.drop(columns=["Status"])
y = train["Status"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [25]:
model = KNeighborsClassifier(
    n_neighbors=30,
    weights="distance"
)

model.fit(X_train, y_train)

In [26]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

print("Validation Accuracy:", accuracy_score(y_test, y_pred))
print("Validation Log Loss:", log_loss(y_test, y_pred_proba))

Validation Accuracy: 0.759
Validation Log Loss: 0.7558486298799435


In [27]:
final_pred_proba = model.predict_proba(test)

In [28]:
submission = pd.DataFrame({
    "id": test["id"],
    "Status_C":  final_pred_proba[:, 0],
    "Status_CL": final_pred_proba[:, 1],
    "Status_D":  final_pred_proba[:, 2]
})

submission.to_csv("/kaggle/working/Answer.csv", index=False)
submission.head()

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,15000,0.780111,0.0,0.219889
1,15001,1.0,0.0,0.0
2,15002,0.601342,0.0,0.398658
3,15003,0.67948,0.022299,0.29822
4,15004,0.81324,0.058007,0.128753
