# Description
This model was developed by an amateur data scientist. If you have any critiques or suggestions, please let me know. I would love any good feedback from you. I am still learning in this section. Thank you!

# Import Library

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/playground-series-s3e26/sample_submission.csv
/kaggle/input/playground-series-s3e26/train.csv
/kaggle/input/playground-series-s3e26/test.csv


# Data Wrangling (Train Data)
This section is about gathering data, assessing data, and cleaning data.

In [2]:
train_df = pd.read_csv("/kaggle/input/playground-series-s3e26/train.csv")
train_df.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,999,D-penicillamine,21532,M,N,N,N,N,2.3,316.0,3.35,172.0,1601.0,179.8,63.0,394.0,9.7,3.0,D
1,1,2574,Placebo,19237,F,N,N,N,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C
2,2,3428,Placebo,13727,F,N,Y,Y,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D
3,3,2576,Placebo,18460,F,N,N,N,N,0.6,256.0,3.5,58.0,1653.0,71.3,96.0,269.0,10.7,3.0,C
4,4,788,Placebo,16658,F,N,Y,N,N,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,C


In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7905 entries, 0 to 7904
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             7905 non-null   int64  
 1   N_Days         7905 non-null   int64  
 2   Drug           7905 non-null   object 
 3   Age            7905 non-null   int64  
 4   Sex            7905 non-null   object 
 5   Ascites        7905 non-null   object 
 6   Hepatomegaly   7905 non-null   object 
 7   Spiders        7905 non-null   object 
 8   Edema          7905 non-null   object 
 9   Bilirubin      7905 non-null   float64
 10  Cholesterol    7905 non-null   float64
 11  Albumin        7905 non-null   float64
 12  Copper         7905 non-null   float64
 13  Alk_Phos       7905 non-null   float64
 14  SGOT           7905 non-null   float64
 15  Tryglicerides  7905 non-null   float64
 16  Platelets      7905 non-null   float64
 17  Prothrombin    7905 non-null   float64
 18  Stage   

In [4]:
print("Null values = \n", train_df.isna().sum())

Null values = 
 id               0
N_Days           0
Drug             0
Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
Status           0
dtype: int64


In [5]:
print("Duplicated data =",train_df.duplicated().sum())

Duplicated data = 0


In [6]:
train_df.describe()

Unnamed: 0,id,N_Days,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
count,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0
mean,3952.0,2030.173308,18373.14649,2.594485,350.561923,3.548323,83.902846,1816.74525,114.604602,115.340164,265.228969,10.629462,3.032511
std,2282.121272,1094.233744,3679.958739,3.81296,195.379344,0.346171,75.899266,1903.750657,48.790945,52.530402,87.465579,0.781735,0.866511
min,0.0,41.0,9598.0,0.3,120.0,1.96,4.0,289.0,26.35,33.0,62.0,9.0,1.0
25%,1976.0,1230.0,15574.0,0.7,248.0,3.35,39.0,834.0,75.95,84.0,211.0,10.0,2.0
50%,3952.0,1831.0,18713.0,1.1,298.0,3.58,63.0,1181.0,108.5,104.0,265.0,10.6,3.0
75%,5928.0,2689.0,20684.0,3.0,390.0,3.77,102.0,1857.0,137.95,139.0,316.0,11.0,4.0
max,7904.0,4795.0,28650.0,28.0,1775.0,4.64,588.0,13862.4,457.25,598.0,563.0,18.0,4.0


In [7]:
drug_labels = train_df['Drug'].unique().tolist()
sex_labels = train_df['Sex'].unique().tolist()
ascites_labels = train_df['Ascites'].unique().tolist()
hepatomegaly_labels = train_df['Hepatomegaly'].unique().tolist()
spiders_labels = train_df['Spiders'].unique().tolist()
edema_labels = train_df['Edema'].unique().tolist()

In [8]:
train_df["Drug"] = train_df["Drug"].apply(lambda x: drug_labels.index(x))
train_df["Sex"] = train_df["Sex"].apply(lambda x: sex_labels.index(x))
train_df["Ascites"] = train_df["Ascites"].apply(lambda x: ascites_labels.index(x))
train_df["Hepatomegaly"] = train_df["Hepatomegaly"].apply(lambda x: hepatomegaly_labels.index(x))
train_df["Spiders"] = train_df["Spiders"].apply(lambda x: spiders_labels.index(x))
train_df["Edema"] = train_df["Edema"].apply(lambda x: edema_labels.index(x))

In [9]:
train_df.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,999,0,21532,0,0,0,0,0,2.3,316.0,3.35,172.0,1601.0,179.8,63.0,394.0,9.7,3.0,D
1,1,2574,1,19237,1,0,0,0,0,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C
2,2,3428,1,13727,1,0,1,1,1,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D
3,3,2576,1,18460,1,0,0,0,0,0.6,256.0,3.5,58.0,1653.0,71.3,96.0,269.0,10.7,3.0,C
4,4,788,1,16658,1,0,1,0,0,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,C


In [10]:
trainStatusClass_df = pd.get_dummies(train_df['Status'],dtype=int).reset_index()
trainStatusClass_df.head()

Unnamed: 0,index,C,CL,D
0,0,0,0,1
1,1,1,0,0
2,2,0,0,1
3,3,1,0,0
4,4,1,0,0


In [11]:
trainStatusClass_df = trainStatusClass_df.rename(columns={'index':'id'})
trainStatusClass_df.tail()

Unnamed: 0,id,C,CL,D
7900,7900,1,0,0
7901,7901,1,0,0
7902,7902,0,0,1
7903,7903,0,0,1
7904,7904,1,0,0


In [12]:
trainStatusClass_df = trainStatusClass_df.drop(['id'], axis=1)
trainStatusClass_df.head(2)

Unnamed: 0,C,CL,D
0,0,0,1
1,1,0,0


# Train the Data
This section is about making models and training data. I divided the target into three different data frames based on three classes. I made predictions for three different classes, C, CL, and D. I used a Bagging Classifier with Extra Tree Classifier estimator.

In [13]:
# Divided status dataframe into three different dataframe
trainStatusClassC_df = trainStatusClass_df['C']
trainStatusClassCL_df = trainStatusClass_df['CL']
trainStatusClassD_df = trainStatusClass_df['D']

In [14]:
# Prediction to C status
xC = train_df.drop(['id', 'Status'], axis=1)
yC = trainStatusClassC_df

print(xC.shape)
print(yC.shape)

(7905, 18)
(7905,)


In [15]:
xC_train, xC_test, yC_train, yC_test = train_test_split(xC, yC, test_size=0.3, random_state=42)

In [16]:
model_C = BaggingClassifier(estimator=ExtraTreeClassifier(max_depth=2, random_state=10), n_estimators=100)
model_C.fit(xC_train, yC_train)

In [17]:
scores_C = cross_val_score(model_C, xC_train, yC_train, cv=5)
print("cross_val_score mean =", scores_C.mean())
yC_pred = model_C.predict(xC_test)
print("accuracy_score =", accuracy_score(yC_test, yC_pred))
print("score =", model_C.score(xC_test, yC_test))

cross_val_score mean = 0.7572735395828943
accuracy_score = 0.7407251264755481
score = 0.7407251264755481


In [18]:
# Prediction to CL status
xCL = train_df.drop(['id', 'Status'], axis=1)
yCL = trainStatusClassCL_df

print(xCL.shape)
print(yCL.shape)

(7905, 18)
(7905,)


In [19]:
xCL_train, xCL_test, yCL_train, yCL_test = train_test_split(xCL, yCL, test_size=0.3, random_state=42)

In [20]:
model_CL = BaggingClassifier(estimator=ExtraTreeClassifier(max_depth=2, random_state=10), n_estimators=100)
model_CL.fit(xCL_train, yCL_train)

In [21]:
scores_CL = cross_val_score(model_CL, xCL_train, yCL_train, cv=5)
print("cross_val_score mean =", scores_CL.mean())
yCL_pred = model_CL.predict(xCL_test)
print("accuracy_score =", accuracy_score(yCL_test, yCL_pred))
print("score =", model_CL.score(xCL_test, yCL_test))

cross_val_score mean = 0.9652992382847276
accuracy_score = 0.965008431703204
score = 0.965008431703204


In [22]:
# Prediction to D status
xD = train_df.drop(['id', 'Status'], axis=1)
yD = trainStatusClassD_df

print(xD.shape)
print(yD.shape)

(7905, 18)
(7905,)


In [23]:
xD_train, xD_test, yD_train, yD_test = train_test_split(xD, yD, test_size=0.3, random_state=42)

In [24]:
model_D = BaggingClassifier(estimator=ExtraTreeClassifier(max_depth=2, random_state=10), n_estimators=100)
model_D.fit(xD_train, yD_train)

In [25]:
scores_D = cross_val_score(model_D, xD_train, yD_train, cv=5)
print("cross_val_score mean =", scores_D.mean())
yD_pred = model_D.predict(xD_test)
print("accuracy_score =", accuracy_score(yD_test, yD_pred))
print("score =", model_D.score(xD_test, yD_test))

cross_val_score mean = 0.754927626431177
accuracy_score = 0.7331365935919055
score = 0.7331365935919055


# Data Wrangling (Test Data)
This section is about gathering data, assessing data, and cleaning data. I did this to synchronize the test data with the train data.

In [26]:
test_df = pd.read_csv("/kaggle/input/playground-series-s3e26/test.csv")
test_df.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,7905,3839,D-penicillamine,19724,F,N,Y,N,N,1.2,546.0,3.37,65.0,1636.0,151.9,90.0,430.0,10.6,2.0
1,7906,2468,D-penicillamine,14975,F,N,N,N,N,1.1,660.0,4.22,94.0,1257.0,151.9,155.0,227.0,10.0,2.0
2,7907,51,Placebo,13149,F,N,Y,N,Y,2.0,151.0,2.96,46.0,961.0,69.75,101.0,213.0,13.0,4.0
3,7908,2330,D-penicillamine,20510,F,N,N,N,N,0.6,293.0,3.85,40.0,554.0,125.55,56.0,270.0,10.6,2.0
4,7909,1615,D-penicillamine,21904,F,N,Y,N,N,1.4,277.0,2.97,121.0,1110.0,125.0,126.0,221.0,9.8,1.0


In [27]:
test_df = test_df.drop(['id'], axis=1)

In [28]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5271 entries, 0 to 5270
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   N_Days         5271 non-null   int64  
 1   Drug           5271 non-null   object 
 2   Age            5271 non-null   int64  
 3   Sex            5271 non-null   object 
 4   Ascites        5271 non-null   object 
 5   Hepatomegaly   5271 non-null   object 
 6   Spiders        5271 non-null   object 
 7   Edema          5271 non-null   object 
 8   Bilirubin      5271 non-null   float64
 9   Cholesterol    5271 non-null   float64
 10  Albumin        5271 non-null   float64
 11  Copper         5271 non-null   float64
 12  Alk_Phos       5271 non-null   float64
 13  SGOT           5271 non-null   float64
 14  Tryglicerides  5271 non-null   float64
 15  Platelets      5271 non-null   float64
 16  Prothrombin    5271 non-null   float64
 17  Stage          5271 non-null   float64
dtypes: float

In [29]:
drugT_labels = test_df['Drug'].unique().tolist()
sexT_labels = test_df['Sex'].unique().tolist()
ascitesT_labels = test_df['Ascites'].unique().tolist()
hepatomegalyT_labels = test_df['Hepatomegaly'].unique().tolist()
spidersT_labels = test_df['Spiders'].unique().tolist()
edemaT_labels = test_df['Edema'].unique().tolist()

In [30]:
test_df["Drug"] = test_df["Drug"].apply(lambda x: drugT_labels.index(x))
test_df["Sex"] = test_df["Sex"].apply(lambda x: sexT_labels.index(x))
test_df["Ascites"] = test_df["Ascites"].apply(lambda x: ascitesT_labels.index(x))
test_df["Hepatomegaly"] = test_df["Hepatomegaly"].apply(lambda x: hepatomegalyT_labels.index(x))
test_df["Spiders"] = test_df["Spiders"].apply(lambda x: spidersT_labels.index(x))
test_df["Edema"] = test_df["Edema"].apply(lambda x: edemaT_labels.index(x))

In [31]:
test_df.head()

Unnamed: 0,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,3839,0,19724,0,0,0,0,0,1.2,546.0,3.37,65.0,1636.0,151.9,90.0,430.0,10.6,2.0
1,2468,0,14975,0,0,1,0,0,1.1,660.0,4.22,94.0,1257.0,151.9,155.0,227.0,10.0,2.0
2,51,1,13149,0,0,0,0,1,2.0,151.0,2.96,46.0,961.0,69.75,101.0,213.0,13.0,4.0
3,2330,0,20510,0,0,1,0,0,0.6,293.0,3.85,40.0,554.0,125.55,56.0,270.0,10.6,2.0
4,1615,0,21904,0,0,0,0,0,1.4,277.0,2.97,121.0,1110.0,125.0,126.0,221.0,9.8,1.0


# Test Data Prediction
This section is about prediction. Since the submission example uses probability, I used predict_proba to make the prediction.

In [32]:
testC_pred = model_C.predict_proba(test_df)[:,1]
print(testC_pred)
testCL_pred = model_CL.predict_proba(test_df)[:,1]
print(testCL_pred)
testD_pred = model_D.predict_proba(test_df)[:,1]
print(testD_pred)

[0.74479631 0.60593658 0.6095399  ... 0.7561901  0.62358466 0.57897365]
[0.0282138  0.04065909 0.04118936 ... 0.02975013 0.04145372 0.03182538]
[0.22987333 0.34462578 0.41740456 ... 0.22046478 0.34498613 0.36227679]


In [33]:
submission_df = pd.read_csv("/kaggle/input/playground-series-s3e26/sample_submission.csv")
submission_df

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.628084,0.034788,0.337128
1,7906,0.628084,0.034788,0.337128
2,7907,0.628084,0.034788,0.337128
3,7908,0.628084,0.034788,0.337128
4,7909,0.628084,0.034788,0.337128
...,...,...,...,...
5266,13171,0.628084,0.034788,0.337128
5267,13172,0.628084,0.034788,0.337128
5268,13173,0.628084,0.034788,0.337128
5269,13174,0.628084,0.034788,0.337128


In [34]:
submission = pd.DataFrame({'id': submission_df['id'],'Status_C': testC_pred, 'Status_CL': testCL_pred, 'Status_D': testD_pred})
submission

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.744796,0.028214,0.229873
1,7906,0.605937,0.040659,0.344626
2,7907,0.609540,0.041189,0.417405
3,7908,0.630226,0.031700,0.326352
4,7909,0.738574,0.032910,0.239425
...,...,...,...,...
5266,13171,0.606528,0.040096,0.342504
5267,13172,0.621444,0.032547,0.340214
5268,13173,0.756190,0.029750,0.220465
5269,13174,0.623585,0.041454,0.344986


In [35]:
submission.to_csv('/kaggle/working/submission.csv', index=False, header=True)