# Binary Classification with a Tabular Kidney Stone Prediction Dataset

In [None]:
!pip install opendatasets

In [None]:
import opendatasets as od

dataset_name = "competitions/playground-series-s3e12"
od.download_kaggle_dataset(dataset_name, 'data')

In [None]:
import pandas as pd

df = pd.read_csv("data/playground-series-s3e12/train.csv", index_col=0)
df.head()

Unnamed: 0_level_0,gravity,ph,osmo,cond,urea,calc,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.013,6.19,443,14.8,124,1.45,0
1,1.025,5.4,703,23.6,394,4.18,0
2,1.009,6.13,371,24.5,159,9.04,0
3,1.021,4.91,442,20.8,398,6.63,1
4,1.021,5.53,874,17.8,385,2.21,1


In [None]:
X = df.iloc[:, 0:6]
y = df.iloc[:, 6]

In [None]:
X.head()

Unnamed: 0_level_0,gravity,ph,osmo,cond,urea,calc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.013,6.19,443,14.8,124,1.45
1,1.025,5.4,703,23.6,394,4.18
2,1.009,6.13,371,24.5,159,9.04
3,1.021,4.91,442,20.8,398,6.63
4,1.021,5.53,874,17.8,385,2.21


In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(X)
X

array([[0.22857143, 0.44968553, 0.24404194, 0.29483283, 0.18688525,
        0.09033169],
       [0.57142857, 0.20125786, 0.49189704, 0.56231003, 0.6295082 ,
        0.28299224],
       [0.11428571, 0.43081761, 0.17540515, 0.58966565, 0.2442623 ,
        0.62597036],
       ...,
       [0.37142857, 0.47798742, 0.25548141, 0.51975684, 0.42622951,
        0.52999294],
       [0.08571429, 0.74213836, 0.13155386, 0.22796353, 0.10655738,
        0.0606916 ],
       [0.17142857, 0.43081761, 0.16873213, 0.14589666, 0.2442623 ,
        0.00705716]])

In [None]:
y.head()

id
0    0
1    0
2    0
3    1
4    1
Name: target, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.25, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression

lreg = LogisticRegression(random_state=42).fit(train_X, train_y)

In [None]:
y_pred = lreg.predict(test_X)
y_pred

array([1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0])

In [44]:
from sklearn.metrics import accuracy_score, confusion_matrix

print(f'Accuracy: {accuracy_score(test_y, y_pred)}')
confusion = confusion_matrix(test_y, y_pred)
true_negative = confusion[0][0]
false_negative = confusion[0][1]
true_positive = confusion[1][0]
false_positive = confusion[1][1]
print("Confusion: ")
print(f'True Negative: {true_negative}')
print(f'False Negative: {false_negative}')
print(f'True Positive: {true_positive}')
print(f'False Positive: {false_positive}')

Accuracy: 0.7788461538461539
Confusion: 
True Negative: 47
False Negative: 6
True Positive: 17
False Positive: 34


In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(train_X, train_y)

In [None]:
knn_y_pred = knn.predict(test_X)
knn_y_pred

array([1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0])

In [43]:
print(f'Accuracy: {accuracy_score(test_y, knn_y_pred)}')
confusion = confusion_matrix(test_y, knn_y_pred)
true_negative = confusion[0][0]
false_negative = confusion[0][1]
true_positive = confusion[1][0]
false_positive = confusion[1][1]
print("Confusion: ")
print(f'True Negative: {true_negative}')
print(f'False Negative: {false_negative}')
print(f'True Positive: {true_positive}')
print(f'False Positive: {false_positive}')

Accuracy: 0.7307692307692307
Confusion: 
True Negative: 40
False Negative: 13
True Positive: 15
False Positive: 36


In [None]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(train_X, train_y)

In [None]:
dtc_y_pred = dtc.predict(test_X)
dtc_y_pred

array([1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1])

In [41]:
print(f'Accuracy: {accuracy_score(test_y, dtc_y_pred)}')
confusion = confusion_matrix(test_y, dtc_y_pred)
true_negative = confusion[0][0]
false_negative = confusion[0][1]
true_positive = confusion[1][0]
false_positive = confusion[1][1]
print("Confusion: ")
print(f'True Negative: {true_negative}')
print(f'False Negative: {false_negative}')
print(f'True Positive: {true_positive}')
print(f'False Positive: {false_positive}')

Accuracy: 0.6730769230769231
Confusion: 
True Negative: 36
False Negative: 17
True Positive: 17
False Positive: 34
