In [10]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [46]:
# Load the kidney dataset from the project-relative Datasets folder and
# preview the first five rows.
kidney_csv_path = './Datasets/kidney.csv'
kidney_data = pd.read_csv(kidney_csv_path)
kidney_data.head()

Unnamed: 0,gravity,ph,osmo,cond,urea,calc,target
0,1.021,4.91,725,14.0,443,2.45,0
1,1.017,5.74,577,20.0,296,4.49,0
2,1.008,7.2,321,14.9,101,2.36,0
3,1.011,5.51,408,12.6,224,2.15,0
4,1.005,6.52,187,7.5,91,1.16,0


In [44]:
# Preview the last five rows (the tail of the file holds target == 1 rows).
kidney_data.tail(n=5)

Unnamed: 0,gravity,ph,osmo,cond,urea,calc,target
74,1.025,7.9,721,23.6,301,9.04,1
75,1.017,4.81,410,13.3,195,0.58,1
76,1.024,5.4,803,21.8,394,7.82,1
77,1.016,6.81,594,21.4,255,12.2,1
78,1.015,6.03,416,12.8,178,9.39,1


In [16]:
# Dimensions of the loaded frame as (rows, columns).
kidney_data.shape

(79, 7)

In [18]:
# Column names, non-null counts and dtypes of the loaded frame.
kidney_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   gravity  79 non-null     float64
 1   ph       79 non-null     float64
 2   osmo     79 non-null     int64  
 3   cond     79 non-null     float64
 4   urea     79 non-null     int64  
 5   calc     79 non-null     float64
 6   target   79 non-null     int64  
dtypes: float64(4), int64(3)
memory usage: 4.4 KB


In [20]:
# Count missing values per column.
kidney_data.isna().sum()

gravity    0
ph         0
osmo       0
cond       0
urea       0
calc       0
target     0
dtype: int64

In [22]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
kidney_data.describe()

Unnamed: 0,gravity,ph,osmo,cond,urea,calc,target
count,79.0,79.0,79.0,79.0,79.0,79.0,79.0
mean,1.018114,6.028481,612.848101,20.813924,266.405063,4.138987,0.43038
std,0.007239,0.724307,237.514755,7.938994,131.25455,3.260051,0.498293
min,1.005,4.76,187.0,5.1,10.0,0.17,0.0
25%,1.012,5.53,413.0,14.15,160.0,1.46,0.0
50%,1.018,5.94,594.0,21.4,260.0,3.16,0.0
75%,1.0235,6.385,792.0,26.55,372.0,5.93,1.0
max,1.04,7.94,1236.0,38.0,620.0,14.34,1.0


In [26]:
# Class balance of the label column: number of rows per target value.
kidney_data.target.value_counts()

target
0    45
1    34
Name: count, dtype: int64

In [28]:
# Separate the feature columns (everything except 'target') from the label column.
X = kidney_data.drop(columns=['target'])
Y = kidney_data['target']

In [30]:
# Display the feature frame (all columns except 'target').
X

Unnamed: 0,gravity,ph,osmo,cond,urea,calc
0,1.021,4.91,725,14.0,443,2.45
1,1.017,5.74,577,20.0,296,4.49
2,1.008,7.20,321,14.9,101,2.36
3,1.011,5.51,408,12.6,224,2.15
4,1.005,6.52,187,7.5,91,1.16
...,...,...,...,...,...,...
74,1.025,7.90,721,23.6,301,9.04
75,1.017,4.81,410,13.3,195,0.58
76,1.024,5.40,803,21.8,394,7.82
77,1.016,6.81,594,21.4,255,12.20


In [32]:
# Column names, non-null counts and dtypes of the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   gravity  79 non-null     float64
 1   ph       79 non-null     float64
 2   osmo     79 non-null     int64  
 3   cond     79 non-null     float64
 4   urea     79 non-null     int64  
 5   calc     79 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 3.8 KB


In [34]:
# Display the label series taken from the 'target' column.
Y

0     0
1     0
2     0
3     0
4     0
     ..
74    1
75    1
76    1
77    1
78    1
Name: target, Length: 79, dtype: int64

In [36]:
# Standardize every feature column to zero mean and unit variance.
#
# The scaler is fit from `kidney_data` rather than from `X`: this cell rebinds
# `X` to the scaled array, so fitting from `X` would make a second run of the
# cell fit on already-standardized values and silently break `scaler` for any
# later use on raw measurements. Fitting from the source frame keeps the cell
# idempotent.
feature_frame = kidney_data.drop(columns='target')

scaler = StandardScaler()
standardized_data = scaler.fit_transform(feature_frame)

# NOTE(review): the scaler is fit on all rows before the train/test split, so
# the held-out rows contribute to the mean/std used for training (data
# leakage). Fitting on the training rows only would require restructuring the
# split cell as well.
X = standardized_data
Y = kidney_data['target']
Y

0     0
1     0
2     0
3     0
4     0
     ..
74    1
75    1
76    1
77    1
78    1
Name: target, Length: 79, dtype: int64

In [38]:
# Hold out 20% of the rows for testing, stratified on the label so both
# partitions keep the class ratio; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)
# Shapes of the full, training and test feature arrays.
print(X.shape, X_train.shape, X_test.shape)

(79, 6) (63, 6) (16, 6)


In [40]:
# Build the logistic-regression classifier (up to 2000 solver iterations) and
# train it on the training partition; `fit` returns the estimator itself.
model = LogisticRegression(max_iter=2000).fit(X_train, Y_train)
model


In [42]:
# Evaluate the fitted model on both partitions.
# accuracy_score takes (y_true, y_pred) in that order. Accuracy happens to be
# symmetric, but keeping the documented order avoids wrong results if this
# metric is later replaced by an asymmetric one (precision, recall, ...).

# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy on Training data : ', training_data_accuracy)

# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Training data :  0.7936507936507936
Accuracy on Test data :  0.6875


In [48]:
# Predict for one hand-entered sample, given in the feature order
# (gravity, ph, osmo, cond, urea, calc).
# NOTE(review): gravity=2.0 is far outside the range seen in the dataset
# (describe() reports 1.005 to 1.04) - confirm this sample value is intended.
input_data = (2.0,5.7,470,12,354,5)

# change the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)
print(input_data_as_numpy_array.dtype)

# reshape the numpy array as we are predicting for only one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)
print(input_data_reshaped.dtype)

# The model was trained on standardized features, so the raw measurements must
# go through the same fitted scaler before prediction; feeding raw values
# directly makes the prediction meaningless.
# Wrapping the row in a DataFrame carrying the scaler's own column names (when
# it recorded any) avoids scikit-learn's "X does not have valid feature names"
# warning.
input_frame = pd.DataFrame(
    input_data_reshaped,
    columns=getattr(scaler, 'feature_names_in_', None),
)
input_data_standardized = scaler.transform(input_frame)

prediction = model.predict(input_data_standardized)
print(prediction)

if prediction[0] == 0:
    print('You do not have kidney stone.')
else:
    print('You have kidney stone(s). We recommend you visiting a Doctor soon.')

float64
float64
[0]
You do not have kidney stone.


In [52]:
import pickle

# Persist the trained classifier to disk. The `with` blocks make sure each file
# handle is flushed and closed instead of relying on garbage collection.
filename = 'kidney_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# loading the saved model
# pickle.load can execute arbitrary code: only load files this notebook wrote.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# NOTE(review): the model was trained on StandardScaler output, but `scaler` is
# not saved here. Any consumer of kidney_model.sav needs the same fitted scaler
# to preprocess raw measurements - consider persisting it alongside the model.

In [54]:
# Record the scikit-learn version in use; a pickled estimator should be loaded
# with the same version it was saved with.
print(sklearn.__version__)

1.5.1
