In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,MinMaxScaler

In [2]:
# Load the kidney-stone outcomes table. The path is relative, so the notebook
# has to be run from the directory that contains `dataset/`.
df = pd.read_csv(filepath_or_buffer='dataset/kidney_stone_data.csv')
# A bare trailing name makes the notebook render the frame as the cell output.
df

Unnamed: 0,treatment,stone_size,success
0,B,large,1
1,A,large,1
2,A,large,0
3,A,large,1
4,A,large,1
...,...,...,...
695,B,small,0
696,B,small,1
697,B,small,1
698,A,large,1


In [3]:
# Count the missing values in each column (`isna` is the method `isnull` aliases).
df.isna().sum()

treatment     0
stone_size    0
success       0
dtype: int64

In [4]:
# Feature matrix: the first two columns of `df`, selected by position
# (treatment and stone_size in the recorded run).
x = pd.DataFrame(data=df.iloc[:, :2])

In [5]:
# Target vector: the `success` column (0/1 values in the displayed rows).
y = df.loc[:, 'success']

In [6]:
# Keep 80% of the rows for training and hold the rest out for testing;
# the fixed random_state makes the split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

In [7]:
# Step 1: one-hot encode the column at position 0 of `x` (treatment) and pass
# every other column through untouched; passthrough columns are appended to the
# right of the encoded output.
# `sparse_output=False` keeps the encoder output dense. It replaces the old
# `sparse=False` spelling, which scikit-learn deprecated in 1.2 and removed in
# 1.4, so this cell requires scikit-learn >= 1.2.
# NOTE(review): with drop='first' together with handle_unknown='ignore', an
# unseen category is encoded as all zeros, i.e. the same as the dropped first
# category -- confirm that is acceptable for this data.
trf1 = ColumnTransformer(transformers=[
    ('treatment_ohe',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),[0])
],remainder='passthrough')

In [8]:
# Step 2: ordinal-encode stone size with an explicit order (small -> 0, large -> 1).
# The index [1] is a position in the OUTPUT of the previous pipeline step, not in
# the original frame.
# NOTE(review): this assumes `treatment` has exactly two levels, so that
# drop='first' leaves a single encoded column at position 0 and stone_size lands
# at position 1 -- confirm if more treatments are ever added.
trf2 = ColumnTransformer(
    transformers=[
        ('stone_size_oe', OrdinalEncoder(categories=[['small', 'large']]), [1]),
    ],
    remainder='passthrough',
)

In [9]:
# Feature selection: keep the two features with the highest chi-squared score.
# NOTE(review): the pipeline appears to carry only two features at this point,
# so k=2 would keep everything -- confirm whether this step is intended.
trf3 = SelectKBest(chi2, k=2)

In [10]:
# Final estimator: a logistic regression with default settings.
# Nothing is trained on this line; the model is only constructed here and is
# fitted later, when the pipeline that contains it is fitted.
trf4 = LogisticRegression()

In [11]:
# Chain the preprocessing steps and the classifier into a single estimator.
# Steps run in the listed order, each consuming the previous step's output.
pipe = Pipeline(steps=[
    ('trf1', trf1),  # one-hot encode treatment
    ('trf2', trf2),  # ordinal-encode stone size
    ('trf3', trf3),  # chi-squared feature selection
    ('trf4', trf4),  # logistic-regression classifier
])

In [12]:
# Show estimators as a diagram (instead of their text repr) when a cell displays them.
# `set_config` is imported in the notebook's import cell.
set_config(display='diagram')

In [13]:
# Fit every pipeline step on the training split; the fitted pipeline is the cell output.
pipe.fit(X=x_train, y=y_train)

In [14]:
# Predict labels for the held-out rows using the fitted pipeline.
y_pred = pipe.predict(X=x_test)
# Bare trailing name so the notebook displays the predicted labels.
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [15]:
# Share of held-out rows whose predicted label equals the true label.
# `accuracy_score` is imported in the notebook's import cell.
# NOTE(review): in the recorded run every prediction is 1, so this figure equals
# the proportion of positives in y_test -- compare it against a majority-class
# baseline before treating it as evidence that the model has learned anything.
accuracy_score(y_test,y_pred)

0.7571428571428571

**Cross-validation**

In [16]:
# Mean accuracy over 5 cross-validation folds of the training split.
# `cross_val_score` is imported in the notebook's import cell.
cross_val_score(pipe,x_train,y_train,cv=5,scoring='accuracy').mean()

0.8142857142857143

In [17]:
# Persist the pipeline to 'ksd.pkl' in the current working directory.
# The `with` block guarantees the file handle is flushed and closed even if
# pickling raises, instead of leaving an unclosed handle behind.
# `pickle` is imported in the notebook's import cell.
with open('ksd.pkl','wb') as model_file:
    pickle.dump(pipe,model_file)