**Import the relevant libraries**

In [1]:
import pandas as pd
import numpy as np

import statsmodels.api as sm 


**Import the two datasets**

In [4]:
# Read the procedure-level records and the alternative category lookup table.
# NOTE(review): the first path is absolute ('/data/...') while the second is
# relative ('data/...') — confirm both are intended.
practice_csv_path = '/data/OBGYN_practice_dataset.csv'
category_csv_path = 'data/OBGYN_categories_v2.csv'
raw_data1 = pd.read_csv(practice_csv_path)
OBGYN_category_v2 = pd.read_csv(category_csv_path)

**Merge the two dataframes in order to get the alternative categorization**

In [5]:
# Inner join on the shared concept id (the pandas default join type), so only
# procedures present in both tables are kept. Columns that exist in both
# inputs under the same name receive the default `_x` / `_y` suffixes.
merged = raw_data1.merge(OBGYN_category_v2, how="inner", on="procedure_concept_id")


**Inspect the dataset**

In [6]:
# Preview the first five rows of the merged frame.
merged.head()

Unnamed: 0,person_id,procedure_date,procedure_concept_id,concept_name_x,amount,total_paid,provider_id,provider_name,category according to domain expert,category according to process mining,concept_name_y
0,11248,8/3/20,2004407,Amputation of cervix,1,839,45192,Baptist Health & Medical Center,Procedure,Gynaecology,Amputation of cervix
1,11874,4/6/20,2004407,Amputation of cervix,1,839,52356,Boudica Female Care,Procedure,Gynaecology,Amputation of cervix
2,12745,12/20/19,2004407,Amputation of cervix,1,839,24023,dr. Mariano & Co.,Procedure,Gynaecology,Amputation of cervix
3,13133,7/29/19,2004407,Amputation of cervix,1,839,24023,dr. Mariano & Co.,Procedure,Gynaecology,Amputation of cervix
4,13844,8/31/20,2004407,Amputation of cervix,1,839,45192,Baptist Health & Medical Center,Procedure,Gynaecology,Amputation of cervix


**Create a pivot table**

In [7]:
# Summarise procedures per patient and process-mining category: one row per
# person_id, one column per category, each cell the summed 'amount'.
# The string alias 'sum' is used instead of the np.sum callable, which recent
# pandas releases flag with a FutureWarning when passed as an aggregator.
# Person/category pairs with no records come out as NaN.
# reset_index() turns person_id back into an ordinary column.
pivot = pd.pivot_table(
    merged,
    values='amount',
    index=['person_id'],
    columns='category according to process mining',
    aggfunc='sum',
).reset_index()

**Inspect the pivot table**

In [8]:
# Display the pivot table; NaN marks a person/category pair with no records.
pivot

category according to process mining,person_id,Consultation,Delivery,Gynaecology,Obstetrics
0,10429,4.0,,3.0,
1,10430,10.0,1.0,,8.0
2,10431,9.0,1.0,,7.0
3,10432,6.0,,,6.0
4,10433,1.0,,,
...,...,...,...,...,...
9258,19688,,1.0,,10.0
9259,19689,2.0,,2.0,
9260,19690,10.0,1.0,,8.0
9261,19691,1.0,,,


**Replace missing values (NaN) with zero**

In [9]:
# A missing count means the patient had no procedure in that category,
# so every NaN in the pivot becomes 0.
raw_data = pivot.fillna(value=0)

**Inspect the new dataset**

In [10]:
# Display the zero-filled pivot table.
raw_data

category according to process mining,person_id,Consultation,Delivery,Gynaecology,Obstetrics
0,10429,4.0,0.0,3.0,0.0
1,10430,10.0,1.0,0.0,8.0
2,10431,9.0,1.0,0.0,7.0
3,10432,6.0,0.0,0.0,6.0
4,10433,1.0,0.0,0.0,0.0
...,...,...,...,...,...
9258,19688,0.0,1.0,0.0,10.0
9259,19689,2.0,0.0,2.0,0.0
9260,19690,10.0,1.0,0.0,8.0
9261,19691,1.0,0.0,0.0,0.0


**Retrieve the names of the columns**

In [11]:
# List the column labels so they can be reordered by name in the next step.
raw_data.columns

Index(['person_id', 'Consultation', 'Delivery', 'Gynaecology', 'Obstetrics'], dtype='object', name='category according to process mining')

**Rearrange the columns to place the target variable at the end**

In [12]:
# Identifier first, predictors in the middle, target ('Delivery') last.
column_order = ['person_id', 'Consultation', 'Gynaecology', 'Obstetrics', 'Delivery']
dataset = raw_data[column_order]

**Inspect the new dataset**

In [13]:
# Display the reordered frame with the target column last.
dataset

category according to process mining,person_id,Consultation,Gynaecology,Obstetrics,Delivery
0,10429,4.0,3.0,0.0,0.0
1,10430,10.0,0.0,8.0,1.0
2,10431,9.0,0.0,7.0,1.0
3,10432,6.0,0.0,6.0,0.0
4,10433,1.0,0.0,0.0,0.0
...,...,...,...,...,...
9258,19688,0.0,0.0,10.0,1.0
9259,19689,2.0,2.0,0.0,0.0
9260,19690,10.0,0.0,8.0,1.0
9261,19691,1.0,0.0,0.0,0.0


**Check whether the target is binary**

In [14]:
# Frequency of each distinct delivery count per patient.
dataset['Delivery'].value_counts()

0.0    8110
1.0    1143
2.0      10
Name: Delivery, dtype: int64

**Convert the target to binary values**

In [15]:
# Work on a copy so `dataset` keeps the original delivery counts.
Data_with_binary_targets = dataset.copy()

# Collapse the delivery count into a 0/1 flag: 1 if the patient had at least
# one delivery, otherwise 0.
Binary = np.where(Data_with_binary_targets['Delivery'] > 0, 1, 0)
Data_with_binary_targets['Delivery_binary'] = Binary

Data_with_binary_targets

category according to process mining,person_id,Consultation,Gynaecology,Obstetrics,Delivery,Delivery_binary
0,10429,4.0,3.0,0.0,0.0,0
1,10430,10.0,0.0,8.0,1.0,1
2,10431,9.0,0.0,7.0,1.0,1
3,10432,6.0,0.0,6.0,0.0,0
4,10433,1.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...
9258,19688,0.0,0.0,10.0,1.0,1
9259,19689,2.0,2.0,0.0,0.0,0
9260,19690,10.0,0.0,8.0,1.0,1
9261,19691,1.0,0.0,0.0,0.0,0


**Confirm the binary values by counting the values in the target column**

In [16]:
# Only the values 0 and 1 should remain after binarisation.
Data_with_binary_targets['Delivery_binary'].value_counts()

0    8110
1    1153
Name: Delivery_binary, dtype: int64

**Remove the non-binary 'Delivery' column**

In [17]:
# Drop the raw delivery count; 'Delivery_binary' is the modelling target.
Dataframe = Data_with_binary_targets.drop('Delivery', axis='columns')

**Assign the predictive variables (x1) and the target variable (y)**

In [18]:
# Predictors: every column between the leading identifier (position 0) and
# the trailing target column.
x1 = Dataframe.iloc[:, 1:-1]

# Target: the last column of the frame, looked up by its label.
target_label = Dataframe.columns[-1]
y = Dataframe[target_label]


**Confirm x1**

In [19]:
# Preview the predictor columns (identifier and target excluded).
x1.head()

category according to process mining,Consultation,Gynaecology,Obstetrics
0,4.0,3.0,0.0
1,10.0,0.0,8.0
2,9.0,0.0,7.0
3,6.0,0.0,6.0
4,1.0,0.0,0.0


**Confirm y**

In [20]:
# Preview the binary target series.
y.head()

0    0
1    1
2    1
3    0
4    0
Name: Delivery_binary, dtype: int64

In [21]:
# statsmodels does not add an intercept on its own: prepend a constant
# column so the model estimates the 'const' term.
x = sm.add_constant(x1, prepend=True)

**Apply logistic regression to the data**

In [22]:
# Binary logit model: y is the dependent variable, x holds the intercept
# plus the three procedure-count predictors.
reg_log = sm.Logit(endog=y, exog=x)

# Fit by maximum likelihood; the optimiser prints its convergence report.
results_log = reg_log.fit()

Optimization terminated successfully.
         Current function value: 0.166984
         Iterations 8


**Inspect the summary of the logistic regression**

In [23]:
# Show the fit statistics and the coefficient table of the fitted model.
results_log.summary()

0,1,2,3
Dep. Variable:,Delivery_binary,No. Observations:,9263.0
Model:,Logit,Df Residuals:,9259.0
Method:,MLE,Df Model:,3.0
Date:,"Mon, 16 Oct 2023",Pseudo R-squ.:,0.5556
Time:,11:54:30,Log-Likelihood:,-1546.8
converged:,True,LL-Null:,-3480.5
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.2369,0.081,-39.811,0.000,-3.396,-3.078
Consultation,0.0278,0.017,1.614,0.107,-0.006,0.061
Gynaecology,-0.7065,0.067,-10.552,0.000,-0.838,-0.575
Obstetrics,0.6712,0.021,31.271,0.000,0.629,0.713


**Evaluate the results with a confusion matrix**

In [24]:
# pred_table() cross-tabulates observed classes (rows) against predicted
# classes (columns) on the data the model was fitted to.
cm = results_log.pred_table()

# Wrap the raw array in a labelled frame for display.
cm_df = pd.DataFrame(
    cm,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,7909.0,201.0
Actual 1,398.0,755.0


**Calculate the in-sample accuracy of the model (evaluated on the same data used for fitting)**

In [25]:
# Accuracy = correctly classified cases (the diagonal of the confusion
# matrix) divided by all cases. This is measured on the training data.
np.trace(cm) / cm.sum()

0.9353341250134946