In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

Data Preparation

In [2]:
df=pd.read_csv("resources/medicalmalpractice.csv")
df.head()

Unnamed: 0,Amount,Severity,Age,Private Attorney,Marital Status,Specialty,Insurance,Gender
0,57041,7,62,1,2,Family Practice,Private,Male
1,324976,6,38,1,2,OBGYN,No Insurance,Female
2,135383,4,34,1,2,Cardiology,Unknown,Male
3,829742,7,42,1,1,Pediatrics,No Insurance,Female
4,197675,3,60,0,2,OBGYN,Medicare/Medicaid,Female


In [3]:
df=df.rename(columns={"Amount":"amount",
                   "Severity":"severity",
                   "Age":"age",
                   "Private Attorney":"private_attorney",
                   "Marital Status":"marital_status",
                   "Specialty":"specialty",
                   "Insurance":"insurance",
                   "Gender":"gender"})
df.head()

Unnamed: 0,amount,severity,age,private_attorney,marital_status,specialty,insurance,gender
0,57041,7,62,1,2,Family Practice,Private,Male
1,324976,6,38,1,2,OBGYN,No Insurance,Female
2,135383,4,34,1,2,Cardiology,Unknown,Male
3,829742,7,42,1,1,Pediatrics,No Insurance,Female
4,197675,3,60,0,2,OBGYN,Medicare/Medicaid,Female


In [4]:
df.dtypes

amount               int64
severity             int64
age                  int64
private_attorney     int64
marital_status       int64
specialty           object
insurance           object
gender              object
dtype: object

In [5]:
df=df.sort_values(by="amount").reset_index(drop=True)
df.head()

Unnamed: 0,amount,severity,age,private_attorney,marital_status,specialty,insurance,gender
0,1576,3,75,0,4,Anesthesiology,Unknown,Male
1,1578,3,52,0,2,Internal Medicine,Private,Female
2,1589,3,32,0,2,Anesthesiology,Medicare/Medicaid,Male
3,1589,3,49,0,2,OBGYN,Unknown,Female
4,1612,3,44,0,1,Ophthamology,Unknown,Male


In [6]:
severity_bins=[0,4,8,10]
severity_labels=["mild","moderate","severe"]

In [7]:
df['severity_groups'] = pd.cut(df["severity"], bins=severity_bins, labels=severity_labels)
df.head()

Unnamed: 0,amount,severity,age,private_attorney,marital_status,specialty,insurance,gender,severity_groups
0,1576,3,75,0,4,Anesthesiology,Unknown,Male,mild
1,1578,3,52,0,2,Internal Medicine,Private,Female,mild
2,1589,3,32,0,2,Anesthesiology,Medicare/Medicaid,Male,mild
3,1589,3,49,0,2,OBGYN,Unknown,Female,mild
4,1612,3,44,0,1,Ophthamology,Unknown,Male,mild


In [8]:
df["amount"].describe()

count     79210.000000
mean     157484.554816
std      193135.093293
min        1576.000000
25%       43670.250000
50%       98131.000000
75%      154675.250000
max      926411.000000
Name: amount, dtype: float64

In [9]:
payout_bins=[0,43670,98131,154675,926412]
payout_labels=["low","low-mid","mid-high","high"]

In [10]:
df["payout_ranges"]=pd.cut(df["amount"],bins=payout_bins,labels=payout_labels)
df.head()

Unnamed: 0,amount,severity,age,private_attorney,marital_status,specialty,insurance,gender,severity_groups,payout_ranges
0,1576,3,75,0,4,Anesthesiology,Unknown,Male,mild,low
1,1578,3,52,0,2,Internal Medicine,Private,Female,mild,low
2,1589,3,32,0,2,Anesthesiology,Medicare/Medicaid,Male,mild,low
3,1589,3,49,0,2,OBGYN,Unknown,Female,mild,low
4,1612,3,44,0,1,Ophthamology,Unknown,Male,mild,low


In [11]:
df['marital_status'].replace(0, 'Divorced', inplace=True)
df['marital_status'].replace(1, 'Single', inplace=True)
df['marital_status'].replace(2, 'Married', inplace=True)
df['marital_status'].replace(3, 'Widowed', inplace=True)
df['marital_status'].replace(4, 'Unknown', inplace=True)
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['marital_status'].replace(0, 'Divorced', inplace=True)


Unnamed: 0,amount,severity,age,private_attorney,marital_status,specialty,insurance,gender,severity_groups,payout_ranges
0,1576,3,75,0,Unknown,Anesthesiology,Unknown,Male,mild,low
1,1578,3,52,0,Married,Internal Medicine,Private,Female,mild,low
2,1589,3,32,0,Married,Anesthesiology,Medicare/Medicaid,Male,mild,low
3,1589,3,49,0,Married,OBGYN,Unknown,Female,mild,low
4,1612,3,44,0,Single,Ophthamology,Unknown,Male,mild,low


In [12]:
df["age"].describe()

count    79210.000000
mean        42.701868
std         19.811063
min          0.000000
25%         28.000000
50%         43.000000
75%         58.000000
max         87.000000
Name: age, dtype: float64

In [13]:
age_bins = [0,19, 40, 61, 90]
age_labels = ["child","young adult", "adult", "senior"]

In [14]:
df["age_groups"]=pd.cut(df["age"],bins=age_bins,labels=age_labels)
df.head()

Unnamed: 0,amount,severity,age,private_attorney,marital_status,specialty,insurance,gender,severity_groups,payout_ranges,age_groups
0,1576,3,75,0,Unknown,Anesthesiology,Unknown,Male,mild,low,senior
1,1578,3,52,0,Married,Internal Medicine,Private,Female,mild,low,adult
2,1589,3,32,0,Married,Anesthesiology,Medicare/Medicaid,Male,mild,low,young adult
3,1589,3,49,0,Married,OBGYN,Unknown,Female,mild,low,adult
4,1612,3,44,0,Single,Ophthamology,Unknown,Male,mild,low,adult


In [15]:
df["specialty"].value_counts()

specialty
Family Practice           11436
General Surgery            9412
OBGYN                      8876
Anesthesiology             8732
Orthopedic Surgery         7272
Internal Medicine          5223
Neurology/Neurosurgery     4737
Emergency Medicine         4676
Ophthamology               3289
Cardiology                 2659
Urological Surgery         2027
Resident                   1983
Radiology                  1979
Pediatrics                 1416
Dermatology                1384
Plastic Surgeon            1364
Occupational Medicine       725
Pathology                   714
Thoracic Surgery            664
Physical Medicine           642
Name: count, dtype: int64

In [16]:
#group speicalties for machine learning
specialty_groups = {
    #Lower-cost, high-volume specialties focused on broad patient management
    "Primary Care": [
        "Family Practice", "Internal Medicine", "Pediatrics", "Occupational Medicine"], 
    #High resource use, invasive procedures, often inpatient or OR-based
    "Surgical": [
        "General Surgery","Orthopedic Surgery","OBGYN", "Urological Surgery","Thoracic Surgery", "Plastic Surgeon"],
   #High equipment usage, diagnostic focus (often cost-intensive but indirect patient care)
    "Diagnostic": [
        "Radiology", "Ophthamology", "Pathology"
    ],
    #High skill/tech requirement, often procedure-based, moderate-to-high cost
    "Specialty Care: Invasive": [
        "Anesthesiology","Neurology/Neurosurgery","Cardiology"
    ],
    #Lower procedural intensity, consultative roles, often outpatient
    "Specialty Care: Noninvasive" : [
        "Dermatology","Physical Medicine"
        ],
        #High variability in cost, urgent/intensive care setting, broad scope of needs
    "Other": ["Emergency Medicine", "Resident"]
}

In [17]:
specialty_map = {
    spec: group for group, specs in specialty_groups.items() for spec in specs
}

In [18]:
df["specialty_group"] = df["specialty"].map(specialty_map)
df.head(10)

Unnamed: 0,amount,severity,age,private_attorney,marital_status,specialty,insurance,gender,severity_groups,payout_ranges,age_groups,specialty_group
0,1576,3,75,0,Unknown,Anesthesiology,Unknown,Male,mild,low,senior,Specialty Care: Invasive
1,1578,3,52,0,Married,Internal Medicine,Private,Female,mild,low,adult,Primary Care
2,1589,3,32,0,Married,Anesthesiology,Medicare/Medicaid,Male,mild,low,young adult,Specialty Care: Invasive
3,1589,3,49,0,Married,OBGYN,Unknown,Female,mild,low,adult,Surgical
4,1612,3,44,0,Single,Ophthamology,Unknown,Male,mild,low,adult,Diagnostic
5,1614,3,43,0,Married,Anesthesiology,No Insurance,Male,mild,low,adult,Specialty Care: Invasive
6,1614,3,45,0,Unknown,Anesthesiology,Unknown,Male,mild,low,adult,Specialty Care: Invasive
7,1619,3,59,0,Married,Anesthesiology,Unknown,Male,mild,low,adult,Specialty Care: Invasive
8,1623,4,63,0,Unknown,Emergency Medicine,No Insurance,Female,mild,low,senior,Other
9,1623,3,48,0,Unknown,Pathology,Private,Female,mild,low,adult,Diagnostic


In [19]:
df["private_attorney_text"]=df["private_attorney"].map({0:"no",1:"yes"})
df.head()

Unnamed: 0,amount,severity,age,private_attorney,marital_status,specialty,insurance,gender,severity_groups,payout_ranges,age_groups,specialty_group,private_attorney_text
0,1576,3,75,0,Unknown,Anesthesiology,Unknown,Male,mild,low,senior,Specialty Care: Invasive,no
1,1578,3,52,0,Married,Internal Medicine,Private,Female,mild,low,adult,Primary Care,no
2,1589,3,32,0,Married,Anesthesiology,Medicare/Medicaid,Male,mild,low,young adult,Specialty Care: Invasive,no
3,1589,3,49,0,Married,OBGYN,Unknown,Female,mild,low,adult,Surgical,no
4,1612,3,44,0,Single,Ophthamology,Unknown,Male,mild,low,adult,Diagnostic,no


In [20]:
#save and export this df for visualizations
df_visuals=df.copy()
df_visuals.to_csv("resources/visuals_df.csv", index=False)

In [21]:
df=df.drop(columns={"amount","age","severity","age","specialty","private_attorney_text"})
df.head()

Unnamed: 0,private_attorney,marital_status,insurance,gender,severity_groups,payout_ranges,age_groups,specialty_group
0,0,Unknown,Unknown,Male,mild,low,senior,Specialty Care: Invasive
1,0,Married,Private,Female,mild,low,adult,Primary Care
2,0,Married,Medicare/Medicaid,Male,mild,low,young adult,Specialty Care: Invasive
3,0,Married,Unknown,Female,mild,low,adult,Surgical
4,0,Single,Unknown,Male,mild,low,adult,Diagnostic


In [22]:
#ordinal coding for severity and payout ranges
severity = [["mild","moderate","severe"]]
severity_encoder = OrdinalEncoder(categories=severity)
df["severity_level"] = severity_encoder.fit_transform(df[["severity_groups"]])
df.head()

Unnamed: 0,private_attorney,marital_status,insurance,gender,severity_groups,payout_ranges,age_groups,specialty_group,severity_level
0,0,Unknown,Unknown,Male,mild,low,senior,Specialty Care: Invasive,0.0
1,0,Married,Private,Female,mild,low,adult,Primary Care,0.0
2,0,Married,Medicare/Medicaid,Male,mild,low,young adult,Specialty Care: Invasive,0.0
3,0,Married,Unknown,Female,mild,low,adult,Surgical,0.0
4,0,Single,Unknown,Male,mild,low,adult,Diagnostic,0.0


In [23]:
payout = [["low","low-mid","mid-high","high"]]
payout_encoder = OrdinalEncoder(categories=payout)
df["payout_level"] = payout_encoder.fit_transform(df[["payout_ranges"]])
df.head()

Unnamed: 0,private_attorney,marital_status,insurance,gender,severity_groups,payout_ranges,age_groups,specialty_group,severity_level,payout_level
0,0,Unknown,Unknown,Male,mild,low,senior,Specialty Care: Invasive,0.0,0.0
1,0,Married,Private,Female,mild,low,adult,Primary Care,0.0,0.0
2,0,Married,Medicare/Medicaid,Male,mild,low,young adult,Specialty Care: Invasive,0.0,0.0
3,0,Married,Unknown,Female,mild,low,adult,Surgical,0.0,0.0
4,0,Single,Unknown,Male,mild,low,adult,Diagnostic,0.0,0.0


In [24]:
#non ordinal coding for remainder
df=pd.get_dummies(df, columns=["marital_status","insurance","gender","age_groups","specialty_group"],dtype=int)
df.head()

Unnamed: 0,private_attorney,severity_groups,payout_ranges,severity_level,payout_level,marital_status_Divorced,marital_status_Married,marital_status_Single,marital_status_Unknown,marital_status_Widowed,...,age_groups_child,age_groups_young adult,age_groups_adult,age_groups_senior,specialty_group_Diagnostic,specialty_group_Other,specialty_group_Primary Care,specialty_group_Specialty Care: Invasive,specialty_group_Specialty Care: Noninvasive,specialty_group_Surgical
0,0,mild,low,0.0,0.0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
1,0,mild,low,0.0,0.0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,0,mild,low,0.0,0.0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,0,mild,low,0.0,0.0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,0,mild,low,0.0,0.0,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0


In [25]:
df=df.drop(columns=["severity_groups","payout_ranges"])
df.head()

Unnamed: 0,private_attorney,severity_level,payout_level,marital_status_Divorced,marital_status_Married,marital_status_Single,marital_status_Unknown,marital_status_Widowed,insurance_Medicare/Medicaid,insurance_No Insurance,...,age_groups_child,age_groups_young adult,age_groups_adult,age_groups_senior,specialty_group_Diagnostic,specialty_group_Other,specialty_group_Primary Care,specialty_group_Specialty Care: Invasive,specialty_group_Specialty Care: Noninvasive,specialty_group_Surgical
0,0,0.0,0.0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,0,0.0,0.0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,0,0.0,0.0,0,1,0,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
3,0,0.0,0.0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,0,0.0,0.0,0,0,1,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


In [26]:
df_model=df.copy()
df_model.to_csv("resources/model_df.csv", index=False)