In [1]:
import random
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report

In [2]:
def get_X_y(indigo_spl_npi, outliers_path, spl_lis, npi_upper_features):
    """Build a binary-labelled feature matrix, leaving out outlier NPIs.

    Args:
        indigo_spl_npi: dict mapping a speciality label to the NPIs under it.
        outliers_path: prefix that is concatenated directly with
            ``"outlier_npis" + speciality + ".pkl"``, so it must end with a
            path separator.
        spl_lis: speciality labels that form the positive class (label 1);
            every other speciality is labelled 0.
        npi_upper_features: mapping from NPI to its feature vector.

    Returns:
        ``(X, y, npi_lis_spl)`` where ``X`` and ``y`` are numpy arrays of
        features and 0/1 labels, and ``npi_lis_spl`` lists the NPIs that were
        labelled 1, in the same relative order as their rows in ``X``.

    Raises:
        Whatever ``pd.read_pickle`` raises when a speciality's outlier file is
        missing, and whatever ``npi_upper_features[npi]`` raises for an NPI
        without features.
    """
    X = []
    y = []
    npi_lis_spl = []
    # Iterate in the mapping's own (insertion) order. Going through a set of
    # string keys made the row order depend on per-process hash randomisation,
    # so the seeded train/test split downstream selected different rows on
    # every kernel restart.
    for spl in indigo_spl_npi:
        label = 1 if spl in spl_lis else 0
        outlier_cur = pd.read_pickle(outliers_path + "outlier_npis" + spl + ".pkl")
        # NOTE(review): membership uses the pickled object's own `in`
        # semantics; presumably a set/list of NPIs - confirm.
        for npi in indigo_spl_npi[spl]:
            if npi in outlier_cur:
                continue
            X.append(npi_upper_features[npi])
            y.append(label)
            if label == 1:
                npi_lis_spl.append(npi)
    return np.array(X), np.array(y), npi_lis_spl

In [3]:
def get_indigo_spl_npi(npi_ccspcs_features_dict, npi_indigo_spl):
    """Group the NPIs that have features by their speciality.

    Every key of ``npi_ccspcs_features_dict`` is cast to ``int`` for the
    lookup in ``npi_indigo_spl``; keys without a speciality are dropped.
    The returned lists hold the NPIs in their original (un-cast) form.

    Returns:
        dict mapping speciality -> list of NPIs, in first-seen order.
    """
    grouped = {}
    for npi in npi_ccspcs_features_dict:
        lookup_key = int(npi)
        if lookup_key not in npi_indigo_spl:
            continue
        grouped.setdefault(npi_indigo_spl[lookup_key], []).append(npi)
    return grouped

In [4]:
def get_npi_ccspcs_dict():
    """Load ``./ccspcs_npi_features.pkl`` and index its non-empty rows.

    The pickled table is converted to a numpy array; column 0 of each row is
    used as the key and the remaining columns as the feature vector. Rows
    whose feature columns do not sum to more than 0 are skipped.

    Returns:
        dict mapping the first-column value to the row's remaining values.
    """
    feature_rows = np.array(pd.read_pickle('./ccspcs_npi_features.pkl'))
    return {row[0]: row[1:] for row in feature_rows if sum(row[1:]) > 0}

In [5]:
def get_results(y_train, y_test, X_train, X_test, random_search, spl):
    """Score the best estimator of a fitted search on both splits.

    Predictions are the arg-max of ``predict_proba``. The classification
    reports are returned as dicts (nothing but the section banners is printed
    for them), with class 0 named ``'Others'`` and class 1 named ``spl``.

    Returns:
        ``(train_out, test_out, count_spl_others, count_others_spl)`` where the
        two counts are, on the test split, the number of class-1 rows predicted
        as 0 and the number of class-0 rows predicted as 1.
    """
    class_names = ['Others', spl]

    print("X_train results,")
    train_pred = np.argmax(random_search.best_estimator_.predict_proba(X_train), axis=1)
    train_out = classification_report(y_train, train_pred, target_names=class_names, output_dict=True)

    print("X_test results,")
    test_pred = np.argmax(random_search.best_estimator_.predict_proba(X_test), axis=1)
    test_out = classification_report(y_test, test_pred, target_names=class_names, output_dict=True)

    # Tally both kinds of test-set mistakes in a single pass.
    count_spl_others = 0
    count_others_spl = 0
    for idx in range(len(y_test)):
        actual = y_test[idx]
        guessed = test_pred[idx]
        if actual == 1 and guessed == 0:
            count_spl_others += 1
        elif actual == 0 and guessed == 1:
            count_others_spl += 1

    print("number of spl predicted as others: ", count_spl_others)
    print("num of others predicted as spl: ", count_others_spl)

    return train_out, test_out, count_spl_others, count_others_spl

In [6]:
def overpopulate(X, y, op_multiplier, op_ratio_retained):
    """Replicate class-0 samples and randomly thin out all other samples.

    Each sample labelled 0 is emitted ``op_multiplier`` times. Every other
    sample is kept once when a fresh ``random.uniform(0, 1)`` draw is below
    ``op_ratio_retained`` and dropped otherwise.

    Returns:
        ``(X_oped, y_oped)`` as plain Python lists, in input order.
    """
    X_oped, y_oped = [], []

    for i, label in enumerate(y):
        if label == 0:
            copies = op_multiplier
        elif random.uniform(0, 1) < op_ratio_retained:
            copies = 1
        else:
            continue
        for _ in range(copies):
            X_oped.append(X[i])
            y_oped.append(label)

    return X_oped, y_oped

In [7]:
def preprocessing_1(X):
    """Return a new list in which every row of ``X`` is divided by its sum.

    The input is left untouched; each output row is a numpy array. There is
    no guard for a row whose sum is 0.
    """
    return [np.array(X[idx]) / sum(X[idx]) for idx in range(len(X))]

In [8]:
def normalize(X):
    """Divide every row of ``X`` by its own sum, in place, and return ``X``.

    The rows are written back into ``X`` itself, so the caller's container is
    modified. There is no guard for a row whose sum is 0.
    """
    for idx, row in enumerate(X):
        X[idx] = row / sum(row)
    return X

In [9]:
def get_distributions(X, y):
    """Sum the feature rows per class and normalise each total to sum to 1.

    One float accumulator is created per distinct value in ``y`` and indexed
    directly by the label, so labels are expected to be 0..n_classes-1.
    Before normalising, the class-0 total is added into the class-1 total.
    NOTE(review): that makes entry 1 the combined total of classes 0 and 1
    rather than class 1 alone - confirm this is intended.

    Returns:
        list of normalised numpy arrays, one per class.
    """
    n_classes = len(set(y))
    distributions = [np.zeros(len(X[0])) for _ in range(n_classes)]

    for i in range(len(X)):
        # Progress marker every 10000 rows.
        if i % 10000 == 0:
            print("Done: " + str(i))
        distributions[y[i]] += X[i]

    distributions[1] += distributions[0]
    return normalize(distributions)


In [10]:
def get_top_k_cpts(normalized_distributions, k):
    """Compute an IDF-style weight for every position ranked in a top-k.

    For each distribution the ``k`` largest positions are taken (largest
    first). A position's weight is ``log(N / hits)``, where ``N`` is the
    number of distributions and ``hits`` is how many of them rank that
    position in their top-k.

    Returns:
        dict mapping position -> weight, in first-seen order.
    """
    hit_counts = {}
    for dist in normalized_distributions:
        for pos in dist.argsort()[-k:][::-1]:
            hit_counts[pos] = hit_counts.get(pos, 0) + 1

    n_dists = len(normalized_distributions)
    return {pos: np.log(n_dists / hits) for pos, hits in hit_counts.items()}

In [11]:
def get_idf_vector(all_cpt_pos, num_featurs):
    """Expand a sparse position -> weight mapping into a dense list.

    Positions ``0 .. num_featurs - 1`` that are missing from ``all_cpt_pos``
    get the value 0.
    """
    return [all_cpt_pos.get(pos, 0) for pos in range(num_featurs)]

In [12]:
def reduce_dimention(all_cpt_idfs, X_normalized):
    """Keep only the columns whose weight is at least 0.001.

    The candidate positions are the keys of ``all_cpt_idfs`` (printed for
    inspection), visited in the mapping's own order; that order is also the
    column order of the output.

    Returns:
        list of lists, one reduced row per row of ``X_normalized``.
    """
    cpt_pos = list(all_cpt_idfs.keys())
    print(cpt_pos)

    kept_positions = [pos for pos in cpt_pos if all_cpt_idfs[pos] >= 0.001]
    return [[distribution[pos] for pos in kept_positions] for distribution in X_normalized]

In [13]:
def get_tfidf(tf_matrix, idf_vector):
    """Multiply every term-frequency row element-wise by the IDF weights.

    Only ``idf_vector[0]`` is used: the weights are expected to arrive wrapped
    in an outer sequence (one row).

    Returns:
        list with one numpy array per row of ``tf_matrix``.
    """
    return [np.multiply(tf, idf_vector[0]) for tf in tf_matrix]

In [14]:
def train_model(X, y, param_dist, spl, tfidf=False, minority_class_op_coef=1, majority_class_data_split=0.1):
    """Tune an XGBoost classifier with a randomized search and score it.

    The data is split 70/30 (stratified on ``y``, fixed seed), ten parameter
    samples are drawn from ``param_dist`` and compared on accuracy with 2-fold
    cross-validation, and the best estimator is evaluated by ``get_results``.

    ``tfidf``, ``minority_class_op_coef`` and ``majority_class_data_split``
    are accepted but not used: the TF-IDF preprocessing path they controlled
    is disabled.

    The classifier is configured with ``tree_method='gpu_hist'`` and
    ``gpu_id=0``, i.e. it is set up to train on GPU 0.

    Args:
        X: feature matrix.
        y: class labels.
        param_dist: search space passed to ``RandomizedSearchCV``.
        spl: display name of the positive class in the reports.

    Returns:
        ``(train_out, test_out, count_spl_others, count_others_spl)`` exactly
        as produced by ``get_results``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=798, stratify=y
    )

    n_classes = len(set(y_train))
    classifier = xgb.XGBClassifier(
        objective='multi:softprob',
        num_class=n_classes,
        tree_method='gpu_hist',
        gpu_id=0,
    )
    random_search = RandomizedSearchCV(
        classifier,
        param_distributions=param_dist,
        n_iter=10,
        scoring='accuracy',
        n_jobs=-1,
        cv=2,
        verbose=3,
        random_state=53,
    )
    random_search.fit(X_train, y_train)
    print("Best parameters found: ", random_search.best_params_)

    train_out, test_out, count_spl_others, count_others_spl = get_results(
        y_train, y_test, X_train, X_test, random_search, spl
    )
    return train_out, test_out, count_spl_others, count_others_spl

In [15]:
def _metric_row(data_type, split_name, spl, report):
    """Flatten one classification-report dict into a results-table row."""
    return [
        data_type, split_name, spl,
        report['Others']['precision'], report['Others']['recall'],
        report[spl]['precision'], report[spl]['recall'],
        report['macro avg']['f1-score'],
    ]


def train_binary_spl_predict_mdoel(specilities, param_dist):
    """Train one-vs-rest speciality classifiers on both feature sets.

    For every key of ``specilities`` a binary model is trained twice: once on
    the upper-level features (outliers from ``./outliers_v2/``) and once on
    the CCSPCS features (outliers from ``./outliersccspcs/``). Precision and
    recall per class plus macro-average F1 are collected per split.

    Args:
        specilities: dict mapping a display name to the list of speciality
            labels that form the positive class.
        param_dist: search space forwarded to ``train_model``.

    Returns:
        A DataFrame holding the test rows followed by the train rows. Each
        section starts with a header row, and a blank row follows every
        speciality. Its first five rows are also printed. The frame was
        previously discarded after printing; returning it lets the caller
        inspect or persist the full table.
    """
    npi_ccspcs_features_dict = get_npi_ccspcs_dict()
    npi_upper_level_features = pd.read_pickle("./npi_upper_features.pkl")
    npi_indigo_spl = pd.read_pickle('./npi_indigo_spl.pkl')
    indigo_spl_npi_upper = get_indigo_spl_npi(npi_upper_level_features, npi_indigo_spl)
    indigo_spl_npi_ccspcs = get_indigo_spl_npi(npi_ccspcs_features_dict, npi_indigo_spl)

    header = ["Data Type", "Train / Test", "Speciality", "Precision 0", "Recall 0", "Precision 1", "Recall 1", "Macro Avg F1"]
    results_test = [list(header)]
    results_train = [list(header)]

    # (label in the results table, speciality -> NPIs, outlier dir, NPI -> features)
    feature_sets = (
        ("CPT upper level features", indigo_spl_npi_upper, "./outliers_v2/", npi_upper_level_features),
        ("CCSPCS level 3 features", indigo_spl_npi_ccspcs, "./outliersccspcs/", npi_ccspcs_features_dict),
    )

    print('Starting the run!!')

    for spl in specilities:
        for data_type, spl_to_npis, outliers_path, features in feature_sets:
            try:
                X, y, _ = get_X_y(spl_to_npis, outliers_path, specilities[spl], features)
                train_out, test_out, _, _ = train_model(X, y, param_dist, spl)
                train_row = _metric_row(data_type, "Train", spl, train_out)
                test_row = _metric_row(data_type, "Test", spl, test_out)
            except Exception as exc:
                # Best effort: one failing speciality must not abort the run,
                # but the real cause is reported. A bare `except` would also
                # swallow KeyboardInterrupt and hide the reason behind a
                # fixed "not in ..." message.
                print('\033[1m' + '\033[91m' + spl + " failed for " + data_type + ": " + repr(exc) + '\033[0m' + '\033[0m')
            else:
                results_train.append(train_row)
                results_test.append(test_row)

        # Blank separator row after each speciality.
        results_train.append(len(header) * [""])
        results_test.append(len(header) * [""])

        print('\033[1m' + '\033[92m' + "Done: " + spl + '\033[0m' + '\033[0m')

    df = pd.DataFrame(results_test + results_train)
    print(df.head())
    return df

In [16]:
# Hyper-parameter search space; train_model hands it to RandomizedSearchCV
# as `param_distributions`.
param_dist = {
    'n_estimators': np.arange(50, 300, 50),
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [6, 7, 8, 9, 10, 11, 12],
    'colsample_bytree': [0.6, 0.65, 0.7, 0.75],
}

In [17]:
# Display name -> speciality labels that form the positive class for that
# model (train_binary_spl_predict_mdoel passes each value to get_X_y as
# `spl_lis`).
# NOTE: the next cell rebinds `Specialities` to a single-entry dict, so this
# full mapping is not what the training call below receives.
# NOTE(review): 'Pharmacology Clinica' looks truncated (possibly
# 'Pharmacology Clinical'); verify against the labels in npi_indigo_spl.
Specialities = {
    'Nurse': ['Advanced Practice Registered Nurse (APRN)',
    'Certified Nurse Practitioner',
    'Clinical Nurse Specialist',
    'Nurse - Student',
    'Certified Nurse Midwife (CNM)',
    'Nurse'],
    'Anesthesiology': ['Anesthesiology', 'Anesthesiology Assistant (AA)', 'CRNA'],
    'Cardiovascular': ['Cardiovascular Disease-Minor Surgery',
    'Cardiovascular Disease-No Surgery',
    'Cardiovascular Disease-Surgery',
    'Cardiac Technician',
    'Vascular-Surgery'],
    'Dermatology': ['Dermatology-Minor Surgery', 'Dermatology-No Surgery'],
    'Neurology': ['Neurology-Surgery', 'Neurology-No Surgery'],
    'Gynecology': ['Obstetrics Gynecology-Surgery',
    'Gynecology-Minor Surgery',
    'Gynecology-No Surgery',
    'Gynecology-Surgery'],
    'General': ['General Preventive Med-No Surgery', 'General NOC-Surgery'],
    'Orthopedic': ['Orthopedic Excl Back-Surgery',
    'Orthopedic Incl Back-Surgery'],
    'Ophthalmology': ['Ophthalmology-No Surgery',
    'Ophthalmology-Minor Surgery',
    'Ophthalmology-Surgery'],
    'Otorhinolaryngology': ['Otorhinolaryngology-No Surgery',
    'Otorhinolaryngology-Minor Surgery',
    'Otorhinolaryngology-Surgery'],
    'Radiology': ['Radiology Diagnostic-Minor Surgery',
    'Radiology Diagnostic-No Surgery'],
    'Pediatrics': ['Pediatrics-Minor Surgery', 'Pediatrics-No Surgery'],
    'Acupuncture': ['Acupuncture'],
    'Aerospace Medicine': ['Aerospace Medicine'],
    'Allergy': ['Allergy'],
    'Audiologist': ['Audiologist'],
    'Bariatric-Surgery': ['Bariatric-Surgery'],
    'Chiropractor': ['Chiropractor'],
    'Colon And Rectal-Surgery': ['Colon And Rectal-Surgery'],
    'Counselor': ['Counselor'],
    'Dietitian': ['Dietitian'],
    'Emergency Med-No Surgery': ['Emergency Med-No Surgery'],
    'Emergency Medical Technician (EMT)': ['Emergency Medical Technician (EMT)'],
    'Endocrinology-No Surgery': ['Endocrinology-No Surgery'],
    'Family Medicine-No Surgery': ['Family Medicine-No Surgery'],
    'Forensic Medicine': ['Forensic Medicine'],
    'Gastroenterology-No Surgery': ['Gastroenterology-No Surgery'],
    'Geriatrics-No Surgery': ['Geriatrics-No Surgery'],
    'Hand-Surgery': ['Hand-Surgery'],
    'Hematology-No Surgery': ['Hematology-No Surgery'],
    'Hospitalists': ['Hospitalists'],
    'Infectious Diseases-No Surgery': ['Infectious Diseases-No Surgery'],
    'Internal Medicine-No Surgery': ['Internal Medicine-No Surgery'],
    'Medical Assistant': ['Medical Assistant'],
    'Neonatology': ['Neonatology'],
    'Nephrology-No Surgery': ['Nephrology-No Surgery'],
    'Nuclear Medicine': ['Nuclear Medicine'],
    'O.R. Technician': ['O.R. Technician'],
    'Occupation Therapist': ['Occupation Therapist'],
    'Occupational Medicine': ['Occupational Medicine'],
    'Optometrist': ['Optometrist'],
    'Pain Medicine': ['Pain Medicine'],
    'Pathology-No Surgery': ['Pathology-No Surgery'],
    'Pharmacist': ['Pharmacist'],
    'Phlebology': ['Phlebology'],
    'Physiatry': ['Physiatry'],
    'Physical Therapist': ['Physical Therapist'],
    'Physician Assistant': ['Physician Assistant'],
    'Physicians NOC-No Surgery': ['Physicians NOC-No Surgery'],
    'Physiotherapist': ['Physiotherapist'],
    'Plastic Surgery': ['Plastic NOC-Surgery', 'Plastic Otorhinolaryngology-Surgery'],
    'Podiatrist': ['Podiatrist'],
    'Psychiatry': ['Psychiatry'],
    'Psychologist': ['Psychologist'],
    'Pulmonary Diseases-No Surgery': ['Pulmonary Diseases-No Surgery'],
    'Radiation Therapy NOC': ['Radiation Therapy NOC'],
    'Respiratory Therapist': ['Respiratory Therapist'],
    'Rheumatology-No Surgery': ['Rheumatology-No Surgery'],
    'Social Worker': ['Social Worker'],
    'Sonographer': ['Sonographer'],
    'Surgeon Assistant': ['Surgeon Assistant'],
    'Thoracic-Surgery': ['Thoracic-Surgery'],
    'Traumatic-Surgery': ['Traumatic-Surgery'],
    'Urgent Care-No Surgery': ['Urgent Care-No Surgery'],
    'Urological-Surgery': ['Urological-Surgery'],
    'Pharmacology Clinica': ['Pharmacology Clinica']
 }

In [18]:
# Restrict the run to a single speciality group. Rebinding the name discards
# whatever mapping `Specialities` held before this cell.
Specialities = {
    'Anesthesiology': ['Anesthesiology', 'Anesthesiology Assistant (AA)', 'CRNA'],
}

In [19]:
# Train and evaluate one binary model per key of `Specialities`, on both
# feature sets, using the search space in `param_dist`.
train_binary_spl_predict_mdoel(Specialities, param_dist)

Starting the run!!
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  90
num of others predicted as spl:  197
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  143
num of others predicted as spl:  118
[1m[92mDone: Anesthesiology[0m[0m
                          0             1               2            3  \
0                 Data Type  Train / Test      Speciality  Precision 0   
1  CPT upper level features          Test  Anesthesiology     0.997413   
2   CCSPCS level 3 features          Test  Anesthesiology     0.992303   
3                                                                        
4                 Data Type  Train / Test      Speciality  Precision 0   

          4            5         6             7  
0  Recall 0  Precision 1  Recall 1  Macro Avg F1  
1  0.994354     0.908965  0.956247      0.963944  
2   0.99364     0.888784  0.868324      0.935703  
3                                                 
4  Recall 0  Precision 1  Recall 1  Macro A

In [None]:
# Load both feature tables into the notebook namespace (the same two sources
# that train_binary_spl_predict_mdoel loads internally).
npi_ccspcs_features_dict = get_npi_ccspcs_dict()
npi_upper_level_features = pd.read_pickle("./npi_upper_features.pkl")

In [36]:
# Build X / y from the upper-level features with the three
# 'Cardiovascular Disease-*' labels as the positive class.
# NOTE(review): outliers are read from "./outliers/" here, whereas
# train_binary_spl_predict_mdoel reads "./outliers_v2/" for the same
# feature set - confirm which directory is intended.
npi_indigo_spl = pd.read_pickle('./npi_indigo_spl.pkl')
indigo_spl_npi = get_indigo_spl_npi(npi_upper_level_features, npi_indigo_spl)
X, y, npi_lis_spl = get_X_y(indigo_spl_npi, "./outliers/", ["Cardiovascular Disease-Surgery", "Cardiovascular Disease-No Surgery", "Cardiovascular Disease-Minor Surgery"], npi_upper_level_features)

In [21]:
# Ad-hoc check: is the bare group name 'Radiology' one of the raw speciality
# labels? The list is a hard-coded snapshot. One literal
# ('Gastroenterology-No Surgery') had been split across two physical lines,
# which is a syntax error; it is rejoined here.
known_spl_labels = [
    'Respiratory Therapist', 'Bariatric-Surgery', 'Cardiovascular Disease-Surgery',
    'Sonographer', 'Pulmonary Diseases-No Surgery', 'Cardiovascular Disease-Minor Surgery',
    'Allergy', 'CRNA', 'O.R. Technician', 'Gynecology-Minor Surgery', 'Counselor',
    'Aerospace Medicine', 'Anesthesiology', 'Pharmacist', 'Physician Assistant',
    'Colon And Rectal-Surgery', 'Otorhinolaryngology-Minor Surgery',
    'Radiology Diagnostic-No Surgery', 'Infectious Diseases-No Surgery',
    'Ophthalmology-Surgery', 'Nuclear Medicine', 'Endocrinology-No Surgery',
    'Advanced Practice Registered Nurse (APRN)', 'Surgeon Assistant', 'Medical Assistant',
    'Cardiovascular Disease-No Surgery', 'Dietitian', 'Dermatology-No Surgery',
    'Pathology-No Surgery', 'Family Medicine-No Surgery', 'Gynecology-No Surgery',
    'Obstetrics Gynecology-Surgery', 'Nephrology-No Surgery', 'Traumatic-Surgery',
    'Phlebology', 'Orthopedic Incl Back-Surgery', 'Radiation Therapy NOC',
    'Urological-Surgery', 'Occupation Therapist', 'Pediatrics-Minor Surgery',
    'General NOC-Surgery', 'Hand-Surgery', 'Rheumatology-No Surgery',
    'Certified Nurse Practitioner', 'Plastic Otorhinolaryngology-Surgery', 'Neonatology',
    'Otorhinolaryngology-No Surgery', 'Neurology-No Surgery', 'Optometrist',
    'Nurse - Student', 'Radiology Diagnostic-Minor Surgery', 'Thoracic-Surgery',
    'Gynecology-Surgery', 'Geriatrics-No Surgery', 'Psychiatry',
    'Emergency Medical Technician (EMT)', 'Physiotherapist', 'Ophthalmology-Minor Surgery',
    'Internal Medicine-No Surgery', 'Dermatology-Minor Surgery',
    'Orthopedic Excl Back-Surgery', 'Anesthesiology Assistant (AA)',
    'Hematology-No Surgery', 'Physicians NOC-No Surgery', 'Neurology-Surgery',
    'Hospitalists', 'Forensic Medicine', 'Ophthalmology-No Surgery', 'Acupuncture',
    'Plastic NOC-Surgery', 'Otorhinolaryngology-Surgery', 'Occupational Medicine',
    'Audiologist', 'Nurse', 'Vascular-Surgery', 'Psychologist', 'Pain Medicine',
    'Podiatrist', 'Chiropractor', 'Physiatry', 'Urgent Care-No Surgery',
    'Gastroenterology-No Surgery', 'Pediatrics-No Surgery', 'Clinical Nurse Specialist',
    'Certified Nurse Midwife (CNM)', 'General Preventive Med-No Surgery',
    'Physical Therapist', 'Emergency Med-No Surgery', 'Social Worker',
]
if 'Radiology' in known_spl_labels:
    print(True)

In [17]:
# CCSPCS features, Family Medicine.
# train_model requires the positive-class display name as its fourth argument
# and returns (train_out, test_out, count_spl_others, count_others_spl); the
# previous call omitted the name and unpacked five values.
# NOTE(review): X and y are whatever an earlier get_X_y cell left in the
# kernel - rebuild them for this feature set before re-running.
train_out, test_out, count_spl_others, count_others_spl = train_model(X, y, param_dist, "Family Medicine-No Surgery")

Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
                            precision    recall  f1-score   support

                    Others       0.94      1.00      0.97     41514
Family Medicine-No Surgery       0.89      0.39      0.54      4309

                  accuracy                           0.94     45823
                 macro avg       0.92      0.69      0.75     45823
              weighted avg       0.94      0.94      0.93     45823

X_test results,
                            precision    recall  f1-score   support

                    Others       0.93      0.99      0.96     17792
Family Medicine-No Surgery       0.71      0.29      0.41      1847

                  accuracy                           0.92     19639
                 macro avg       0.82      0.64      0.69     19639
              weighted avg       0.91      0.92      0.91     19639

number of spl predicted as others:  

In [23]:
# Upper-level NPI features, Family Medicine.
# train_model requires the positive-class display name as its fourth argument
# and returns (train_out, test_out, count_spl_others, count_others_spl); the
# previous call omitted the name and unpacked five values.
# NOTE(review): X and y are whatever an earlier get_X_y cell left in the
# kernel - rebuild them for this feature set before re-running.
train_out, test_out, count_spl_others, count_others_spl = train_model(X, y, param_dist, "Family Medicine-No Surgery")

113594 9551
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
                            precision    recall  f1-score   support

                    Others       0.95      0.99      0.97     79515
Family Medicine-No Surgery       0.85      0.40      0.54      6686

                  accuracy                           0.95     86201
                 macro avg       0.90      0.70      0.76     86201
              weighted avg       0.94      0.95      0.94     86201

X_test results,
                            precision    recall  f1-score   support

                    Others       0.94      0.99      0.97     34079
Family Medicine-No Surgery       0.71      0.31      0.43      2865

                  accuracy                           0.94     36944
                 macro avg       0.83      0.65      0.70     36944
              weighted avg       0.93      0.94      0.93     36944

number of spl predicted as others:  

In [34]:
# CCSPCS features, cardiologist.
# train_model returns (train_out, test_out, count_spl_others,
# count_others_spl); the previous call unpacked five values.
# NOTE(review): X and y are whatever an earlier get_X_y cell left in the
# kernel - rebuild them for this feature set before re-running.
train_out, test_out, count_spl_others, count_others_spl = train_model(X, y, param_dist, "cardiologist")

64170 1292
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
              precision    recall  f1-score   support

      Others       1.00      1.00      1.00     44919
cardiologist       0.96      0.85      0.90       904

    accuracy                           1.00     45823
   macro avg       0.98      0.92      0.95     45823
weighted avg       1.00      1.00      1.00     45823

X_test results,
              precision    recall  f1-score   support

      Others       0.99      1.00      1.00     19251
cardiologist       0.85      0.73      0.78       388

    accuracy                           0.99     19639
   macro avg       0.92      0.86      0.89     19639
weighted avg       0.99      0.99      0.99     19639

number of spl predicted as others:  104
num of others predicted as spl:  52


In [39]:
# Upper-level NPI features, cardiologist.
# train_model returns (train_out, test_out, count_spl_others,
# count_others_spl); the previous call unpacked five values.
# NOTE(review): X and y are whatever an earlier get_X_y cell left in the
# kernel - rebuild them for this feature set before re-running.
train_out, test_out, count_spl_others, count_others_spl = train_model(X, y, param_dist, "cardiologist")

121150 1995
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
              precision    recall  f1-score   support

      Others       1.00      1.00      1.00     84805
cardiologist       0.89      0.77      0.82      1396

    accuracy                           0.99     86201
   macro avg       0.94      0.88      0.91     86201
weighted avg       0.99      0.99      0.99     86201

X_test results,
              precision    recall  f1-score   support

      Others       0.99      1.00      1.00     36345
cardiologist       0.82      0.69      0.75       599

    accuracy                           0.99     36944
   macro avg       0.91      0.84      0.87     36944
weighted avg       0.99      0.99      0.99     36944

number of spl predicted as others:  186
num of others predicted as spl:  89


In [97]:
# Grouped specialities only: display name -> speciality labels in the group.
# "Neurology" used to appear twice in this literal; a repeated key keeps the
# position of its first occurrence and the value of its last, so the single
# entry below yields exactly the same dict.
Specialities = {
    "Nurse":["Advanced Practice Registered Nurse (APRN)", "Certified Nurse Practitioner", "Clinical Nurse Specialist", "Nurse - Student", "Certified Nurse Midwife (CNM)", "Nurse"],
    "Anesthesiology":["Anesthesiology", "Anesthesiology Assistant (AA)"],
    "Cardiovascular":['Cardiovascular Disease-Minor Surgery', 'Cardiovascular Disease-No Surgery', 'Cardiovascular Disease-Surgery', 'Cardiac Technician', "Vascular-Surgery"],
    "Dermatology": ['Dermatology-Minor Surgery', 'Dermatology-No Surgery'],
    "Neurology":["Neurology-Surgery", "Neurology-No Surgery"],
    "Gynecology":["Obstetrics Gynecology-Surgery", "Gynecology-Minor Surgery", "Gynecology-No Surgery", "Gynecology-Surgery"],
    "General":["General Preventive Med-No Surgery", "General NOC-Surgery"],
    "Orthopedic":["Orthopedic Excl Back-Surgery", "Orthopedic Incl Back-Surgery"],
    "Ophthalmology":["Ophthalmology-No Surgery", "Ophthalmology-Minor Surgery", "Ophthalmology-Surgery"],
    "Otorhinolaryngology":["Otorhinolaryngology-No Surgery", "Otorhinolaryngology-Minor Surgery", "Otorhinolaryngology-Surgery"],
    "Radiology": ["Radiology Diagnostic-Minor Surgery", "Radiology Diagnostic-No Surgery"],
    'Pediatrics': ['Pediatrics-Minor Surgery', 'Pediatrics-No Surgery'],
}

In [98]:
import os


def _spl_from_outlier_file(file_name, prefix='outlier_npis', suffix='.pkl'):
    """Return the speciality embedded in ``outlier_npis<speciality>.pkl``.

    Returns None when the file name does not have that exact prefix and
    suffix. Slicing is used instead of ``str.strip`` because ``strip`` removes
    any of the given *characters* from both ends, not a prefix/suffix:
    'outlier_npisPharmacology Clinical.pkl' came out as
    'Pharmacology Clinica'.
    """
    if file_name.startswith(prefix) and file_name.endswith(suffix):
        return file_name[len(prefix):len(file_name) - len(suffix)]
    return None


# Every speciality label that already belongs to a group.
spl_all = [spl for upper_spl in Specialities for spl in Specialities[upper_spl]]
print(spl_all)

# Add each speciality that has an outlier file but is not part of a group as
# its own single-label group. This mutates `Specialities` in place.
for outlier_dir in ('./outliers/', './outliersccspcs/'):
    for file_name in os.listdir(outlier_dir):
        spl = _spl_from_outlier_file(file_name)
        # Skip unrelated files and an empty name.
        if spl and spl not in spl_all:
            Specialities[spl] = [spl]

['Advanced Practice Registered Nurse (APRN)', 'Certified Nurse Practitioner', 'Clinical Nurse Specialist', 'Nurse - Student', 'Certified Nurse Midwife (CNM)', 'Nurse', 'Anesthesiology', 'Anesthesiology Assistant (AA)', 'Cardiovascular Disease-Minor Surgery', 'Cardiovascular Disease-No Surgery', 'Cardiovascular Disease-Surgery', 'Cardiac Technician', 'Vascular-Surgery', 'Dermatology-Minor Surgery', 'Dermatology-No Surgery', 'Neurology-Surgery', 'Neurology-No Surgery', 'Obstetrics Gynecology-Surgery', 'Gynecology-Minor Surgery', 'Gynecology-No Surgery', 'Gynecology-Surgery', 'General Preventive Med-No Surgery', 'General NOC-Surgery', 'Orthopedic Excl Back-Surgery', 'Orthopedic Incl Back-Surgery', 'Ophthalmology-No Surgery', 'Ophthalmology-Minor Surgery', 'Ophthalmology-Surgery', 'Otorhinolaryngology-No Surgery', 'Otorhinolaryngology-Minor Surgery', 'Otorhinolaryngology-Surgery', 'Radiology Diagnostic-Minor Surgery', 'Radiology Diagnostic-No Surgery', 'Pediatrics-Minor Surgery', 'Pediatri

In [95]:
# Dump the current Specialities mapping for inspection.
print(Specialities)

{'Nurse': ['Advanced Practice Registered Nurse (APRN)', 'Certified Nurse Practitioner', 'Clinical Nurse Specialist', 'Nurse - Student', 'Certified Nurse Midwife (CNM)', 'Nurse'], 'Anesthesiology': ['Anesthesiology', 'Anesthesiology Assistant (AA)'], 'Cardiovascular': ['Cardiovascular Disease-Minor Surgery', 'Cardiovascular Disease-No Surgery', 'Cardiovascular Disease-Surgery', 'Cardiac Technician', 'Vascular-Surgery'], 'Dermatology': ['Dermatology-Minor Surgery', 'Dermatology-No Surgery'], 'Neurology': ['Neurology-Surgery', 'Neurology-No Surgery'], 'Gynecology': ['Obstetrics Gynecology-Surgery', 'Gynecology-Minor Surgery', 'Gynecology-No Surgery', 'Gynecology-Surgery'], 'General': ['General Preventive Med-No Surgery', 'General NOC-Surgery'], 'Orthopedic': ['Orthopedic Excl Back-Surgery', 'Orthopedic Incl Back-Surgery'], 'Ophthalmology': ['Ophthalmology-No Surgery', 'Ophthalmology-Minor Surgery', 'Ophthalmology-Surgery'], 'Otorhinolaryngology': ['Otorhinolaryngology-No Surgery', 'Otorhi

In [101]:
# Number of speciality groups currently in the mapping.
len(Specialities)

68

In [102]:
# Echo the mapping as the cell's output.
Specialities

{'Nurse': ['Advanced Practice Registered Nurse (APRN)',
  'Certified Nurse Practitioner',
  'Clinical Nurse Specialist',
  'Nurse - Student',
  'Certified Nurse Midwife (CNM)',
  'Nurse'],
 'Anesthesiology': ['Anesthesiology', 'Anesthesiology Assistant (AA)'],
 'Cardiovascular': ['Cardiovascular Disease-Minor Surgery',
  'Cardiovascular Disease-No Surgery',
  'Cardiovascular Disease-Surgery',
  'Cardiac Technician',
  'Vascular-Surgery'],
 'Dermatology': ['Dermatology-Minor Surgery', 'Dermatology-No Surgery'],
 'Neurology': ['Neurology-Surgery', 'Neurology-No Surgery'],
 'Gynecology': ['Obstetrics Gynecology-Surgery',
  'Gynecology-Minor Surgery',
  'Gynecology-No Surgery',
  'Gynecology-Surgery'],
 'General': ['General Preventive Med-No Surgery', 'General NOC-Surgery'],
 'Orthopedic': ['Orthopedic Excl Back-Surgery',
  'Orthopedic Incl Back-Surgery'],
 'Ophthalmology': ['Ophthalmology-No Surgery',
  'Ophthalmology-Minor Surgery',
  'Ophthalmology-Surgery'],
 'Otorhinolaryngology': ['O

In [103]:
# Hand-written display name -> speciality labels mapping (grouped entries
# first, then single-label entries).
# Differences from the earlier full literal in this notebook: 'CRNA' is its
# own group instead of being part of 'Anesthesiology', and
# 'Plastic NOC-Surgery' / 'Plastic Otorhinolaryngology-Surgery' are separate
# groups instead of one 'Plastic Surgery' group.
# NOTE(review): 'Pharmacology Clinica' looks truncated (possibly
# 'Pharmacology Clinical'); the cell that derives names from outlier file
# names uses str.strip('.pkl'), which also removes a trailing 'l' - verify
# against the labels in npi_indigo_spl.
Specialities = {
    'Nurse': ['Advanced Practice Registered Nurse (APRN)',
    'Certified Nurse Practitioner',
    'Clinical Nurse Specialist',
    'Nurse - Student',
    'Certified Nurse Midwife (CNM)',
    'Nurse'],
    'Anesthesiology': ['Anesthesiology', 'Anesthesiology Assistant (AA)'],
    'Cardiovascular': ['Cardiovascular Disease-Minor Surgery',
    'Cardiovascular Disease-No Surgery',
    'Cardiovascular Disease-Surgery',
    'Cardiac Technician',
    'Vascular-Surgery'],
    'Dermatology': ['Dermatology-Minor Surgery', 'Dermatology-No Surgery'],
    'Neurology': ['Neurology-Surgery', 'Neurology-No Surgery'],
    'Gynecology': ['Obstetrics Gynecology-Surgery',
    'Gynecology-Minor Surgery',
    'Gynecology-No Surgery',
    'Gynecology-Surgery'],
    'General': ['General Preventive Med-No Surgery', 'General NOC-Surgery'],
    'Orthopedic': ['Orthopedic Excl Back-Surgery',
    'Orthopedic Incl Back-Surgery'],
    'Ophthalmology': ['Ophthalmology-No Surgery',
    'Ophthalmology-Minor Surgery',
    'Ophthalmology-Surgery'],
    'Otorhinolaryngology': ['Otorhinolaryngology-No Surgery',
    'Otorhinolaryngology-Minor Surgery',
    'Otorhinolaryngology-Surgery'],
    'Radiology': ['Radiology Diagnostic-Minor Surgery',
    'Radiology Diagnostic-No Surgery'],
    'Pediatrics': ['Pediatrics-Minor Surgery', 'Pediatrics-No Surgery'],
    'Acupuncture': ['Acupuncture'],
    'Aerospace Medicine': ['Aerospace Medicine'],
    'Allergy': ['Allergy'],
    'Audiologist': ['Audiologist'],
    'Bariatric-Surgery': ['Bariatric-Surgery'],
    'Chiropractor': ['Chiropractor'],
    'Colon And Rectal-Surgery': ['Colon And Rectal-Surgery'],
    'Counselor': ['Counselor'],
    'CRNA': ['CRNA'],
    'Dietitian': ['Dietitian'],
    'Emergency Med-No Surgery': ['Emergency Med-No Surgery'],
    'Emergency Medical Technician (EMT)': ['Emergency Medical Technician (EMT)'],
    'Endocrinology-No Surgery': ['Endocrinology-No Surgery'],
    'Family Medicine-No Surgery': ['Family Medicine-No Surgery'],
    'Forensic Medicine': ['Forensic Medicine'],
    'Gastroenterology-No Surgery': ['Gastroenterology-No Surgery'],
    'Geriatrics-No Surgery': ['Geriatrics-No Surgery'],
    'Hand-Surgery': ['Hand-Surgery'],
    'Hematology-No Surgery': ['Hematology-No Surgery'],
    'Hospitalists': ['Hospitalists'],
    'Infectious Diseases-No Surgery': ['Infectious Diseases-No Surgery'],
    'Internal Medicine-No Surgery': ['Internal Medicine-No Surgery'],
    'Medical Assistant': ['Medical Assistant'],
    'Neonatology': ['Neonatology'],
    'Nephrology-No Surgery': ['Nephrology-No Surgery'],
    'Nuclear Medicine': ['Nuclear Medicine'],
    'O.R. Technician': ['O.R. Technician'],
    'Occupation Therapist': ['Occupation Therapist'],
    'Occupational Medicine': ['Occupational Medicine'],
    'Optometrist': ['Optometrist'],
    'Pain Medicine': ['Pain Medicine'],
    'Pathology-No Surgery': ['Pathology-No Surgery'],
    'Pharmacist': ['Pharmacist'],
    'Phlebology': ['Phlebology'],
    'Physiatry': ['Physiatry'],
    'Physical Therapist': ['Physical Therapist'],
    'Physician Assistant': ['Physician Assistant'],
    'Physicians NOC-No Surgery': ['Physicians NOC-No Surgery'],
    'Physiotherapist': ['Physiotherapist'],
    'Plastic NOC-Surgery': ['Plastic NOC-Surgery'],
    'Plastic Otorhinolaryngology-Surgery': ['Plastic Otorhinolaryngology-Surgery'],
    'Podiatrist': ['Podiatrist'],
    'Psychiatry': ['Psychiatry'],
    'Psychologist': ['Psychologist'],
    'Pulmonary Diseases-No Surgery': ['Pulmonary Diseases-No Surgery'],
    'Radiation Therapy NOC': ['Radiation Therapy NOC'],
    'Respiratory Therapist': ['Respiratory Therapist'],
    'Rheumatology-No Surgery': ['Rheumatology-No Surgery'],
    'Social Worker': ['Social Worker'],
    'Sonographer': ['Sonographer'],
    'Surgeon Assistant': ['Surgeon Assistant'],
    'Thoracic-Surgery': ['Thoracic-Surgery'],
    'Traumatic-Surgery': ['Traumatic-Surgery'],
    'Urgent Care-No Surgery': ['Urgent Care-No Surgery'],
    'Urological-Surgery': ['Urological-Surgery'],
    'Pharmacology Clinica': ['Pharmacology Clinica']
 }