In [133]:
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, make_scorer, classification_report
import pandas as pd
import numpy as np
import random

In [134]:
def save_pickle(file_name, file_path):
    """Serialize a Python object to disk with pickle.

    Args:
        file_name: The object to pickle. Despite the parameter name this is
            the payload itself, not a file name.
        file_path: Destination path. It is opened in binary write mode, so an
            existing file at that location is overwritten.
    """
    with open(file_path, mode='wb') as handle:
        pickle.Pickler(handle).dump(file_name)

In [135]:
def normalize_practitioner_level_features(npi_cpt_features):
    """Scale every practitioner's feature vector so that it sums to one.

    Args:
        npi_cpt_features: Mapping of NPI -> sequence of numeric feature values.

    Returns:
        dict: Mapping of the same NPIs -> float ``numpy`` array holding each
        value divided by the vector's total (summed along axis 0, as the
        built-in ``sum`` would do).

    A vector whose total is zero is returned as all zeros. Dividing by the
    zero total would otherwise produce NaN entries, and those NaNs would leak
    into every column sum and ranking computed from the feature matrix.
    """
    npi_normalized_cpt_features = {}
    for npi, raw_values in npi_cpt_features.items():
        values = np.asarray(raw_values, dtype=float)
        total = values.sum(axis=0)
        # `where` skips the division wherever the total is zero, leaving the
        # pre-filled zeros of `out` in place instead of NaN/inf.
        npi_normalized_cpt_features[npi] = np.divide(
            values, total, out=np.zeros_like(values), where=(total != 0)
        )
    return npi_normalized_cpt_features

In [136]:
def remove_outliers(npi_features, npi_indigo_spl, outliers_path):
    """Drop the NPIs that are listed as outliers for their speciality.

    Args:
        npi_features: Mapping of NPI -> feature vector. Keys must be
            convertible with ``int()``.
        npi_indigo_spl: Mapping of integer NPI -> speciality name.
        outliers_path: String prefix (including the trailing separator) of the
            per-speciality pickles named ``outlier_npis<speciality>.pkl``.

    Returns:
        dict: The entries of ``npi_features`` whose NPI has a known speciality
        and is not contained in that speciality's outlier pickle. NPIs without
        a speciality are dropped as well.
    """
    # Bucket the NPIs that have a known speciality by that speciality.
    npis_by_speciality = {}
    for npi in npi_features:
        npi_key = int(npi)
        if npi_key not in npi_indigo_spl:
            continue
        npis_by_speciality.setdefault(npi_indigo_spl[npi_key], []).append(npi)

    outliers_removed_npi_features = {}
    for spl in list(set(npis_by_speciality.keys())):
        # One outlier file per speciality; membership is tested with `in`.
        outlier_cur = pd.read_pickle(outliers_path + "outlier_npis" + spl + ".pkl")
        for npi in npis_by_speciality[spl]:
            if npi in outlier_cur:
                continue
            outliers_removed_npi_features[npi] = npi_features[npi]

    return outliers_removed_npi_features

In [137]:
def encode_labels(npi_indigo_spl, npi_normalized_cpt_features):
    """Build the feature matrix and integer labels for NPIs with a speciality.

    Args:
        npi_indigo_spl: Mapping of integer NPI -> speciality name.
        npi_normalized_cpt_features: Mapping of NPI -> feature vector. Keys
            must be convertible with ``int()``.

    Returns:
        tuple: ``(X, encoded_y, y, indigo_spl_label)`` where ``X`` is an array
        of the feature vectors, ``encoded_y`` an array of integer codes, ``y``
        the list of speciality names in the same row order, and
        ``indigo_spl_label`` the speciality -> code mapping. Codes are handed
        out in order of first appearance, starting at 0. NPIs without a
        speciality are skipped.
    """
    feature_rows, label_codes, label_names = [], [], []
    indigo_spl_label = {}

    for npi, feature_vector in npi_normalized_cpt_features.items():
        npi_key = int(npi)
        if npi_key not in npi_indigo_spl:
            continue
        speciality = npi_indigo_spl[npi_key]
        # The next free code always equals the number of labels seen so far.
        code = indigo_spl_label.setdefault(speciality, len(indigo_spl_label))
        feature_rows.append(feature_vector)
        label_names.append(speciality)
        label_codes.append(code)

    return np.array(feature_rows), np.array(label_codes), label_names, indigo_spl_label

In [138]:
def get_distributions(X_normalized, encoded_y, speciality, indigo_spl_label):
    """Sum the feature rows of one speciality and of everyone else.

    Args:
        X_normalized: 2-D feature array, one row per practitioner.
        encoded_y: Array of integer label codes aligned with the rows.
        speciality: Name of the speciality of interest.
        indigo_spl_label: Mapping of speciality name -> integer code.

    Returns:
        tuple: ``(spl_distribution, others_distribution)``, the column-wise
        sums over the rows labelled ``speciality`` and over all other rows.
    """
    is_speciality = encoded_y == indigo_spl_label[speciality]

    def column_totals(row_mask):
        row_positions = np.where(row_mask)[0]
        return np.sum(X_normalized[row_positions, :], axis=0)

    return column_totals(is_speciality), column_totals(~is_speciality)

In [139]:
def get_binary_labels(encoded_y, speciality, indigo_spl_label):
    """Turn multi-class codes into one-vs-rest labels for one speciality.

    Args:
        encoded_y: Array of integer label codes.
        speciality: Name of the speciality treated as the positive class.
        indigo_spl_label: Mapping of speciality name -> integer code.

    Returns:
        A new array with the shape and dtype of ``encoded_y`` that holds 1
        where the code matches ``speciality`` and 0 elsewhere.
    """
    positive_code = indigo_spl_label[speciality]
    return (encoded_y == positive_code).astype(encoded_y.dtype)

In [140]:
def get_multipliers(k, depreciaition=0.9):
    """Build the rank-pair weight table used to scale shared features.

    For every pair of 1-based ranks ``(i, j)`` with ``1 <= i, j <= k`` the
    weight is ``abs(i - j) * depreciaition ** (min(i, j) - 1)``: the rank gap,
    discounted by how far down the list the better of the two ranks sits.
    Identical ranks therefore get a weight of 0.

    Args:
        k: Number of ranks to cover.
        depreciaition: Per-rank discount factor (the spelling is kept because
            it is part of the existing keyword interface).

    Returns:
        dict: Mapping of ``"i-j"`` string keys to weights. The table is
        symmetric, i.e. ``"i-j"`` and ``"j-i"`` share one value.
    """
    multipliers = {}
    for i in range(1, k + 1):
        for j in range(i, k + 1):
            # j >= i here, so min(i, j) == i and abs(i - j) == j - i.
            # Both key orders use the caller's discount factor; the mirrored
            # key previously used a hard-coded 0.9.
            weight = (j - i) * (depreciaition ** (i - 1))
            multipliers[str(i) + '-' + str(j)] = weight
            multipliers[str(j) + '-' + str(i)] = weight

    return multipliers

In [160]:
def get_features(k, spl_cpt_pos, others_cpt_pos, X):
    """Select the union of two top-k feature lists and scale each column.

    Args:
        k: Length of the rank lists; also the multiplier applied to a feature
            that appears in only one of them.
        spl_cpt_pos: Column positions ranked for the speciality, best first.
        others_cpt_pos: Column positions ranked for everyone else, best first.
        X: 2-D feature array whose columns are indexed by those positions.

    Returns:
        The selected columns of ``X``, each multiplied by its weight. A
        feature present in both lists is weighted by the ``get_multipliers``
        entry for its pair of 1-based ranks (0 when both ranks are equal); a
        feature present in only one list is weighted by ``k``.
    """
    rank_pair_weights = get_multipliers(k)
    feature_pos = np.array(list(set(spl_cpt_pos) | set(others_cpt_pos)))

    def first_ranks(ranking):
        # 1-based rank of every position, keeping the first occurrence.
        ranks = {}
        for rank, position in enumerate(ranking, start=1):
            ranks.setdefault(position, rank)
        return ranks

    spl_rank = first_ranks(spl_cpt_pos)
    others_rank = first_ranks(others_cpt_pos)

    multipliers = [
        rank_pair_weights[str(spl_rank[position]) + '-' + str(others_rank[position])]
        if position in spl_rank and position in others_rank
        else k
        for position in feature_pos
    ]

    return X[:, feature_pos] * np.array(multipliers)

In [142]:
def get_top_k_cpt_codes_features(spl_distribution, others_distribution, X, k=25):
    """Reduce ``X`` to the top-k features of both distributions, rank-weighted.

    Args:
        spl_distribution: Per-feature totals for the speciality of interest.
        others_distribution: Per-feature totals for all other specialities.
        X: 2-D feature array whose columns line up with the distributions.
        k: Number of highest-valued features taken from each distribution.

    Returns:
        The scaled, column-reduced array produced by ``get_features``.
    """
    def top_positions(distribution):
        # argsort is ascending: take the last k entries, then flip them so
        # the largest value comes first.
        ascending = distribution.argsort()
        return ascending[-k:][::-1]

    return get_features(
        k,
        top_positions(spl_distribution),
        top_positions(others_distribution),
        X,
    )

In [143]:
def get_results(y_train, y_test, X_train, X_test, random_search, spl):
    """Evaluate the best estimator of a fitted search on train and test data.

    Args:
        y_train: True binary labels of the training rows (0 = others, 1 = spl).
        y_test: True binary labels of the test rows.
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        random_search: Fitted search object exposing ``best_estimator_`` with
            a ``predict_proba`` method.
        spl: Speciality name used as the display name of class 1.

    Returns:
        tuple: ``(train_out, test_out, count_spl_others, count_others_spl)``.
        The first two are the ``classification_report`` dicts for train and
        test. The counts are, on the test set, the number of speciality rows
        predicted as others and of other rows predicted as the speciality.
    """
    best_model = random_search.best_estimator_
    class_names = ['Others', spl]

    print("X_train results,")
    train_predictions = np.argmax(best_model.predict_proba(X_train), axis=1)
    train_out = classification_report(y_train, train_predictions, target_names=class_names, output_dict=True)

    print("X_test results,")
    predictions = np.argmax(best_model.predict_proba(X_test), axis=1)
    test_out = classification_report(y_test, predictions, target_names=class_names, output_dict=True)

    # Compare positionally: `y_test[i]` is a label lookup on a pandas Series,
    # which fails or misaligns once the split has shuffled the index.
    actual = np.asarray(y_test)

    # number of spl predicted as others
    count_spl_others = int(np.sum((actual == 1) & (predictions == 0)))
    print("number of spl predicted as others: ", count_spl_others)

    # num of others predicted as spl
    count_others_spl = int(np.sum((actual == 0) & (predictions == 1)))
    print("num of others predicted as spl: ", count_others_spl)

    return train_out, test_out, count_spl_others, count_others_spl

In [144]:
def train_model(X, y, param_dist, spl):
    """Tune an XGBoost classifier with a randomized search and evaluate it.

    The data is split 70/30 (stratified on ``y``, fixed seed). Ten parameter
    candidates are tried with 2-fold cross-validation on the training part,
    and the best estimator is then scored by ``get_results``.

    Args:
        X: Feature matrix.
        y: Labels aligned with the rows of ``X``.
        param_dist: Parameter distributions handed to ``RandomizedSearchCV``.
        spl: Speciality name, forwarded to ``get_results`` for reporting.

    Returns:
        tuple: ``(train_out, test_out, count_spl_others, count_others_spl)``
        exactly as returned by ``get_results``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=798, stratify=y
    )

    classifier = xgb.XGBClassifier(
        objective='multi:softprob',
        num_class=len(set(y_train)),
        tree_method='gpu_hist',
        gpu_id=0,
    )

    # NOTE(review): candidates are ranked by plain accuracy, and the search
    # fans out over all cores (n_jobs=-1) while every fit targets GPU 0.
    # Confirm both are intended for these one-vs-rest labels.
    search_options = dict(
        param_distributions=param_dist,
        n_iter=10,
        scoring='accuracy',
        n_jobs=-1,
        cv=2,
        verbose=3,
        random_state=53,
    )
    random_search = RandomizedSearchCV(classifier, **search_options)
    random_search.fit(X_train, y_train)
    print("Best parameters found: ", random_search.best_params_)

    return get_results(y_train, y_test, X_train, X_test, random_search, spl)

In [145]:
def save_results(results_test, results_train, save_path):
    """Write the test rows followed by the train rows to one CSV file.

    Args:
        results_test: List of rows (lists) for the test split.
        results_train: List of rows (lists) for the train split.
        save_path: Destination CSV path.

    The two lists are concatenated as they are, so any header row they
    contain is written as an ordinary data row. The CSV's own header line
    holds the positional column labels and no index column is written.
    """
    stacked_rows = results_test + results_train
    pd.DataFrame(stacked_rows).to_csv(save_path, index=False)

In [165]:
def train_model_on_specialities(specialities, X_normalized, encoded_y, indigo_spl_label, param_dist,
                                save_path='./binary_spl_prediction_rank_importance_WiOutliers_output_v1.csv'):
    """Train and evaluate one one-vs-rest model per speciality.

    For every speciality the rank-weighted top-k features are built, a model
    is tuned with ``train_model``, and one row of precision / recall / macro
    F1 is recorded per split. After the loop all rows are written to
    ``save_path`` via ``save_results``.

    Args:
        specialities: Iterable of speciality names to process, in order.
        X_normalized: 2-D feature array, one row per practitioner.
        encoded_y: Array of integer label codes aligned with the rows.
        indigo_spl_label: Mapping of speciality name -> integer code.
        param_dist: Parameter distributions forwarded to ``train_model``.
        save_path: Destination CSV path. Defaults to the file name that was
            previously hard-coded.

    Returns:
        tuple: ``(results_train, results_test)``; each is a list of rows whose
        first row is the column header.
    """
    header = ["Data Type", "Train / Test", "Speciality", "Precision 0", "Recall 0", "Precision 1", "Recall 1", "Macro Avg F1"]
    # Separate copies so the two returned tables never share a header object.
    results_test = [list(header)]
    results_train = [list(header)]

    def report_row(split_name, speciality, report):
        # One CSV row from a classification_report dict.
        return [
            "Top k CPT Wi Rank Importance", split_name, speciality,
            report['Others']['precision'], report['Others']['recall'],
            report[speciality]['precision'], report[speciality]['recall'],
            report['macro avg']['f1-score'],
        ]

    for speciality in specialities:

        # Bold green banner so each speciality stands out in the long log.
        print('\033[1m' + '\033[92m' + "Starting: " + speciality + '\033[0m' + '\033[0m')

        spl_distribution, others_distribution = get_distributions(X_normalized, encoded_y, speciality, indigo_spl_label)
        binary_encoded_y = get_binary_labels(encoded_y, speciality, indigo_spl_label)
        X_reduced_scaled = get_top_k_cpt_codes_features(spl_distribution, others_distribution, X=X_normalized)

        # The misclassification counts are printed by get_results and are not stored here.
        train_out, test_out, _, _ = train_model(X_reduced_scaled, binary_encoded_y, param_dist, speciality)
        results_train.append(report_row("Train", speciality, train_out))
        results_test.append(report_row("Test", speciality, test_out))

    save_results(results_test, results_train, save_path)

    return results_train, results_test



In [32]:
# Load the per-NPI feature vectors and scale each vector by its own total.
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
npi_cpt_features = pd.read_pickle('./chuncked_npi_ncpcs_2019_0_.pkl')
npi_normalized_cpt_features = normalize_practitioner_level_features(npi_cpt_features)
# NPI -> speciality lookup; the helper functions index it with int(npi).
npi_indigo_spl = pd.read_pickle('./npi_indigo_spl.pkl')

In [161]:
# Outlier removal is currently disabled. To enable it, uncomment the next line and pass
# its result to encode_labels instead of npi_normalized_cpt_features.
#outliers_removed_npi_normalized_cpt_features = remove_outliers(npi_normalized_cpt_features, npi_indigo_spl, outliers_path="./outliers_v2/")
# Build the feature matrix, the integer label codes, the label names and the name -> code mapping.
X_normalized, encoded_y, y, indigo_spl_label = encode_labels(npi_indigo_spl, npi_normalized_cpt_features)

In [162]:
# Keep only the specialities from the pickled list that actually occur in the labels `y`,
# preserving the order of the pickled list.
specialities_file = pd.read_pickle('./specialities_lis.pkl')
specialities_set = list(set(y))  # despite the name, this is a list of the unique labels
specialities = [spl for spl in specialities_file if spl in specialities_set]

In [163]:
# Hyper-parameter search space sampled by the randomized search in train_model.
param_dist = dict(
    n_estimators=np.arange(50, 300, 50),
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    max_depth=[6, 7, 8, 9, 10, 11, 12],
    colsample_bytree=[0.6, 0.65, 0.7, 0.75],
)

In [167]:
# Train and evaluate one binary (speciality vs. others) model per speciality; the collected
# result rows are returned here and also written to a CSV file by the function.
results_train, results_test = train_model_on_specialities(specialities, X_normalized, encoded_y, indigo_spl_label, param_dist)

[1m[92mStarting: Advanced Practice Registered Nurse (APRN)[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  742
num of others predicted as spl:  0
[1m[92mStarting: Certified Nurse Practitioner[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  4994
num of others predicted as spl:  296
[1m[92mStarting: Clinical Nurse Specialist[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


X_test results,
number of spl predicted as others:  17
num of others predicted as spl:  0
[1m[92mStarting: Nurse - Student[0m[0m


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  1357
num of others predicted as spl:  0
[1m[92mStarting: Certified Nurse Midwife (CNM)[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  31
num of others predicted as spl:  0
[1m[92mStarting: Nurse[0m[0m


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  20
num of others predicted as spl:  0
[1m[92mStarting: Anesthesiology[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  496
num of others predicted as spl:  253
[1m[92mStarting: Anesthesiology Assistant (AA)[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  70
num of others predicted as spl:  0
[1m[92mStarting: CRNA[0m[0m


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  532
num of others predicted as spl:  410
[1m[92mStarting: Cardiovascular Disease-Minor Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  143
num of others predicted as spl:  17
[1m[92mStarting: Cardiovascular Disease-No Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  229
num of others predicted as spl:  180
[1m[92mStarting: Cardiovascular Disease-Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  20
num of others predicted as spl:  0
[1m[92mStarting: Vascular-Surgery[0m[0m


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  75
num of others predicted as spl:  12
[1m[92mStarting: Dermatology-Minor Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  52
num of others predicted as spl:  0
[1m[92mStarting: Dermatology-No Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  123
num of others predicted as spl:  114
[1m[92mStarting: Neurology-Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  10
num of others predicted as spl:  0
[1m[92mStarting: Neurology-No Surgery[0m[0m


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  237
num of others predicted as spl:  1
[1m[92mStarting: Obstetrics Gynecology-Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  373
num of others predicted as spl:  159
[1m[92mStarting: Gynecology-Minor Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  21
num of others predicted as spl:  5
[1m[92mStarting: Gynecology-No Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  74
num of others predicted as spl:  0
[1m[92mStarting: Gynecology-Surgery[0m[0m


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  37
num of others predicted as spl:  2
[1m[92mStarting: General Preventive Med-No Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  49
num of others predicted as spl:  0
[1m[92mStarting: General NOC-Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  1178
num of others predicted as spl:  71
[1m[92mStarting: Orthopedic Excl Back-Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  250
num of others predicted as spl:  61
[1m[92mStarting: Orthopedic Incl Back-Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  35
num of others predicted as spl:  0
[1m[92mStarting: Ophthalmology-No Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  148
num of others predicted as spl:  76
[1m[92mStarting: Ophthalmology-Minor Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  31
num of others predicted as spl:  3
[1m[92mStarting: Ophthalmology-Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  4
num of others predicted as spl:  0
[1m[92mStarting: Otorhinolaryngology-No Surgery[0m[0m


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  28
num of others predicted as spl:  3
[1m[92mStarting: Otorhinolaryngology-Minor Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  84
num of others predicted as spl:  56
[1m[92mStarting: Otorhinolaryngology-Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  17
num of others predicted as spl:  0
[1m[92mStarting: Radiology Diagnostic-Minor Surgery[0m[0m


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  54
num of others predicted as spl:  8
[1m[92mStarting: Radiology Diagnostic-No Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  149
num of others predicted as spl:  110
[1m[92mStarting: Pediatrics-Minor Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  45
num of others predicted as spl:  1
[1m[92mStarting: Pediatrics-No Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  805
num of others predicted as spl:  263
[1m[92mStarting: Acupuncture[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  12
num of others predicted as spl:  3
[1m[92mStarting: Aerospace Medicine[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


X_test results,
number of spl predicted as others:  2
num of others predicted as spl:  0
[1m[92mStarting: Allergy[0m[0m


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  47
num of others predicted as spl:  23
[1m[92mStarting: Audiologist[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  11
num of others predicted as spl:  9
[1m[92mStarting: Bariatric-Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


X_test results,
number of spl predicted as others:  6
num of others predicted as spl:  0
[1m[92mStarting: Chiropractor[0m[0m


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  481
num of others predicted as spl:  22
[1m[92mStarting: Colon And Rectal-Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  29
num of others predicted as spl:  2
[1m[92mStarting: Counselor[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  1202
num of others predicted as spl:  101
[1m[92mStarting: Dietitian[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  18
num of others predicted as spl:  7
[1m[92mStarting: Emergency Med-No Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  435
num of others predicted as spl:  298
[1m[92mStarting: Emergency Medical Technician (EMT)[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  5
num of others predicted as spl:  0
[1m[92mStarting: Endocrinology-No Surgery[0m[0m


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  127
num of others predicted as spl:  30
[1m[92mStarting: Family Medicine-No Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  2384
num of others predicted as spl:  661
[1m[92mStarting: Forensic Medicine[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


X_test results,
number of spl predicted as others:  30
num of others predicted as spl:  0
[1m[92mStarting: Gastroenterology-No Surgery[0m[0m


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  122
num of others predicted as spl:  96
[1m[92mStarting: Geriatrics-No Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  82
num of others predicted as spl:  0
[1m[92mStarting: Hand-Surgery[0m[0m


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  49
num of others predicted as spl:  18
[1m[92mStarting: Hematology-No Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  215
num of others predicted as spl:  81
[1m[92mStarting: Hospitalists[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  330
num of others predicted as spl:  1
[1m[92mStarting: Infectious Diseases-No Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  168
num of others predicted as spl:  10
[1m[92mStarting: Internal Medicine-No Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  3339
num of others predicted as spl:  382
[1m[92mStarting: Medical Assistant[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  708
num of others predicted as spl:  0
[1m[92mStarting: Neonatology[0m[0m


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  42
num of others predicted as spl:  45
[1m[92mStarting: Nephrology-No Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  99
num of others predicted as spl:  62
[1m[92mStarting: Nuclear Medicine[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  18
num of others predicted as spl:  3
[1m[92mStarting: O.R. Technician[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


X_test results,
number of spl predicted as others:  12
num of others predicted as spl:  0
[1m[92mStarting: Occupation Therapist[0m[0m


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  104
num of others predicted as spl:  27
[1m[92mStarting: Occupational Medicine[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


X_test results,
number of spl predicted as others:  20
num of others predicted as spl:  0
[1m[92mStarting: Optometrist[0m[0m


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  131
num of others predicted as spl:  89
[1m[92mStarting: Pain Medicine[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  114
num of others predicted as spl:  25
[1m[92mStarting: Pathology-No Surgery[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits




Best parameters found:  {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
X_train results,
X_test results,
number of spl predicted as others:  67
num of others predicted as spl:  46
[1m[92mStarting: Pharmacist[0m[0m
Fitting 2 folds for each of 10 candidates, totalling 20 fits


In [None]:
# Peek at the first entry of `specialities`; the slice keeps the result a
# one-element sequence rather than a bare item.
specialities[:1]

['Advanced Practice Registered Nurse (APRN)']

In [83]:
# Display the distinct values present in `y`.
# NOTE(review): presumably `y` holds one specialty label per practitioner, as
# collected by encode_labels — confirm against the cell that assigns it.
set(y)

{'Acupuncture',
 'Advanced Practice Registered Nurse (APRN)',
 'Aerospace Medicine',
 'Allergy',
 'Anesthesiology',
 'Anesthesiology Assistant (AA)',
 'Audiologist',
 'Bariatric-Surgery',
 'CRNA',
 'Cardiovascular Disease-Minor Surgery',
 'Cardiovascular Disease-No Surgery',
 'Cardiovascular Disease-Surgery',
 'Certified Nurse Midwife (CNM)',
 'Certified Nurse Practitioner',
 'Chiropractor',
 'Clinical Nurse Specialist',
 'Colon And Rectal-Surgery',
 'Counselor',
 'Dermatology-Minor Surgery',
 'Dermatology-No Surgery',
 'Dietitian',
 'Emergency Med-No Surgery',
 'Emergency Medical Technician (EMT)',
 'Endocrinology-No Surgery',
 'Family Medicine-No Surgery',
 'Forensic Medicine',
 'Gastroenterology-No Surgery',
 'General NOC-Surgery',
 'General Preventive Med-No Surgery',
 'Geriatrics-No Surgery',
 'Gynecology-Minor Surgery',
 'Gynecology-No Surgery',
 'Gynecology-Surgery',
 'Hand-Surgery',
 'Hematology-No Surgery',
 'Hospitalists',
 'Infectious Diseases-No Surgery',
 'Internal Medicin

In [84]:
# Count how many distinct values occur in `y`.
len(set(y))

89

In [80]:
# Dimensions of the feature matrix. `.shape` is not available on a plain list.
# NOTE(review): encode_labels accumulates X_normalized as a list, so this
# presumably runs after a conversion to an array — confirm.
X_normalized.shape

(123166, 17477)

In [12]:
# Spot-check one practitioner: load the raw feature vector stored under this
# NPI (string key) and print only its strictly positive entries.
test = np.array(npi_cpt_features["1003174947"])
print(np.extract(test > 0, test))

[227  33 158  31  11   6   1 196   3   4   3   1]


In [13]:
# Same spot-check on the normalized features: print only the strictly positive
# entries of the vector stored under this NPI (string key).
print(
    np.extract(
        npi_normalized_cpt_features["1003174947"] > 0,
        npi_normalized_cpt_features["1003174947"],
    )
)

[0.33679525 0.04896142 0.23442136 0.04599407 0.01632047 0.00890208
 0.00148368 0.29080119 0.00445104 0.00593472 0.00445104 0.00148368]


In [16]:
# Look up the specialty recorded for the sample practitioner. The key is an
# int here: npi_indigo_spl is indexed by integer NPI, whereas the feature
# dictionaries in this notebook are indexed by the NPI as a string.
npi_indigo_spl[1003174947]

'General NOC-Surgery'

In [28]:
# Print the number of encoded labels and the number of feature rows on
# consecutive lines so the two counts can be compared by eye.
print(len(encoded_y))
print(len(X_normalized))

153954
153954


In [24]:
# Display the distinct values present in `encoded_y`.
# NOTE(review): presumably these are the integer codes assigned per specialty
# by encode_labels — confirm against the cell that assigns encoded_y.
set(encoded_y)

{0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88}