In [32]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
     -------------------------------------- 235.6/235.6 kB 3.6 MB/s eta 0:00:00
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.11.0


In [1]:
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, make_scorer, classification_report
import pandas as pd
import numpy as np
import random

In [2]:
def preprocessing_1(X):
    X_normalized = []
    for i in range(len(X)):
        X_normalized.append(np.array(X[i])/sum(X[i]))
    return X_normalized

In [3]:
def save_pickle(file_name, file_path):
    with open(file_path, 'wb') as fp:
        pickle.dump(file_name, fp)

In [4]:
def remove_nan(npi_indigo_spl):
    npi_indigo_spl_non_nan = {}
    for npi in npi_indigo_spl:
        if isinstance(npi_indigo_spl[npi], str):
            npi_indigo_spl_non_nan[npi] = npi_indigo_spl[npi]
    return npi_indigo_spl_non_nan

In [5]:
def create_dataset(npi_distributions, npi_indigo_spl):
    dataset = {'NPI':[], 'features':[], 'labels':[]}
    for npi in npi_distributions:
        if int(npi) in npi_indigo_spl:
            dataset['NPI'].append(npi)
            dataset['features'].append(npi_distributions[npi])
            dataset['labels'].append(npi_indigo_spl[int(npi)])
    return dataset

In [6]:
def get_less_than_threshold_categories(y, threshold):
    labels_count = {}
    for lab in y:
        if lab in labels_count:
            labels_count[lab] += 1
        else:
            labels_count[lab] = 1

    useless = []
    for lab in labels_count:
        if labels_count[lab] < threshold:
            useless.append(lab)
    return useless

In [7]:
def remove_useless(useless, y, X):
    for i in reversed(range(len(y))):
        if y[i] in useless:
            del y[i]
            del X[i]
    return X, y

In [8]:
#def encode_labels(y):
#    label_encoder = LabelEncoder()
#    return label_encoder.fit_transform(y), label_encoder

In [9]:
def sanity_check(y_train):
    check = set(y_train)

    print('Max value in input: ' +  str(max(check)))
    print('"length-1" of set of input: ' + str(len(check)-1))
    assert max(check) == (len(check)-1)

In [10]:
def remove_from_y(X_train_, y_train_, remove_list):
    
    X_train_rm = []
    y_train_rm = []

    for i in range(len(X_train_)):
        if y_train_[i] not in remove_list:
            X_train_rm.append(X_train_[i])
            y_train_rm.append(y_train_[i])
    return X_train_rm, y_train_rm

In [11]:
def check_dataset_distribution(y_train, k):
    dataset_distribution = {}
    total = 0
    for i in range(len(y_train)):
        if y_train[i] not in dataset_distribution:
            dataset_distribution[y_train[i]] = 1
        else:
            dataset_distribution[y_train[i]] += 1
        total += 1
    
    print(dataset_distribution)
    remove_list = []
    for key in dataset_distribution:
        if dataset_distribution[key] < k:
            remove_list.append(key)
    
    print(list(dataset_distribution.values()))
    print(max(list(dataset_distribution.values())))
    print(min(list(dataset_distribution.values())))
    print(np.std(list(dataset_distribution.values())))
    return remove_list

In [12]:
def encode_labels_2way(y):
    encoded_y = []
    for l in y:
        if '-Surgery' in l or '-Minor' in l:
            encoded_y.append(0)
        else:
            encoded_y.append(1)
    return encoded_y

In [13]:
def encode_labels_2way_custom(y, label):
    encoded_y = []
    for l in y:
        if label in l:
            encoded_y.append(0)
        else:
            encoded_y.append(1)
    return encoded_y

In [14]:
def overpopulate(X, y, op_multiplier, op_ratio_retained):
    X_oped = []
    y_oped = []

    for i in range(len(y)):
        if y[i] == 0:
            for _ in range(op_multiplier):
                X_oped.append(X[i])
                y_oped.append(y[i])
        elif random.uniform(0, 1) < op_ratio_retained:
            X_oped.append(X[i])
            y_oped.append(y[i])
    
    return X_oped, y_oped

In [15]:
def get_top_k_cpts_with_multipliers(normalized_distributions, k):
    
    all_cpt_pos = {}
    for i in range(len(normalized_distributions)):
        cur_top_agg = normalized_distributions[i].argsort()[-k:][::-1]
        for pos in cur_top_agg:
            if pos not in all_cpt_pos:
                all_cpt_pos[pos] = 0
            all_cpt_pos[pos] += 1
                
    for cpt_pos in all_cpt_pos:
        all_cpt_pos[cpt_pos] = np.log(len(normalized_distributions)/all_cpt_pos[cpt_pos])
    
    return all_cpt_pos

In [48]:
def get_top_k_cpts(normalized_distributions, k, tfidf = True):
    
    all_cpt_pos = {}
    # print("Printing CPT codes with speciality")
    for i in range(len(normalized_distributions)):
        cur_top_agg = normalized_distributions[i].argsort()[-k:][::-1]
        for pos in cur_top_agg:
            if pos not in all_cpt_pos:
                all_cpt_pos[pos] = 0
            all_cpt_pos[pos] += 1

    neurology_surgery_add_on = {
        '61000': 1,
        '61020': 1,
        '61050': 1,
        '61070': 1,
        # '61100': 1,
        '61120': 1,
        '61140': 1,
        '61150': 1,
        '61210': 1,
        # '61300': 1,
        # '61310': 1,
        '61320': 1,
        '15980': 1,
        '16160': 1,
        '16190': 1,
        '16420': 1,
        '16510': 1,
        '61680': 1,
        '61690': 1,
        '61700': 1,
        '61720': 1,
        '61730': 1,
        '61750': 1,
        '61760': 1,
        '61770': 1,
        '61780': 1,
        '61790': 1,
        '61800': 1,
        '61850': 1,
        '61860': 1,
        '61880': 1,
        '62000': 1,
        '62010': 1,
        '62100': 1,
        '62110': 1,
        '62120': 1,
        '62140': 1,
        '61000': 1,
        '61001': 1,
        '61020': 1,
        '61026': 1,
        '61050': 1,
        '61055': 1,
        '61070': 1,
        '61105': 1,
        '61107': 1,
        '61108': 1,
        '61120': 1,
        '61140': 1,
        '61150': 1,
        '61151': 1,
        '61154': 1,
        '61156': 1,
        '61210': 1,
        '61304': 1,
        '61305': 1,
        '61312': 1,
        '61313': 1,
        '61314': 1,
        '61315': 1,
        '61316': 1,
        '61320': 1,
        '61321': 1,
        '61322': 1,
        '1598/': 1,
        '1616/': 1,
        '1619/': 1,
        '1642/': 1,
        '1651/': 1,
        '61680': 1,
        '61682': 1,
        '61684': 1,
        '61686': 1,
        '61690': 1,
        '61692': 1,
        '61697': 1,
        '61698': 1,
        '61700': 1,
        '61702': 1,
        '61720': 1,
        '61735': 1,
        '61736': 1,
        '61737': 1,
        '61750': 1,
        '61751': 1,
        '61760': 1,
        '61770': 1,
        '61781': 1,
        '61782': 1,
        '61796': 1,
        '61797': 1,
        '61798': 1,
        '61799': 1,
        '61800': 1,
        '61850': 1,
        '61860': 1,
        '61863': 1,
        '61864': 1,
        '61867': 1,
        '61868': 1,
        '61880': 1,
        '61885': 1,
        '61886': 1,
        '61888': 1,
        '62000': 1,
        '62005': 1,
        '62010': 1,
        '62100': 1,
        '62115': 1,
        '62117': 1,
        '62120': 1,
        '62121': 1,
        '62140': 1,
        '62141': 1
        }
    cpt_codes_2019 = pd.read_pickle("./grouped_hcpcs_codes2019.pkl")
    neurology_surgery_add_on_pos = {}
    for code in neurology_surgery_add_on:
        if code in cpt_codes_2019:
            neurology_surgery_add_on_pos[cpt_codes_2019.index(code)] = 1
        else:
            print(code)
    all_cpt_pos = all_cpt_pos | neurology_surgery_add_on_pos


    single = [code for code in all_cpt_pos if all_cpt_pos[code] == 1]; double = [code for code in all_cpt_pos if all_cpt_pos[code] == 2]
    print(single)
    print(double)
    print(len(double) + len(single))

    

    if tfidf:
        for cpt_pos in all_cpt_pos:
            all_cpt_pos[cpt_pos] = np.log(len(normalized_distributions)/all_cpt_pos[cpt_pos])

        
    return all_cpt_pos

In [17]:
def get_idf_vector(all_cpt_pos, num_featurs):
    idf_vector = []
    for i in range(num_featurs):
        if i in all_cpt_pos:
            idf_vector.append(all_cpt_pos[i])
        else:
            idf_vector.append(0)

    return idf_vector

In [18]:
def normalize(X):
    for i in range(len(X)):
        X[i] = X[i]/sum(X[i])
    return X

In [19]:
def get_distributions(X, y):
    distributions = [np.array(len(X[0])*[0.0]) for _ in range(len(set(y)))]
    for i in range(len(X)):
        # if i%10000 == 0:
        #     print("Done: " + str(i))
        distributions[y[i]] += X[i]
    
    distributions[1] += distributions[0]
    
    distributions = normalize(distributions)

    return distributions

In [20]:
def get_tfidf(tf_matrix, idf_vector):
    tfidfs = []
    for tf in tf_matrix:
        tfidfs.append(np.multiply(tf, idf_vector[0]))
    return tfidfs

In [21]:
def reduce_dimention(all_cpt_idfs, X_normalized):
    cpt_pos = list(all_cpt_idfs.keys())
    #print(cpt_pos)

    X_reduced = []

    for distribution in X_normalized:
        temp = []
        for pos in cpt_pos:
            if all_cpt_idfs[pos] >= 0.001:
                temp.append(distribution[pos])
        X_reduced.append(temp)
    
    return X_reduced

In [22]:
npi_features_p1 = pd.read_pickle('./chuncked_npi_ncpcs_2019_0_.pkl')
npi_features_adder = pd.read_pickle('./concerned_specialities_npi_features9.pkl')

In [23]:
npi_features = npi_features_p1 | npi_features_adder

In [24]:
npi_indigo_spl = pd.read_pickle('./npi_indigo_spl.pkl')
npi_indigo_spl_non_nan = remove_nan(npi_indigo_spl)

In [25]:
dataset = create_dataset(npi_features, npi_indigo_spl_non_nan)

In [26]:
X = dataset['features']
y = dataset['labels']

In [27]:
print(len(X))
print(len(y))

192345
192345


Note: Maybe try a binary model of seperating surgical and non-surgical doctors and then create further indigo speciality distinctions for each of these 2 classes.

Note: Create a binary model each indigo speciality (True / False). Kind of a multi-label classification model.

In [28]:
def get_results(random_search, results_test, results_train, X_val, y_val, X_train, y_train, spl, k):
    predicted_probabilities = random_search.best_estimator_.predict_proba(X_val)
    predictions_test = np.argmax(predicted_probabilities, axis=1)

    predicted_probabilities = random_search.best_estimator_.predict_proba(X_train)
    predictions_train = np.argmax(predicted_probabilities, axis=1)

    test_out = classification_report(y_val, predictions_test, target_names=[spl, 'Others'], output_dict=True)
    train_out = classification_report(y_train, predictions_train, target_names=[spl, 'Others'], output_dict=True)

    results_train.append(["CPT top " + str(k) + " codes", "Train", spl, train_out['Others']['precision'], train_out['Others']['recall'], train_out[spl]['precision'], train_out[spl]['recall'], train_out['macro avg']['f1-score'], len(y_train[y_train==0]), len(y_train[y_train==1]), random_search.best_params_])
    results_test.append(["CPT top " + str(k) + " codes", "Test", spl, test_out['Others']['precision'], test_out['Others']['recall'], test_out[spl]['precision'], test_out[spl]['recall'], test_out['macro avg']['f1-score'], len(y_val[y_val==0]), len(y_val[y_val==1]), random_search.best_params_])

    return results_train, results_test

In [51]:
def spit_train_test_split(X, y, speciality, param_dist, results_test, results_train, k=25, tfidf=True, runs_per_speciality_per_k=3, majority_class_data_split=0.1, minority_class_op_coef=1, test_size=0.3, random_state=897):
    encoded_y = encode_labels_2way_custom(y, speciality)
    encoded_y_np = np.array(encoded_y)
    # inner_temp_var = 0.2

    if len(encoded_y_np[encoded_y_np==0]) > 0:

        if len(encoded_y_np[encoded_y_np==0]) > 1500:
            runs_per_speciality_per_k = 1
        
        for _ in range(runs_per_speciality_per_k):

            X_oped, y_oped = overpopulate(X, encoded_y, minority_class_op_coef, majority_class_data_split)
            X_oped_normalized = preprocessing_1(X_oped)

            normalized_class_distributions = get_distributions(X_oped_normalized, y_oped)
            all_cpt_pos_idf = get_top_k_cpts(normalized_class_distributions, k, tfidf)
            idf_vector = get_idf_vector(all_cpt_pos_idf, len(X_oped_normalized[0]))

            X_oped_normalized_reduced = reduce_dimention(all_cpt_pos_idf, X_oped_normalized)
            # print(np.array(X_oped_normalized_reduced).shape)
            idf_vector_reduced = reduce_dimention(all_cpt_pos_idf, [idf_vector])
            # print(idf_vector_reduced)

            X_oped_normalized_reduced_tfidfs = get_tfidf(X_oped_normalized_reduced, idf_vector_reduced)
            X_oped_normalized_reduced_tfidfs = np.array(X_oped_normalized_reduced_tfidfs)
            
            X_train, X_val, y_train, y_val = train_test_split(X_oped_normalized_reduced_tfidfs, y_oped, test_size=test_size, random_state=random_state, stratify=y_oped, shuffle=True)
            # X_train, y_train = overpopulate(X_train, y_train, minority_class_op_coef, (1/inner_temp_var)*majority_class_data_split)
            print("Original dataset shape:", Counter(y_train))
            sm = SMOTE(random_state=42, sampling_strategy=0.1)
            X_train, y_train = sm.fit_resample(X_train, y_train)
            print("Original dataset shape:", Counter(y_train))
            X_train = np.array(X_train)
            y_train = np.array(y_train)

            # print(X_train.shape)
            y_train = np.array(y_train)
            # print(len(y_train[y_train==0]), len(y_train[y_train==1]))
            y_val = np.array(y_val)
            # print(len(y_val[y_val==0]), len(y_val[y_val==1]))

            model = xgb.XGBClassifier(objective='multi:softprob', num_class=len(set(y_train)), tree_method='gpu_hist', gpu_id=0)
            random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=4, scoring='accuracy', n_jobs=-1, cv=2, verbose=3, random_state=53)
            random_search.fit(X_train, y_train)
            # print("Best Estimator: ", random_search.best_params_)
            results_train, results_test = get_results(random_search, results_test, results_train, X_val, y_val, X_train, y_train, speciality, k)
            model = None
            random_search = None
        results_train.append(len(results_train[0])*[""])
        results_test.append(len(results_test[0])*[""])
        # print('\033[1m' + '\033[92m' + "Done: " + speciality + '\033[0m' + '\033[0m')
    else:
        print('\033[1m' + '\033[91m' + speciality + " not in CPT top k codes" + '\033[0m' + '\033[0m')
    
    return results_train, results_test


In [52]:
specialities_lis = ['Neurology-Surgery',]

In [53]:
param_dist = {
    'n_estimators': np.arange(50, 300, 50),
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [6, 7, 8, 9],
    'colsample_bytree': [0.6, 0.65, 0.7, 0.75],
    'reg_lambda': [0.1, 0.15, 0.2, 0.25],
}

# k_range = np.arange(50, 700, 100)
k_range = [12, 25]
save_points_every = 5
speciality_pos = 11

results_test = [["Data Type", "Train / Test", "Speciality", "Precision 0", "Recall 0", "Precision 1", "Recall 1", "Macro Avg F1", "Number of speciality Data Points", "Number of others Data Points", "Best Estimator"]]
results_train = [["Data Type", "Train / Test", "Speciality", "Precision 0", "Recall 0", "Precision 1", "Recall 1", "Macro Avg F1", "Number of speciality Data Points", "Number of others Data Points", "Best Estimator"]]
for spl in specialities_lis:
    for k in k_range:
        results_train, results_test = spit_train_test_split(X, y, spl, param_dist, results_test, results_train, k, majority_class_data_split=0.3, minority_class_op_coef=1)
    df = pd.DataFrame(results_test + results_train)
    print('\033[1m' + '\033[92m' + "Done: " + spl + '\033[0m' + '\033[0m')


15980
16160
16190
16420
16510
61730
61780
62110
1598/
1616/
1619/
1642/
1651/
61736
61737
62115
[12571, 12570, 12572, 6098, 6084, 12573, 3231, 239, 407, 238, 2450, 6080, 7393, 2451, 3947, 3948, 4281, 3949, 4120, 4035, 4047, 3952, 4049, 4155, 4269, 4229, 4351, 4052, 4053, 4313, 4211, 4105, 4350, 4186, 4169, 4171, 3970, 3971, 3972, 3973, 4273, 4200, 4147, 4236, 3950, 4173, 4316, 3951, 4048, 3954, 4258, 3869, 4036, 4259, 4037, 4264, 4252, 4050, 4304, 4353, 4195, 4340, 4202, 4354, 4355, 4185, 3967, 3968, 3930, 4101, 4102, 4103, 4104, 4224, 4225, 4131, 4132, 4203, 4133, 4134, 4187, 4387, 4243, 3974]
[6073, 6075, 14462, 14456, 14461]
89
Original dataset shape: Counter({1: 40522, 0: 213})
Original dataset shape: Counter({1: 40522, 0: 4052})
Fitting 2 folds for each of 4 candidates, totalling 8 fits




15980
16160
16190
16420
16510
61730
61780
62110
1598/
1616/
1619/
1642/
1651/
61736
61737
62115
[12571, 12570, 12572, 6098, 6084, 12573, 3231, 239, 407, 2450, 238, 6080, 7393, 2451, 3947, 3948, 4281, 3949, 4120, 4035, 4047, 3952, 4049, 4155, 4269, 4229, 4351, 4052, 4053, 4313, 4211, 4105, 4350, 4186, 4169, 4171, 3970, 3971, 3972, 3973, 4273, 4200, 4147, 4236, 3950, 4173, 4316, 3951, 4048, 3954, 4258, 3869, 4036, 4259, 4037, 4264, 4252, 4050, 4304, 4353, 4195, 4340, 4202, 4354, 4355, 4185, 3967, 3968, 3930, 4101, 4102, 4103, 4104, 4224, 4225, 4131, 4132, 4203, 4133, 4134, 4187, 4387, 4243, 3974]
[6073, 6075, 14462, 14456, 14461]
89
Original dataset shape: Counter({1: 40111, 0: 213})
Original dataset shape: Counter({1: 40111, 0: 4011})
Fitting 2 folds for each of 4 candidates, totalling 8 fits




15980
16160
16190
16420
16510
61730
61780
62110
1598/
1616/
1619/
1642/
1651/
61736
61737
62115
[12571, 12570, 12572, 6098, 6084, 12573, 3231, 239, 407, 2450, 238, 6080, 2451, 7393, 3947, 3948, 4281, 3949, 4120, 4035, 4047, 3952, 4049, 4155, 4269, 4229, 4351, 4052, 4053, 4313, 4211, 4105, 4350, 4186, 4169, 4171, 3970, 3971, 3972, 3973, 4273, 4200, 4147, 4236, 3950, 4173, 4316, 3951, 4048, 3954, 4258, 3869, 4036, 4259, 4037, 4264, 4252, 4050, 4304, 4353, 4195, 4340, 4202, 4354, 4355, 4185, 3967, 3968, 3930, 4101, 4102, 4103, 4104, 4224, 4225, 4131, 4132, 4203, 4133, 4134, 4187, 4387, 4243, 3974]
[6073, 6075, 14462, 14456, 14461]
89
Original dataset shape: Counter({1: 40111, 0: 213})
Original dataset shape: Counter({1: 40111, 0: 4011})
Fitting 2 folds for each of 4 candidates, totalling 8 fits




15980
16160
16190
16420
16510
61730
61780
62110
1598/
1616/
1619/
1642/
1651/
61736
61737
62115
[12570, 12572, 6098, 6084, 12573, 3231, 14532, 7408, 6083, 14464, 7496, 6105, 14484, 239, 407, 238, 2451, 7393, 12084, 13927, 236, 7392, 191, 189, 14896, 190, 3947, 3948, 4281, 3949, 4120, 4035, 4047, 3952, 4049, 4155, 4269, 4229, 4351, 4052, 4053, 4313, 4211, 4105, 4350, 4186, 4169, 4171, 3970, 3971, 3972, 3973, 4273, 4200, 4147, 4236, 3950, 4173, 4316, 3951, 4048, 3954, 4258, 3869, 4036, 4259, 4037, 4264, 4252, 4050, 4304, 4353, 4195, 4340, 4202, 4354, 4355, 4185, 3967, 3968, 3930, 4101, 4102, 4103, 4104, 4224, 4225, 4131, 4132, 4203, 4133, 4134, 4187, 4387, 4243, 3974]
[12571, 6073, 6075, 14462, 14456, 14461, 6074, 6080, 7394, 2450, 14460, 14457]
108
Original dataset shape: Counter({1: 40456, 0: 213})
Original dataset shape: Counter({1: 40456, 0: 4045})
Fitting 2 folds for each of 4 candidates, totalling 8 fits




15980
16160
16190
16420
16510
61730
61780
62110
1598/
1616/
1619/
1642/
1651/
61736
61737
62115
[12570, 12572, 6098, 6084, 12573, 3231, 14532, 7408, 6083, 14464, 7496, 6105, 14484, 239, 407, 238, 2451, 7393, 12084, 13927, 236, 7392, 191, 189, 14896, 12086, 3947, 3948, 4281, 3949, 4120, 4035, 4047, 3952, 4049, 4155, 4269, 4229, 4351, 4052, 4053, 4313, 4211, 4105, 4350, 4186, 4169, 4171, 3970, 3971, 3972, 3973, 4273, 4200, 4147, 4236, 3950, 4173, 4316, 3951, 4048, 3954, 4258, 3869, 4036, 4259, 4037, 4264, 4252, 4050, 4304, 4353, 4195, 4340, 4202, 4354, 4355, 4185, 3967, 3968, 3930, 4101, 4102, 4103, 4104, 4224, 4225, 4131, 4132, 4203, 4133, 4134, 4187, 4387, 4243, 3974]
[12571, 6073, 6075, 14462, 14456, 14461, 6074, 6080, 7394, 2450, 14460, 14457]
108
Original dataset shape: Counter({1: 40401, 0: 213})
Original dataset shape: Counter({1: 40401, 0: 4040})
Fitting 2 folds for each of 4 candidates, totalling 8 fits




15980
16160
16190
16420
16510
61730
61780
62110
1598/
1616/
1619/
1642/
1651/
61736
61737
62115
[12570, 12572, 6098, 6084, 12573, 3231, 14532, 7408, 6083, 14464, 7496, 6105, 14484, 239, 407, 238, 2451, 7393, 12084, 13927, 236, 7392, 191, 189, 14896, 190, 3947, 3948, 4281, 3949, 4120, 4035, 4047, 3952, 4049, 4155, 4269, 4229, 4351, 4052, 4053, 4313, 4211, 4105, 4350, 4186, 4169, 4171, 3970, 3971, 3972, 3973, 4273, 4200, 4147, 4236, 3950, 4173, 4316, 3951, 4048, 3954, 4258, 3869, 4036, 4259, 4037, 4264, 4252, 4050, 4304, 4353, 4195, 4340, 4202, 4354, 4355, 4185, 3967, 3968, 3930, 4101, 4102, 4103, 4104, 4224, 4225, 4131, 4132, 4203, 4133, 4134, 4187, 4387, 4243, 3974]
[12571, 6073, 6075, 14462, 14456, 14461, 6074, 6080, 7394, 2450, 14460, 14457]
108
Original dataset shape: Counter({1: 40081, 0: 213})
Original dataset shape: Counter({1: 40081, 0: 4008})
Fitting 2 folds for each of 4 candidates, totalling 8 fits




[1m[92mDone: Neurology-Surgery[0m[0m


In [50]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Data Type,Train / Test,Speciality,Precision 0,Recall 0,Precision 1,Recall 1,Macro Avg F1,Number of speciality Data Points,Number of others Data Points,Best Estimator
1,CPT top 12 codes,Test,Neurology-Surgery,0.994945,0.999019,0.15,0.032967,0.525516,91,17338,"{'reg_lambda': 0.15, 'n_estimators': 250, 'max..."
2,CPT top 12 codes,Test,Neurology-Surgery,0.995025,0.999129,0.25,0.054945,0.543581,91,17217,"{'reg_lambda': 0.15, 'n_estimators': 250, 'max..."
3,CPT top 12 codes,Test,Neurology-Surgery,0.994913,0.998897,0.136364,0.032967,0.524999,91,17231,"{'reg_lambda': 0.15, 'n_estimators': 250, 'max..."
4,,,,,,,,,,,
5,CPT top 25 codes,Test,Neurology-Surgery,0.994943,0.998962,0.142857,0.032967,0.52526,91,17333,"{'reg_lambda': 0.15, 'n_estimators': 250, 'max..."
6,CPT top 25 codes,Test,Neurology-Surgery,0.995044,0.999132,0.25,0.054945,0.543587,91,17281,"{'reg_lambda': 0.15, 'n_estimators': 250, 'max..."
7,CPT top 25 codes,Test,Neurology-Surgery,0.994945,0.999365,0.214286,0.032967,0.527147,91,17332,"{'reg_lambda': 0.15, 'n_estimators': 250, 'max..."
8,,,,,,,,,,,
9,Data Type,Train / Test,Speciality,Precision 0,Recall 0,Precision 1,Recall 1,Macro Avg F1,Number of speciality Data Points,Number of others Data Points,Best Estimator


In [54]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Data Type,Train / Test,Speciality,Precision 0,Recall 0,Precision 1,Recall 1,Macro Avg F1,Number of speciality Data Points,Number of others Data Points,Best Estimator
1,CPT top 12 codes,Test,Neurology-Surgery,0.996699,0.991133,0.180851,0.373626,0.618818,91,17367,"{'reg_lambda': 0.15, 'n_estimators': 250, 'max..."
2,CPT top 12 codes,Test,Neurology-Surgery,0.996498,0.992962,0.203947,0.340659,0.624935,91,17192,"{'reg_lambda': 0.15, 'n_estimators': 250, 'max..."
3,CPT top 12 codes,Test,Neurology-Surgery,0.996666,0.991159,0.182796,0.373626,0.619696,91,17192,"{'reg_lambda': 0.15, 'n_estimators': 250, 'max..."
4,,,,,,,,,,,
5,CPT top 25 codes,Test,Neurology-Surgery,0.996409,0.992272,0.177914,0.318681,0.611341,91,17339,"{'reg_lambda': 0.15, 'n_estimators': 250, 'max..."
6,CPT top 25 codes,Test,Neurology-Surgery,0.996696,0.993012,0.219355,0.373626,0.635637,91,17315,"{'reg_lambda': 0.15, 'n_estimators': 250, 'max..."
7,CPT top 25 codes,Test,Neurology-Surgery,0.99644,0.994062,0.227273,0.32967,0.632154,91,17178,"{'reg_lambda': 0.15, 'n_estimators': 250, 'max..."
8,,,,,,,,,,,
9,Data Type,Train / Test,Speciality,Precision 0,Recall 0,Precision 1,Recall 1,Macro Avg F1,Number of speciality Data Points,Number of others Data Points,Best Estimator


In [33]:
cpt_codes_2019 = pd.read_pickle("./grouped_hcpcs_codes2019.pkl")
singles = [12571, 12570, 12572, 6098, 6084, 12573, 3231, 239, 407, 2450, 238, 6080, 7393, 2451]
singles_code = []
doubles = [6073, 6075, 14462, 14456, 14461]
doubles_code = []
for pos in doubles:
    doubles_code.append(cpt_codes_2019[pos])
for pos in singles:
    singles_code.append(cpt_codes_2019[pos])

In [34]:
singles_code

['98941',
 '98940',
 '98943',
 '97012',
 '97014',
 '98942',
 '95951',
 '90837',
 '36415',
 '99232',
 '90834',
 '97530',
 'G0299',
 '99233']

'98941': Chiropractic Manipulative Treatment Procedures,

'98940': Chiropractic Manipulative Treatment Procedures,

'98943': Chiropractic Manipulative Treatment Procedures,

'98942': Chiropractic Manipulative Treatment Procedures,

'97012': Physical Medicine and Rehabilitation Evaluations,

'97014': Physical Medicine and Rehabilitation Evaluations,

'97530': Physical Medicine and Rehabilitation Evaluations,

'95951': Neurology and Neuromuscular Procedures,

'90837': Psychiatry Services and Procedures,

'90834': Psychiatry Services and Procedures,

'36415': Venous Procedures (Vascular Introduction and Injection Procedures),

'99232': Hospital Inpatient and Observation Care Services,

'99233': Hospital Inpatient and Observation Care Services,

'G0299',

In [35]:
doubles_code

['97110', '97140', '99214', '99203', '99213']

'97110', 

'97140', 

'99214', 

'99203', 

'99213'