In [6]:
!pip3 install tensorflow

Collecting tensorflow
  Using cached tensorflow-2.13.0-cp310-cp310-win_amd64.whl (1.9 kB)
Collecting tensorflow-intel==2.13.0 (from tensorflow)
  Using cached tensorflow_intel-2.13.0-cp310-cp310-win_amd64.whl (276.5 MB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.13.0->tensorflow)
  Using cached absl_py-1.4.0-py3-none-any.whl (126 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.13.0->tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow-intel==2.13.0->tensorflow)
  Using cached gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.13.0->tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting h5py>=2.9.0 (from tensorflow-intel==2.13.0->tensorflow)
  Using cached h5py-3.9.0-cp310-cp310-win_amd64.whl (2.7 MB)
Collecting opt-einsum>=2.3.2 (from tensorflow-intel==2.13.0->tensorflow)
  Using cached opt_einsum-3.3.0-py3-none-any.whl (65 kB)
Co

In [8]:
!nvidia-smi

Mon Aug  7 23:55:57 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 528.90       Driver Version: 528.90       CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A550... WDDM  | 00000000:01:00.0  On |                  Off |
| N/A   44C    P3    29W /  55W |    544MiB / 16384MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [26]:
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, make_scorer, classification_report
import pandas as pd
import numpy as np
import random

In [27]:
def preprocessing_1(X):
    X_normalized = []
    for i in range(len(X)):
        X_normalized.append(np.array(X[i])/sum(X[i]))
    return X_normalized

In [28]:
def save_pickle(file_name, file_path):
    with open(file_path, 'wb') as fp:
        pickle.dump(file_name, fp)

In [29]:
def remove_nan(npi_indigo_spl):
    npi_indigo_spl_non_nan = {}
    for npi in npi_indigo_spl:
        if isinstance(npi_indigo_spl[npi], str):
            npi_indigo_spl_non_nan[npi] = npi_indigo_spl[npi]
    return npi_indigo_spl_non_nan

In [30]:
def create_dataset(npi_distributions, npi_indigo_spl):
    dataset = {'NPI':[], 'features':[], 'labels':[]}
    for npi in npi_distributions:
        if int(npi) in npi_indigo_spl:
            dataset['NPI'].append(npi)
            dataset['features'].append(npi_distributions[npi])
            dataset['labels'].append(npi_indigo_spl[int(npi)])
    return dataset

In [31]:
def get_less_than_threshold_categories(y, threshold):
    labels_count = {}
    for lab in y:
        if lab in labels_count:
            labels_count[lab] += 1
        else:
            labels_count[lab] = 1

    useless = []
    for lab in labels_count:
        if labels_count[lab] < threshold:
            useless.append(lab)
    return useless

In [32]:
def remove_useless(useless, y, X):
    for i in reversed(range(len(y))):
        if y[i] in useless:
            del y[i]
            del X[i]
    return X, y

In [33]:
#def encode_labels(y):
#    label_encoder = LabelEncoder()
#    return label_encoder.fit_transform(y), label_encoder

In [34]:
def sanity_check(y_train):
    check = set(y_train)

    print('Max value in input: ' +  str(max(check)))
    print('"length-1" of set of input: ' + str(len(check)-1))
    assert max(check) == (len(check)-1)

In [35]:
def remove_from_y(X_train_, y_train_, remove_list):
    
    X_train_rm = []
    y_train_rm = []

    for i in range(len(X_train_)):
        if y_train_[i] not in remove_list:
            X_train_rm.append(X_train_[i])
            y_train_rm.append(y_train_[i])
    return X_train_rm, y_train_rm

In [36]:
def check_dataset_distribution(y_train, k):
    dataset_distribution = {}
    total = 0
    for i in range(len(y_train)):
        if y_train[i] not in dataset_distribution:
            dataset_distribution[y_train[i]] = 1
        else:
            dataset_distribution[y_train[i]] += 1
        total += 1
    
    print(dataset_distribution)
    remove_list = []
    for key in dataset_distribution:
        if dataset_distribution[key] < k:
            remove_list.append(key)
    
    print(list(dataset_distribution.values()))
    print(max(list(dataset_distribution.values())))
    print(min(list(dataset_distribution.values())))
    print(np.std(list(dataset_distribution.values())))
    return remove_list

In [37]:
def encode_labels_2way(y):
    encoded_y = []
    for l in y:
        if '-Surgery' in l or '-Minor' in l:
            encoded_y.append(0)
        else:
            encoded_y.append(1)
    return encoded_y

In [38]:
def encode_labels_2way_custom(y, label):
    encoded_y = []
    for l in y:
        if label in l:
            encoded_y.append(0)
        else:
            encoded_y.append(1)
    return encoded_y

In [39]:
def overpopulate(X, y, op_multiplier, op_ratio_retained):
    X_oped = []
    y_oped = []

    for i in range(len(y)):
        if y[i] == 0:
            for j in range(op_multiplier):
                X_oped.append(X[i])
                y_oped.append(y[i])
        elif random.uniform(0, 1) < op_ratio_retained:
            X_oped.append(X[i])
            y_oped.append(y[i])
    
    return X_oped, y_oped

In [40]:
def get_top_k_cpts_with_multipliers(normalized_distributions, k):
    
    all_cpt_pos = {}
    for i in range(len(normalized_distributions)):
        cur_top_agg = normalized_distributions[i].argsort()[-k:][::-1]
        for pos in cur_top_agg:
            if pos not in all_cpt_pos:
                all_cpt_pos[pos] = 0
            all_cpt_pos[pos] += 1
                
    for cpt_pos in all_cpt_pos:
        all_cpt_pos[cpt_pos] = np.log(len(normalized_distributions)/all_cpt_pos[cpt_pos])
    
    return all_cpt_pos

In [41]:
def get_top_k_cpts(normalized_distributions, k):
    
    all_cpt_pos = {}
    for i in range(len(normalized_distributions)):
        cur_top_agg = normalized_distributions[i].argsort()[-k:][::-1]
        for pos in cur_top_agg:
            if pos not in all_cpt_pos:
                all_cpt_pos[pos] = 0
            all_cpt_pos[pos] += 1
                
    for cpt_pos in all_cpt_pos:
        all_cpt_pos[cpt_pos] = np.log(len(normalized_distributions)/all_cpt_pos[cpt_pos])
    
    return all_cpt_pos

In [42]:
def get_idf_vector(all_cpt_pos, num_featurs):
    idf_vector = []
    for i in range(num_featurs):
        if i in all_cpt_pos:
            idf_vector.append(all_cpt_pos[i])
        else:
            idf_vector.append(0)

    return idf_vector

In [43]:
def normalize(X):
    for i in range(len(X)):
        X[i] = X[i]/sum(X[i])
    return X

In [44]:
def get_distributions(X, y):
    distributions = [np.array(len(X[0])*[0.0]) for _ in range(len(set(y)))]
    for i in range(len(X)):
        if i%10000 == 0:
            print("Done: " + str(i))
        distributions[y[i]] += X[i]
    
    distributions[1] += distributions[0]
    
    distributions = normalize(distributions)

    return distributions

In [45]:
def get_tfidf(tf_matrix, idf_vector):
    tfidfs = []
    for tf in tf_matrix:
        tfidfs.append(np.multiply(tf, idf_vector[0]))
    return tfidfs

In [46]:
def reduce_dimention(all_cpt_idfs, X_normalized):
    cpt_pos = list(all_cpt_idfs.keys())
    print(cpt_pos)

    X_reduced = []

    for distribution in X_normalized:
        temp = []
        for pos in cpt_pos:
            if all_cpt_idfs[pos] >= 0.001:
                temp.append(distribution[pos])
        X_reduced.append(temp)
    
    return X_reduced

In [47]:
npi_features = pd.read_pickle('./chuncked_npi_ncpcs_2019_0_.pkl')

In [48]:
npi_indigo_spl = pd.read_pickle('./npi_indigo_spl.pkl')
npi_indigo_spl_non_nan = remove_nan(npi_indigo_spl)

In [49]:
dataset = create_dataset(npi_features, npi_indigo_spl_non_nan)

In [50]:
X = dataset['features']
y = dataset['labels']

In [51]:
print(len(X))
print(len(y))

153954
153954


Note: Maybe try a binary model of seperating surgical and non-surgical doctors and then create further indigo speciality distinctions for each of these 2 classes.

Note: Create a binary model each indigo speciality (True / False). Kind of a multi-label classification model.

In [52]:
def get_results(random_search, results_test, results_train, X_val, y_val, X_train, y_train, spl):
    predicted_probabilities = random_search.best_estimator_.predict_proba(X_val)
    predictions_test = np.argmax(predicted_probabilities, axis=1)

    predicted_probabilities = random_search.best_estimator_.predict_proba(X_train)
    predictions_train = np.argmax(predicted_probabilities, axis=1)

    test_out = classification_report(y_val, predictions_test, target_names=[spl, 'Others'], output_dict=True)
    train_out = classification_report(y_train, predictions_train, target_names=[spl, 'Others'], output_dict=True)

    results_train.append(["CPT upper level features", "Train", spl, train_out['Others']['precision'], train_out['Others']['recall'], train_out[spl]['precision'], train_out[spl]['recall'], train_out['macro avg']['f1-score']])
    results_test.append(["CPT upper level features", "Test", spl, test_out['Others']['precision'], test_out['Others']['recall'], test_out[spl]['precision'], test_out[spl]['recall'], test_out['macro avg']['f1-score']])

    return results_train, results_test

In [53]:
def spit_train_test_split(X, y, speciality, param_dist, results_test, results_train, majority_class_data_split=0.1, minority_class_op_coef=1, test_size=0.3, random_state=897):
    encoded_y = encode_labels_2way_custom(y, speciality)
    encoded_y_np = np.array(encoded_y)

    if len(encoded_y_np[encoded_y_np==0]) > 0:

        X_oped, y_oped = overpopulate(X, encoded_y, minority_class_op_coef, majority_class_data_split)
        X_oped_normalized = preprocessing_1(X_oped)

        normalized_class_distributions = get_distributions(X_oped_normalized, y_oped)
        all_cpt_pos_idf = get_top_k_cpts(normalized_class_distributions, 25)
        idf_vector = get_idf_vector(all_cpt_pos_idf, len(X_oped_normalized[0]))

        X_oped_normalized_reduced = reduce_dimention(all_cpt_pos_idf, X_oped_normalized)
        # print(np.array(X_oped_normalized_reduced).shape)
        idf_vector_reduced = reduce_dimention(all_cpt_pos_idf, [idf_vector])

        #print(idf_vector_reduced)

        X_oped_normalized_reduced_tfidfs = get_tfidf(X_oped_normalized_reduced, idf_vector_reduced)

        X_train, X_val, y_train, y_val = train_test_split(X_oped_normalized_reduced_tfidfs, y_oped, test_size=test_size, random_state=random_state, stratify=y_oped, shuffle=True)

        model = xgb.XGBClassifier(objective='multi:softprob', num_class=len(set(y_train)), tree_method='gpu_hist', gpu_id=0)
        random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=4, scoring='accuracy', n_jobs=-1, cv=2, verbose=3, random_state=53)
        random_search.fit(X_train, y_train)

        results_train, results_test = get_results(random_search, results_test, results_train, X_val, y_val, X_train, y_train, speciality)
        print('\033[1m' + '\033[92m' + "Done: " + speciality + '\033[0m' + '\033[0m')
    else:
        print('\033[1m' + '\033[91m' + speciality + " not in CPT upper level features" + '\033[0m' + '\033[0m')
    
    return results_train, results_test


In [54]:
# specialities_lis = pd.read_pickle("./specialities_lis.pkl")
specialities_lis = ["Cardiovascular Disease-Minor Surgery"]

In [55]:
param_dist = {
    'n_estimators': np.arange(50, 300, 50),
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [6, 7, 8, 9],
    'colsample_bytree': [0.6, 0.65, 0.7, 0.75],
}

results_test = [["Data Type", "Train / Test", "Speciality", "Precision 0", "Recall 0", "Precision 1", "Recall 1", "Macro Avg F1"]]
results_train = [["Data Type", "Train / Test", "Speciality", "Precision 0", "Recall 0", "Precision 1", "Recall 1", "Macro Avg F1"]]
for spl in specialities_lis:
    results_train, results_test = spit_train_test_split(X, y, spl, param_dist, results_test, results_train)
    print("Done: ", spl)
    break

df = pd.DataFrame(results_test + results_train)
# df
df.to_csv('./binary_spl_prediction_tfidf_top25_output_v1.csv', index=False)

Done: 0
Done: 10000
[15706, 14462, 15704, 15705, 15714, 15713, 14461, 15712, 15703, 2450, 15762, 2451, 12086, 407, 14457, 7394, 14463, 15755, 15733, 13929, 3837, 15715, 10815, 8934, 15753, 6073, 239, 238, 6075, 6080, 14456, 12084, 6074, 236, 14460, 7393, 12571, 13927, 189, 191, 14896, 190]
[15706, 14462, 15704, 15705, 15714, 15713, 14461, 15712, 15703, 2450, 15762, 2451, 12086, 407, 14457, 7394, 14463, 15755, 15733, 13929, 3837, 15715, 10815, 8934, 15753, 6073, 239, 238, 6075, 6080, 14456, 12084, 6074, 236, 14460, 7393, 12571, 13927, 189, 191, 14896, 190]
Fitting 2 folds for each of 4 candidates, totalling 8 fits




[1m[92mDone: Cardiovascular Disease-Minor Surgery[0m[0m
Done:  Cardiovascular Disease-Minor Surgery


Unnamed: 0,0,1,2,3,4,5,6,7
0,Data Type,Train / Test,Speciality,Precision 0,Recall 0,Precision 1,Recall 1,Macro Avg F1
1,CPT upper level features,Test,Cardiovascular Disease-Minor Surgery,0.987527,0.992865,0.772414,0.658824,0.85065
2,Data Type,Train / Test,Speciality,Precision 0,Recall 0,Precision 1,Recall 1,Macro Avg F1
3,CPT upper level features,Train,Cardiovascular Disease-Minor Surgery,0.998242,1.0,1.0,0.951899,0.987239
