In [34]:
!pip install mplcyberpunk

Collecting mplcyberpunk
  Downloading mplcyberpunk-0.7.0-py3-none-any.whl (6.3 kB)
Installing collected packages: mplcyberpunk
Successfully installed mplcyberpunk-0.7.0


In [63]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pickle
import mplcyberpunk

In [2]:
npi_distribution_0 = pd.read_pickle('./chuncked_npi_ncpcs_0_.pkl')

In [None]:
nucc_speciality = pd.read_csv('./nucc_taxonomy_90.csv', encoding= 'unicode_escape')

In [4]:
def get_x_y(npi_distribution_0):
    npi_lis = []
    distributions = []
    for npi in npi_distribution_0:
        npi_lis.append(npi)
        distributions.append(npi_distribution_0[npi])
    return npi_lis, distributions

In [5]:
def normalize(npi_distribution_0, threshold):
    to_model_npi_distribution_0 = {}
    for npi in npi_distribution_0:
        if sum(npi_distribution_0[npi])>threshold:
            to_model_npi_distribution_0[npi] = np.array(npi_distribution_0[npi])/sum(npi_distribution_0[npi])
    return to_model_npi_distribution_0

In [6]:
def plot_merged(train_cluster_distrubutions_df, val_cluster_distrubutions_df, num_clusters):
    fig, ax = plt.subplots(1, 2, figsize=(10, 8)) 
    ax[0].plot(train_cluster_distrubutions_df, label='Train')
    ax[1].plot(val_cluster_distrubutions_df, label=[i for i in range(num_clusters)])
    ax[1].legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.show()

In [7]:
def return_max_4(lis):
    lis_2 = lis[:]
    lis_2.sort()
    return lis.index(max(lis)), lis.index(lis_2[-2]), lis.index(lis_2[-3]), lis.index(lis_2[-4])

In [8]:
def plot_seperate_30(train_cluster_distrubutions, val_cluster_distrubutions, train_npi_type_distributions, val_npi_type_distributions, num_clusters, file_name):
    fig, axs = plt.subplots(num_clusters, 4, figsize=(30, num_clusters*3))
    train_keys = list(train_cluster_distrubutions.keys())
    val_keys = list(val_cluster_distrubutions.keys())
    train_type_keys = list(train_npi_type_distributions.keys())
    val_type_keys = list(val_npi_type_distributions.keys())

    x_axes = list(train_npi_type_distributions[0].keys())
    print(x_axes)

    for i in range(num_clusters):
        for j in range(4):
            if j == 0:
                key = train_keys[i]
                axs[i, j].plot(train_cluster_distrubutions[key], color='red')
                axs[i, j].set_title('Train cluster: ' + str(key))
            elif j == 1:
                key = train_type_keys[i]
                p1, p2, p3, p4 = return_max_4(list(train_npi_type_distributions[key].values()))
                axs[i, j].bar([i for i in range(len(train_npi_type_distributions[key].keys()))], list(train_npi_type_distributions[key].values()), color='red')
                axs[i, j].set_title('MOS: ' + str(x_axes[p1]) + ' || 2MOS: ' + x_axes[p2] + ' || 3MOS: ' + x_axes[p3] + ' || 4MOS: ' + x_axes[p4])
                axs[i, j].set_xticks(np.arange(0, 30))
            elif j == 2:
                key = val_keys[i]
                axs[i, j].plot(val_cluster_distrubutions[key], color='blue')
                axs[i, j].set_title('Val cluster: ' + str(key))
            else:
                key = val_type_keys[i]
                p1, p2, p3, p4 = return_max_4(list(val_npi_type_distributions[key].values()))
                axs[i, j].bar([i for i in range(len(val_npi_type_distributions[key].keys()))], list(val_npi_type_distributions[key].values()), color='blue')
                axs[i, j].set_title('MOS: ' + str(x_axes[p1]) + ' || 2MOS: ' + x_axes[p2] + ' || 3MOS: ' + x_axes[p3] + ' || 4MOS: ' + x_axes[p4])
                axs[i, j].set_xticks(np.arange(0, 30))



    # plt.tight_layout()
    # plt.show()
    plt.savefig("./k-means_experiment_results/" + file_name)

In [9]:
def get_nucc_indigo_dict(nucc_indigo_df):
    nucc_indigo_dict = {}
    for _, r in nucc_indigo_df.iterrows():
        if r['NUCC Code'] not in nucc_indigo_dict and isinstance(r['Indigo Specialty'], str):
            nucc_indigo_dict[r['NUCC Code']] = r['Indigo Specialty']
    
    return nucc_indigo_dict

In [10]:
def get_npi_indigo_spl(npi_nucc, nucc_indigo_dict):
    npi_indigo_spl = {}

    for npi in npi_nucc:
        if isinstance(npi_nucc[npi], str) and npi_nucc[npi] in nucc_indigo_dict:
            npi_indigo_spl[npi] = nucc_indigo_dict[npi_nucc[npi]]
    
    return npi_indigo_spl

In [11]:
def drop_unrelated_npi(npi_indigo_spl, npi_distribution_0):
    to_drop = []
    for npi in npi_distribution_0:
        if int(npi) not in npi_indigo_spl:
            to_drop.append(npi)
    for npi in to_drop:
        del npi_distribution_0[npi]
    return npi_distribution_0

In [12]:
def save_pickle(file_name, file_path):
    with open(file_path, 'wb') as fp:
        pickle.dump(file_name, fp)

****NORMALIZED**** DATA k-MEANS

In [13]:
to_model_npi_distribution_0 = normalize(npi_distribution_0, 50)

In [14]:
npi_distribution_0 = None

In [15]:
nucc_indigo_df = pd.read_pickle('./indigo_speciality_to_nucc_crosswalk.pkl')
nucc_indigo_dict = get_nucc_indigo_dict(nucc_indigo_df)

In [16]:
npi_nucc = pd.read_pickle('./npi_first_taxonomy_code.pkl')
npi_indigo_spl = get_npi_indigo_spl(npi_nucc, nucc_indigo_dict)

In [17]:
len(npi_indigo_spl)

5165585

In [18]:
to_model_npi_distribution_0 = drop_unrelated_npi(npi_indigo_spl, to_model_npi_distribution_0)
print(len(to_model_npi_distribution_0))

123156


In [19]:
npi, distributions = get_x_y(to_model_npi_distribution_0)
print(len(npi))
print(len(distributions))

123156
123156


In [20]:
npi_train, npi_val, distributions_train, distributions_val = train_test_split(npi, distributions, test_size=0.3, shuffle=True)

In [21]:
npi = None
distributions = None

In [22]:
def get_npi_type(npi_nucc, nucc_type):
    npi_nucc = {k:v for k, v in npi_nucc.items() if isinstance(v, str)}
    nucc_type = {k:v for k, v in nucc_type.items() if isinstance(v, str)}
    npi_type = {}

    for npi in npi_nucc:
        if npi_nucc[npi] in nucc_type:
            npi_type[npi] = nucc_type[npi_nucc[npi]]
    
    return npi_type

In [23]:
def get_cluster_npi_type(cluster, npi, npi_type, types):
    npi_type_distributions = {}
    
    for i in range(len(cluster)):
        npi_type_distributions[i] = {t:0 for t in types}
        for id in cluster[i]:
            if int(npi[id]) in npi_type:
                npi_type_distributions[i][npi_type[int(npi[id])]] += 1
    
    return npi_type_distributions

In [24]:
def get_cluster_distrubutions(clusters, distributions):
    cluster_distrubutions = {}
    for i in range(len(clusters)):
        cluster_distrubutions[i] = np.array(len(distributions[0])*[0.0])
        for id in clusters[i]:
            cluster_distrubutions[i] += distributions[id]
    return cluster_distrubutions

In [25]:
def cluster_assign(data, num_clusters, kmeans):
    cluster_assignments = kmeans.predict(data)
    clusters = [[] for _ in range(num_clusters)]
    for i, cluster_id in enumerate(cluster_assignments):
        clusters[cluster_id].append(i)
    clusters = [np.array(cluster) for cluster in clusters]
    return clusters

In [26]:
def train_kmeans_model(train_data, val_data, num_clusters, random_state):
    kmeans = KMeans(n_clusters=num_clusters, random_state=random_state)
    kmeans.fit(train_data)
    centroids = kmeans.cluster_centers_
    
    train_clusters = cluster_assign(train_data, num_clusters, kmeans)
    val_clusters = cluster_assign(val_data, num_clusters, kmeans)

    return centroids, train_clusters, val_clusters

In [57]:
def get_labels(considering_npis, npi_indigo_spl, indigo_spls):
    indigo_spls_to_pos = {indigo_spls[i]:i for i in range(len(indigo_spls))}
    npi_to_indigo_spl_pos = {}
    for npi in considering_npis:
        npi = int(npi)
        if npi in npi_indigo_spl:
            npi_to_indigo_spl_pos[npi] = indigo_spls_to_pos[npi_indigo_spl[npi]]
    return npi_to_indigo_spl_pos

In [93]:
def normalize_model_output(model_output):
    for i in range(len(model_output)):
        #print(np.size(np.array(model_output)))
        model_output[i] = np.array(model_output[i])/sum(model_output[i])
    return model_output

In [73]:
def get_indices(percent, one_output):
    indices = []
    max_index = -1
    max_score = -1
    for i in range(len(one_output)):
        if one_output[i] > percent:
            indices.append(i)
        if one_output[i] > max_score:
            max_score = one_output[i]
            max_index = i
    return indices, max_index

In [106]:
def check_labels(labels):
    labels_set = set(labels)
    print(len(labels_set))
    print(labels_set)

In [177]:
def get_classification_report(npi_indigo_spl_pos, npi_indigo_spl_distribution, val_npis, npi_pos, percent = None):
    npi_indigo_spl_distribution = normalize_model_output(npi_indigo_spl_distribution)
    if percent is None:
        percent = 1

    y_model = []
    y_label = []

    for i in range(len(npi_indigo_spl_distribution)):
        indices, max_index = get_indices(percent, npi_indigo_spl_distribution[i])
        y_label.append(npi_indigo_spl_pos[int(val_npis[i])])
        if npi_indigo_spl_pos[int(val_npis[i])] in indices:
            y_model.append(npi_indigo_spl_pos[int(val_npis[i])])
        else:
            y_model.append(max_index)
    
    check_labels(y_label)
    check_labels(y_model)
    #print(npi_pos)
    target_names = list(set(y_label))
    target_names.sort()
    print(target_names)
    
    print(classification_report(y_label, y_model, target_names=target_names))

In [28]:
num_clusters = 200
train_data = np.array(distributions_train)
val_data = np.array(distributions_val)
centroids, train_clusters, val_clusters = train_kmeans_model(train_data, val_data, num_clusters, 34)

  super()._check_params_vs_input(X, default_n_init=10)


In [29]:
#save_pickle(centroids, './k-means_centroids_200.pkl')

In [30]:
#save_pickle(val_clusters, './k-means_val_clusters_200.pkl')

In [31]:
#save_pickle(train_clusters, './k-means_train_clusters_200.pkl')

In [165]:
def top_k_probablity(cluster_distributions, test_sample, centroids):
    test_sample_probablity = []
    for sample in test_sample:
        dist = np.linalg.norm((np.array(200*[sample]) - centroids), axis=1)
        

        '''
        dist = []
        for cent in centroids:
            dist.append(np.linalg.norm(sample - cent))
            
        i = 0
        cur_probablity = np.array(len(cluster_distributions[0])*[0.0])
        for distribution in cluster_distributions:
            cur_probablity += (1/dist[i])*distribution
            i += 1
        '''

        test_sample_probablity.append(cluster_distributions[list(dist).index(min(dist))])
    return test_sample_probablity

In [38]:
train_cluster_distrubutions = get_cluster_distrubutions(train_clusters, distributions_train)
val_cluster_distrubutions = get_cluster_distrubutions(val_clusters, distributions_val)

In [46]:
#save_pickle(val_sample_probablity, './k-means_val_sample_probablity.pkl')

In [70]:
val_sample_probablity = None

In [58]:
train_cluster_distrubutions_df = pd.DataFrame(train_cluster_distrubutions)
val_cluster_distrubutions_df = pd.DataFrame(val_cluster_distrubutions)

In [29]:
#npi_nucc = pd.read_pickle('./npi_first_taxonomy_code.pkl')
#nucc_type = pd.read_pickle('./nucc_type.pkl')
#types = pd.read_pickle('./types.pkl')

In [109]:
indigo_spls = list(set(npi_indigo_spl.values()))
len(indigo_spls)

93

In [48]:
#npi_type = get_npi_type(npi_nucc, nucc_type)
indigo_spls = list(set(npi_indigo_spl.values()))
train_npi_type_distributions = get_cluster_npi_type(train_clusters, npi_train, npi_indigo_spl, indigo_spls)
val_npi_type_distributions = get_cluster_npi_type(val_clusters, npi_val, npi_indigo_spl, indigo_spls)

In [136]:
list(train_npi_type_distributions[0].values())

[0,
 5,
 1,
 1,
 4,
 0,
 188,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 7,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 43,
 0,
 28,
 7,
 0,
 62,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 24,
 5,
 3,
 0,
 1,
 5,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 33,
 0,
 10,
 0,
 5,
 0,
 0,
 0,
 19,
 0,
 0,
 0,
 0,
 7,
 0,
 0,
 0,
 0,
 0,
 0,
 23,
 0,
 0,
 0,
 6]

In [138]:
len(centroids)

200

In [153]:
np.shape(centroids)

(200, 17572)

In [166]:
val_sample_probablity = top_k_probablity(train_npi_type_distributions, distributions_val, centroids)

In [168]:
save_pickle(val_sample_probablity, './val_sample_probablity_3.pkl')

In [171]:
val_sample_probablity_lis = []
for ele in val_sample_probablity:
    val_sample_probablity_lis.append(list(ele.values()))

In [80]:
npi_indigo_spl_pos = get_labels(npi_val, npi_indigo_spl, indigo_spls)

In [82]:
npi_indigo_spl_pos[1609816107]

53

In [128]:
val_sample_probablity = pd.read_pickle('./val_sample_probablity_2.pkl')

In [123]:
len(val_sample_probablity[0])

93

In [178]:
get_classification_report(npi_indigo_spl_pos, val_sample_probablity_lis, npi_val, npi_indigo_spl_pos)

88
{0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 85, 87, 88, 89, 90, 91, 92}
38
{0, 6, 9, 17, 18, 19, 21, 23, 24, 25, 33, 34, 38, 39, 41, 44, 45, 47, 51, 53, 54, 55, 56, 58, 62, 66, 67, 69, 70, 73, 74, 75, 77, 81, 82, 83, 85, 88}
[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 85, 87, 88, 89, 90, 91, 92]


  _warn_prf(average, modifier, msg_start, len(result))


TypeError: object of type 'int' has no len()

In [133]:
val_sample_probablity[4284]

array([98.26532764, 98.26532764, 98.26532764, 98.26532764, 98.26532764,
       98.26532764, 98.26532764, 98.26532764, 98.26532764, 98.26532764,
       98.26532764, 98.26532764, 98.26532764, 98.26532764, 98.26532764,
       98.26532764, 98.26532764, 98.26532764, 98.26532764, 98.26532764,
       98.26532764, 98.26532764, 98.26532764, 98.26532764, 98.26532764,
       98.26532764, 98.26532764, 98.26532764, 98.26532764, 98.26532764,
       98.26532764, 98.26532764, 98.26532764, 98.26532764, 98.26532764,
       98.26532764, 98.26532764, 98.26532764, 98.26532764, 98.26532764,
       98.26532764, 98.26532764, 98.26532764, 98.26532764, 98.26532764,
       98.26532764, 98.26532764, 98.26532764, 98.26532764, 98.26532764,
       98.26532764, 98.26532764, 98.26532764, 98.26532764, 98.26532764,
       98.26532764, 98.26532764, 98.26532764, 98.26532764, 98.26532764,
       98.26532764, 98.26532764, 98.26532764, 98.26532764, 98.26532764,
       98.26532764, 98.26532764, 98.26532764, 98.26532764, 98.26

In [69]:
len(train_npi_type_distributions)

200

In [58]:
npi_indigo_spl_pos = get_labels(npi_val, npi_indigo_spl, indigo_spls)

In [None]:
plot_merged(train_cluster_distrubutions_df, val_cluster_distrubutions_df, num_clusters)

In [147]:
plot_seperate_30(train_cluster_distrubutions, val_cluster_distrubutions)

TypeError: plot_seperate_30() missing 2 required positional arguments: 'train_npi_type_distributions' and 'val_npi_type_distributions'

In [50]:
val_npi_type_distributions

{0: {'Occupational Medicine': 0,
  'Neonatology': 0,
  'Ophthalmic Assistant': 0,
  'Dietitian': 0,
  'Radiology Diagnostic-Minor Surgery': 0,
  'Surgeon Assistant': 0,
  'Endocrinology-No Surgery': 2,
  'Medical Assistant': 0,
  'Podiatrist': 0,
  'Pharmacology Clinical': 0,
  'Respiratory Therapist': 0,
  'Hematology-No Surgery': 1,
  'Acupuncture': 0,
  'Orthopedic Excl Back-Surgery': 0,
  'Dermatology-Minor Surgery': 0,
  'Pain Medicine': 0,
  'Geriatrics-No Surgery': 0,
  'Nurse': 0,
  'Family Medicine-No Surgery': 2,
  'Phlebology': 0,
  'O.R. Technician': 0,
  'Nuclear Medicine': 0,
  'Neurology-Surgery': 0,
  'Allergy': 1,
  'Pediatrics-Minor Surgery': 0,
  'EKG Technician': 0,
  'Rheumatology-No Surgery': 0,
  'Anesthesiology Assistant (AA)': 0,
  'Gynecology-Surgery': 0,
  'Nurse - Student': 1,
  'Neurology-No Surgery': 0,
  'Advanced Practice Registered Nurse (APRN)': 2,
  'Pathology-No Surgery': 0,
  'Optometrist': 0,
  'Emergency Medical Technician (EMT)': 0,
  'Otorhinola

In [53]:
import itertools

def divide_dict(data, n):
    it = iter(data)
    size = len(data) // n

    for i in range(0, len(data), size):
        yield {k: data[k] for k in itertools.islice(it, size)}

In [None]:
train_cluster_distrubutions_divisions = divide_dict(train_cluster_distrubutions, 5)
val_cluster_distrubutions_divisions = divide_dict(val_cluster_distrubutions, 5)
for train_cluster_distrubutions, val_cluster_distrubutions in zip(train_cluster_distrubutions_divisions, val_cluster_distrubutions_divisions):
    plot_seperate_30(train_cluster_distrubutions, val_cluster_distrubutions, train_npi_type_distributions, val_npi_type_distributions, int(num_clusters/5), '')

In [None]:
file_name = 'clusters_500_run_5.svg'
plot_seperate_30(train_cluster_distrubutions, val_cluster_distrubutions, train_npi_type_distributions, val_npi_type_distributions, num_clusters, file_name)

In [193]:
lis = ['Agencies', 'Allopathic & Osteopathic Physicians', 'Ambulatory Health Care Facilities', 'Behavioral Health & Social Service Providers', 'Chiropractic Providers', 'Dental Providers', 'Dietary & Nutritional Service Providers', 'Emergency Medical Service Providers', 'Eye and Vision Services Providers', 'Group', 'Hospital Units', 'Hospitals', 'Laboratories', 'Managed Care Organizations', 'Nursing & Custodial Care Facilities', 'Nursing Service Providers', 'Nursing Service Related Providers', 'Other Service Providers', 'Pharmacy Service Providers', 'Physician Assistants & Advanced Practice Nursing Providers', 'Podiatric Medicine & Surgery Service Providers', 'Residential Treatment Facilities', 'Respiratory, Developmental, Rehabilitative and Restorative Service Providers', 'Respite Care Facility', 'Speech, Language and Hearing Service Providers', 'Student, Health Care', 'Suppliers', 'Technologists, Technicians & Other Technical Service Providers', 'This material, including the Health Care Provider Taxonomy Code Set, is published in cooperation with the National Uniform Claim Committee (NUCC) by the American Medical Association (AMA).  Permission is granted for any non-commercial use of this material as long as the copyright notice and other disclaimers are included in any copy and the contents are not changed.  For commercial use, including sales or licensing, a license must be obtained at www.nucc.org.  The AMA, NUCC, and any of its members shall not be responsible for any liability in connection with use of this material.  (This material is provided \x93As Is\x94 without warranty of any kind.)  Applicable FARS/DFARS restrictions apply.', 'Transportation Services']
print({lis[i]:i for i in range(len(lis))})

{'Agencies': 0, 'Allopathic & Osteopathic Physicians': 1, 'Ambulatory Health Care Facilities': 2, 'Behavioral Health & Social Service Providers': 3, 'Chiropractic Providers': 4, 'Dental Providers': 5, 'Dietary & Nutritional Service Providers': 6, 'Emergency Medical Service Providers': 7, 'Eye and Vision Services Providers': 8, 'Group': 9, 'Hospital Units': 10, 'Hospitals': 11, 'Laboratories': 12, 'Managed Care Organizations': 13, 'Nursing & Custodial Care Facilities': 14, 'Nursing Service Providers': 15, 'Nursing Service Related Providers': 16, 'Other Service Providers': 17, 'Pharmacy Service Providers': 18, 'Physician Assistants & Advanced Practice Nursing Providers': 19, 'Podiatric Medicine & Surgery Service Providers': 20, 'Residential Treatment Facilities': 21, 'Respiratory, Developmental, Rehabilitative and Restorative Service Providers': 22, 'Respite Care Facility': 23, 'Speech, Language and Hearing Service Providers': 24, 'Student, Health Care': 25, 'Suppliers': 26, 'Technologis