In [31]:
from collections import Counter

import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer

In [32]:
# Read the synthetic claims table from the working directory into `df`.
# NOTE(review): read_csv infers column dtypes, so numeric-looking codes with
# leading zeros would lose them -- confirm the CPT columns contain none.
df = pd.read_csv(filepath_or_buffer='claims_synthetic_dataset(200000try).csv')

In [33]:
# Preprocessing the data: multi-hot encode the CPT codes of every claim row
def _normalize_cpt_code(value):
    """Return one CPT cell as its string label.

    A column that contains missing values is stored as float, so 99910 would
    otherwise be rendered as '99910.0' and never match the label '99910'.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Every column whose name starts with 'CPT Code' holds one code of the claim.
cpt_columns = df.loc[:, df.columns.str.startswith('CPT Code')]

# Build one list of code labels per row. Missing cells are skipped instead of
# being stringified, so the placeholder 'nan' never becomes a class that could
# later be suggested as if it were a real CPT code.
cpt_code_lists = [
    [_normalize_cpt_code(code) for code in row if pd.notna(code)]
    for row in cpt_columns.itertuples(index=False, name=None)
]

mlb = MultiLabelBinarizer()
cpt_features = mlb.fit_transform(cpt_code_lists)

# One row per claim, one 0/1 column per distinct CPT label.
df_encoded = pd.DataFrame(cpt_features, columns=mlb.classes_)

In [34]:
# Function to suggest the most co-occurring next CPT codes based on selected CPT codes using k-nearest neighbors
def suggest_next_cpt(selected_cpt_codes, num_suggestions, n_neighbors=None):
    """Suggest CPT codes that co-occur with the selected ones in similar rows.

    The selected codes are multi-hot encoded with the fitted ``mlb``, the
    nearest rows of ``df_encoded`` are looked up, and every code present in
    those rows (other than the selected ones) is counted.

    Parameters
    ----------
    selected_cpt_codes : iterable or str
        Codes already chosen. Each entry is converted with ``str`` before
        encoding; a single bare string is treated as one code.
    num_suggestions : int
        Maximum number of codes to return.
    n_neighbors : int, optional
        Number of nearest rows to inspect. Defaults to ``num_suggestions``
        (the original behavior). A larger value bases the counts on more rows.
        It is capped at the number of encoded rows.

    Returns
    -------
    tuple of (list, list)
        ``(suggested_cpts, suggested_cpt_occurrences)``: the codes ordered by
        descending count, and the number of inspected rows containing each.
        Both lists are empty when nothing can be suggested.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(selected_cpt_codes, str):
        selected_cpt_codes = [selected_cpt_codes]
    # Normalize to strings so that integer inputs are encoded and excluded
    # the same way as their string form.
    selected = {str(code) for code in selected_cpt_codes}

    if n_neighbors is None:
        n_neighbors = num_suggestions
    n_neighbors = min(len(df_encoded), n_neighbors)
    # NearestNeighbors rejects a non-positive neighbor count; return the empty
    # result instead of raising.
    if num_suggestions <= 0 or n_neighbors <= 0:
        return [], []

    selected_cpt_features = mlb.transform([sorted(selected)])

    # Finding the most similar rows using k-nearest neighbors.
    # NOTE(review): the default metric is Euclidean; on multi-hot rows a claim
    # with few codes can rank nearer than one that shares the selected codes
    # but carries many others -- consider whether Jaccard fits better.
    nn = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute').fit(df_encoded.values)
    _, indices = nn.kneighbors(selected_cpt_features)

    # If the encoder was fitted on stringified cells, missing values appear as
    # the label 'nan'; never offer that placeholder as a suggestion.
    excluded = selected | {'nan'}

    # Counting the occurrences of next CPT codes in similar rows
    cpt_counts = Counter()
    for idx in indices[0]:
        row = df_encoded.iloc[idx]
        cpt_counts.update(
            cpt for cpt in row[row == 1].index.tolist() if cpt not in excluded
        )

    # most_common keeps first-seen order among equal counts, matching a stable
    # descending sort of the counts.
    top_counts = cpt_counts.most_common(num_suggestions)
    suggested_cpts = [cpt for cpt, _ in top_counts]
    suggested_cpt_occurrences = [occurrences for _, occurrences in top_counts]

    return suggested_cpts, suggested_cpt_occurrences

In [37]:
# Example: request the top five follow-up codes for a three-code selection
selected_cpt_codes = ['99910', '99977', '99947']
num_suggestions = 5
suggested_cpts, suggested_cpt_occurrences = suggest_next_cpt(
    selected_cpt_codes=selected_cpt_codes,
    num_suggestions=num_suggestions,
)

In [38]:
# Print a header, then one line per suggestion with its 1-based rank and count
print(f"The most co-occurring next CPT codes for {selected_cpt_codes} are:")
for i, (cpt, occurrences) in enumerate(
    zip(suggested_cpts, suggested_cpt_occurrences),
    start=1,
):
    print(f"Suggestion {i}: CPT {cpt} (Occurrences: {occurrences})")

The most co-occurring next CPT codes for ['99910', '99977', '99947'] are:
Suggestion 1: CPT 99960 (Occurrences: 2)
Suggestion 2: CPT 99965 (Occurrences: 2)
Suggestion 3: CPT 99914 (Occurrences: 2)
Suggestion 4: CPT 99945 (Occurrences: 1)
Suggestion 5: CPT 99953 (Occurrences: 1)
