In [1]:
import pandas as pd
from collections import defaultdict
import csv
from itertools import combinations

In [2]:
# Loading the dataset
df = pd.read_csv('claims_synthetic_dataset(1_million).csv')

In [3]:
# Defining the number of CPT codes per claim
num_cpt_codes = 10

In [4]:
# Function to find the most commonly occurring CPT code in the dataset
def suggest_common_cpts(num_suggestions):
    cpt_counts = defaultdict(int)
    for _, row in df.iterrows():
        cpt_codes = [str(row[f'CPT Code {i+1}']) for i in range(num_cpt_codes)]
        for cpt in cpt_codes:
            cpt_counts[cpt] += 1
    
    sorted_cpt_counts = sorted(cpt_counts.items(), key=lambda x: x[1], reverse=True)
    suggested_cpts = [cpt for cpt, _ in sorted_cpt_counts[:num_suggestions]]
    suggested_cpt_occurrences = [cpt_counts[cpt] for cpt in suggested_cpts]

    return suggested_cpts, suggested_cpt_occurrences

In [5]:
# Suggesting the most commonly occurring CPT code in the dataset
num_common_suggestions = 10
common_cpts, common_cpt_occurrences = suggest_common_cpts(num_common_suggestions)

In [6]:
# Displaying the results
print(f"\nThe most commonly occurring CPT codes in the dataset are:")
for i, (cpt, occurrences) in enumerate(zip(common_cpts, common_cpt_occurrences), 1):
    print(f"{i}: CPT {cpt} (Occurrences: {occurrences})")


The most commonly occurring CPT codes in the dataset are:
1: CPT 86288 (Occurrences: 156)
2: CPT 20183 (Occurrences: 155)
3: CPT 94137 (Occurrences: 154)
4: CPT 68613 (Occurrences: 154)
5: CPT 24615 (Occurrences: 154)
6: CPT 29700 (Occurrences: 154)
7: CPT 65659 (Occurrences: 154)
8: CPT 30139 (Occurrences: 153)
9: CPT 17210 (Occurrences: 153)
10: CPT 65567 (Occurrences: 153)


In [7]:
# Function to find the most common CPT codes pair that co-occur within the same claim
def find_common_cpt_pairs():
    cpt_pairs = defaultdict(int)
    for _, row in df.iterrows():
        cpt_codes = [str(row[f'CPT Code {i+1}']) for i in range(num_cpt_codes)]
        for i in range(num_cpt_codes):
            for j in range(i+1, num_cpt_codes):
                pair = (cpt_codes[i], cpt_codes[j])
                cpt_pairs[pair] += 1

    sorted_cpt_pairs = sorted(cpt_pairs.items(), key=lambda x: x[1], reverse=True)
    most_common_pairs = sorted_cpt_pairs[:8]  
    return most_common_pairs

In [8]:
# Finding the most common CPT code pairs
most_common_cpt_pairs = find_common_cpt_pairs()

In [9]:
# Displaying the results
print("\nThe most common CPT code pairs that occur within the same claim are:")
for i, ((cpt1, cpt2), occurrences) in enumerate(most_common_cpt_pairs, 1):
    print(f"Pair {i}: CPT {cpt1} and CPT {cpt2} (Occurrences: {occurrences})")


The most common CPT code pairs that occur within the same claim are:
Pair 1: CPT 17695 and CPT 83895 (Occurrences: 4)
Pair 2: CPT 21414 and CPT 70798 (Occurrences: 4)
Pair 3: CPT 21191 and CPT 93161 (Occurrences: 3)
Pair 4: CPT 46114 and CPT 94131 (Occurrences: 3)
Pair 5: CPT 53434 and CPT 47100 (Occurrences: 3)
Pair 6: CPT 90751 and CPT 16812 (Occurrences: 3)
Pair 7: CPT 40722 and CPT 36043 (Occurrences: 3)
Pair 8: CPT 55214 and CPT 30459 (Occurrences: 3)


In [10]:
# Function to suggest multiple sets of three CPT codes that occur within the same claim
def suggest_common_cooccurring_cpts(num_suggestions):
    cpt_counts = defaultdict(int)
    for _, row in df.iterrows():
        cpt_codes = [str(row[f'CPT Code {i+1}']) for i in range(num_cpt_codes)]
        for combo in combinations(cpt_codes, 3):
            cpt_counts[combo] += 1
    
    sorted_cpt_counts = sorted(cpt_counts.items(), key=lambda x: x[1], reverse=True)
    suggested_cpts = [cpt for cpt, _ in sorted_cpt_counts[:num_suggestions]]
    suggested_cpt_occurrences = [cpt_counts[cpt] for cpt in suggested_cpts]

    return suggested_cpts, suggested_cpt_occurrences

In [11]:
# Suggesting multiple sets of three CPT codes that occur within the same claim
num_common_suggestions = 5
common_cpts, common_cpt_occurrences = suggest_common_cooccurring_cpts(num_common_suggestions)

In [12]:
# Displaying the results
print(f"\nThe most commonly co-occuring sets of three CPT codes are:")
for i, (cpts, occurrences) in enumerate(zip(common_cpts, common_cpt_occurrences), 1):
    cpt1, cpt2, cpt3 = cpts
    print(f"Suggestion {i}: CPT {cpt1}, CPT {cpt2}, CPT {cpt3} (Occurrences: {occurrences})")


The most commonly co-occuring sets of three CPT codes are:
Suggestion 1: CPT 53301, CPT 37342, CPT 65491 (Occurrences: 2)
Suggestion 2: CPT 31774, CPT 16141, CPT 74400 (Occurrences: 2)
Suggestion 3: CPT 78976, CPT 18055, CPT 27134 (Occurrences: 2)
Suggestion 4: CPT 62823, CPT 13076, CPT 99700 (Occurrences: 2)
Suggestion 5: CPT 10441, CPT 81839, CPT 87757 (Occurrences: 2)


In [13]:
# Function to suggest the most co-occurring next CPT codes based on selected CPT codes
def suggest_next_cpt(selected_cpt_codes, num_suggestions):
    cooccurrence_row_counts = defaultdict(int) 
    
    for _, row in df.iterrows():
        row_cpt_codes = [str(row[f'CPT Code {i+1}']) for i in range(num_cpt_codes)]
        if set(selected_cpt_codes).issubset(row_cpt_codes):
            for cpt in row_cpt_codes:
                if cpt not in selected_cpt_codes:
                    cooccurrence_row_counts[cpt] += 1

    sorted_next_cpts = sorted(cooccurrence_row_counts.items(), key=lambda x: x[1], reverse=True)
    suggested_cpts = [cpt for cpt, _ in sorted_next_cpts[:num_suggestions]]
    suggested_cpt_occurrences = [cooccurrence_row_counts[cpt] for cpt in suggested_cpts]

    return suggested_cpts, suggested_cpt_occurrences

In [14]:
# Function to check if the selected CPT codes are valid
def is_valid_cpt(cpt_codes):
    for cpt in cpt_codes:
        if not (10000 <= int(cpt) <= 99999):
            return False
    return True

In [15]:
# Example usage
selected_cpt_codes = ['21414']
if not is_valid_cpt(selected_cpt_codes):
    print("Invalid CPT code(s) entered. CPT codes should be in the range of 10000 to 99999.")
else:
    num_suggestions = 3
    suggested_cpts, suggested_cpt_occurrences = suggest_next_cpt(selected_cpt_codes, num_suggestions)

In [16]:
# Displaying the results
print(f"The most co-occurring next CPT codes for {selected_cpt_codes} on the basis of co-occurrences are:")
for i, (cpt, occurrences) in enumerate(zip(suggested_cpts, suggested_cpt_occurrences), 1):
    print(f"Suggestion {i}: CPT {cpt} (Occurrences: {occurrences})")


The most co-occurring next CPT codes for ['21414'] on the basis of co-occurrences are:
Suggestion 1: CPT 70798 (Occurrences: 4)
Suggestion 2: CPT 25890 (Occurrences: 2)
Suggestion 3: CPT 11216 (Occurrences: 2)


In [17]:
# Verifying above results
cpt_codes = ['70798','21414']

filename = 'claims_synthetic_dataset(1_million).csv'

matching_row_count = 0

with open(filename, 'r') as file:
    reader = csv.DictReader(file)
    for row in reader:
        if all(cpt_code in row.values() for cpt_code in cpt_codes):
            matching_row_count += 1

print(f"Number of rows containing both CPT codes {', '.join(cpt_codes)}: {matching_row_count}")

Number of rows containing both CPT codes 70798, 21414: 4
