In [1]:
import pandas as pd
from collections import defaultdict
import random
import csv
from itertools import combinations

In [2]:
#Creating 200000 datasets

#Number of Claims and number of CPT codes for each claim
num_claims = 200000
num_cpt_codes = 8
filename = 'claims_synthetic_dataset(200000try).csv'

In [3]:
#Generating random CPT codes for each claim
data = []
for i in range(num_claims):
    claim_cpt_codes = set()
    while len(claim_cpt_codes) < num_cpt_codes:
        cpt_code = f'{random.randint(99900, 99999):05d}'
        claim_cpt_codes.add(cpt_code)
    claim_data = {'Claim': f'Claim {i+1}'}
    for j, cpt_code in enumerate(claim_cpt_codes):
        claim_data[f'CPT Code {j+1}'] = cpt_code
    data.append(claim_data)

In [4]:
# Writing data to CSV file
with open(filename, 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=['Claim'] + [f'CPT Code {j+1}' for j in range(num_cpt_codes)])
    writer.writeheader()
    writer.writerows(data)

print(f"Dataset with {num_claims} claims and {num_cpt_codes} unique CPT codes per claim saved to {filename} successfully!!!")

Dataset with 200000 claims and 8 unique CPT codes per claim saved to claims_synthetic_dataset(200000try).csv successfully!!!


In [5]:
# Loading the dataset
df = pd.read_csv('claims_synthetic_dataset(200000try).csv')

In [6]:
# Function to suggest the most commonly occurring CPT codes in the dataset
def suggest_common_cpts(num_suggestions):
    cpt_counts = defaultdict(int)
    for _, row in df.iterrows():
        cpt_codes = [str(row[f'CPT Code {i+1}']) for i in range(num_cpt_codes)]
        for cpt in cpt_codes:
            cpt_counts[cpt] += 1
    
    sorted_cpt_counts = sorted(cpt_counts.items(), key=lambda x: x[1], reverse=True)
    suggested_cpts = [cpt for cpt, _ in sorted_cpt_counts[:num_suggestions]]
    suggested_cpt_occurrences = [cpt_counts[cpt] for cpt in suggested_cpts]

    return suggested_cpts, suggested_cpt_occurrences

In [7]:
# Suggesting the most commonly occurring CPT codes in the dataset
num_common_suggestions = 10
common_cpts, common_cpt_occurrences = suggest_common_cpts(num_common_suggestions)

In [8]:
# Displaying the results
print(f"\nThe most commonly occurring CPT codes in the dataset are:")
for i, (cpt, occurrences) in enumerate(zip(common_cpts, common_cpt_occurrences), 1):
    print(f" {i}: CPT {cpt} (Occurrences: {occurrences})")


The most commonly occurring CPT codes in the dataset are:
 1: CPT 99934 (Occurrences: 16240)
 2: CPT 99907 (Occurrences: 16218)
 3: CPT 99977 (Occurrences: 16208)
 4: CPT 99991 (Occurrences: 16200)
 5: CPT 99956 (Occurrences: 16179)
 6: CPT 99967 (Occurrences: 16157)
 7: CPT 99925 (Occurrences: 16156)
 8: CPT 99929 (Occurrences: 16137)
 9: CPT 99928 (Occurrences: 16129)
 10: CPT 99927 (Occurrences: 16124)


In [9]:
# Function to find the most common pair of CPT codes that occur within the same claim
def find_common_cpt_pairs():
    cpt_pairs = defaultdict(int)
    for _, row in df.iterrows():
        cpt_codes = [str(row[f'CPT Code {i+1}']) for i in range(num_cpt_codes)]
        for i in range(num_cpt_codes):
            for j in range(i+1, num_cpt_codes):
                pair = (cpt_codes[i], cpt_codes[j])
                cpt_pairs[pair] += 1

    sorted_cpt_pairs = sorted(cpt_pairs.items(), key=lambda x: x[1], reverse=True)
    most_common_pairs = sorted_cpt_pairs[:8] 
    return most_common_pairs

In [10]:
# Finding the most common CPT code pairs
most_common_cpt_pairs = find_common_cpt_pairs()

In [11]:
# Displaying the results
print("\nThe most common CPT code pairs that occur within the same claim are:")
for i, ((cpt1, cpt2), occurrences) in enumerate(most_common_cpt_pairs, 1):
    print(f"Pair {i}: CPT {cpt1} and CPT {cpt2} (Occurrences: {occurrences})")


The most common CPT code pairs that occur within the same claim are:
Pair 1: CPT 99910 and CPT 99959 (Occurrences: 1269)
Pair 2: CPT 99934 and CPT 99929 (Occurrences: 1263)
Pair 3: CPT 99934 and CPT 99953 (Occurrences: 1244)
Pair 4: CPT 99967 and CPT 99949 (Occurrences: 1243)
Pair 5: CPT 99950 and CPT 99977 (Occurrences: 1237)
Pair 6: CPT 99967 and CPT 99928 (Occurrences: 1231)
Pair 7: CPT 99935 and CPT 99943 (Occurrences: 1227)
Pair 8: CPT 99929 and CPT 99979 (Occurrences: 1226)


In [12]:
# Function to suggest multiple sets of three CPT codes that occur in the same row
def suggest_common_cooccurring_cpts(num_suggestions):
    cpt_counts = defaultdict(int)
    for _, row in df.iterrows():
        cpt_codes = [str(row[f'CPT Code {i+1}']) for i in range(num_cpt_codes)]
        for combo in combinations(cpt_codes, 3):
            cpt_counts[combo] += 1
    
    sorted_cpt_counts = sorted(cpt_counts.items(), key=lambda x: x[1], reverse=True)
    suggested_cpts = [cpt for cpt, _ in sorted_cpt_counts[:num_suggestions]]
    suggested_cpt_occurrences = [cpt_counts[cpt] for cpt in suggested_cpts]

    return suggested_cpts, suggested_cpt_occurrences

In [13]:
# Suggesting multiple sets of three CPT codes that occur in the same row
num_common_suggestions = 5
common_cpts, common_cpt_occurrences = suggest_common_cooccurring_cpts(num_common_suggestions)

In [14]:
# Display the results
print(f"\nThe most commonly occurring sets of three CPT codes are:")
for i, (cpts, occurrences) in enumerate(zip(common_cpts, common_cpt_occurrences), 1):
    cpt1, cpt2, cpt3 = cpts
    print(f"Suggestion {i}: CPT {cpt1}, CPT {cpt2}, CPT {cpt3} (Occurrences: {occurrences})")


The most commonly occurring sets of three CPT codes are:
Suggestion 1: CPT 99912, CPT 99916, CPT 99982 (Occurrences: 106)
Suggestion 2: CPT 99934, CPT 99980, CPT 99936 (Occurrences: 104)
Suggestion 3: CPT 99912, CPT 99958, CPT 99979 (Occurrences: 104)
Suggestion 4: CPT 99961, CPT 99993, CPT 99936 (Occurrences: 103)
Suggestion 5: CPT 99965, CPT 99933, CPT 99900 (Occurrences: 103)


In [15]:
# Function to suggest the most co-occurring next CPT codes based on selected CPT codes
def suggest_next_cpt(selected_cpt_codes, num_suggestions):
    cooccurrence_row_counts = defaultdict(int)  
    for _, row in df.iterrows():
        row_cpt_codes = [str(row[f'CPT Code {i+1}']) for i in range(num_cpt_codes)]
        if set(selected_cpt_codes).issubset(row_cpt_codes):
            for cpt in row_cpt_codes:
                if cpt not in selected_cpt_codes:
                    cooccurrence_row_counts[cpt] += 1

    sorted_next_cpts = sorted(cooccurrence_row_counts.items(), key=lambda x: x[1], reverse=True)
    suggested_cpts = [cpt for cpt, _ in sorted_next_cpts[:num_suggestions]]
    suggested_cpt_occurrences = [cooccurrence_row_counts[cpt] for cpt in suggested_cpts]

    return suggested_cpts, suggested_cpt_occurrences

In [16]:
# Function to check if the selected CPT codes are valid
def is_valid_cpt(cpt_codes):
    for cpt in cpt_codes:
        if not (99900 <= int(cpt) <= 99999):
            return False
    return True

In [17]:
# Example usage
selected_cpt_codes = ['9992','99967','99964']
if not is_valid_cpt(selected_cpt_codes):
    print("Invalid CPT code(s) entered. CPT codes should be in the range of 99900 to 99999.")
else:
    num_suggestions = 3
    suggested_cpts, suggested_cpt_occurrences = suggest_next_cpt(selected_cpt_codes, num_suggestions)

Invalid CPT code(s) entered. CPT codes should be in the range of 99900 to 99999.


In [18]:
# Example usage
selected_cpt_codes = ['99912','99916','99982']
if not is_valid_cpt(selected_cpt_codes):
    print("Invalid CPT code(s) entered. CPT codes should be in the range of 92100 to 92200.")
else:
    num_suggestions = 3
    suggested_cpts, suggested_cpt_occurrences = suggest_next_cpt(selected_cpt_codes, num_suggestions)

In [19]:
# Displaying the results
print(f"The most co-occurring next CPT codes for {selected_cpt_codes} are:")
for i, (cpt, occurrences) in enumerate(zip(suggested_cpts, suggested_cpt_occurrences), 1):
    print(f"Suggestion {i}: CPT {cpt} (Occurrences: {occurrences})")

The most co-occurring next CPT codes for ['99912', '99916', '99982'] are:
Suggestion 1: CPT 99960 (Occurrences: 10)
Suggestion 2: CPT 99930 (Occurrences: 10)
Suggestion 3: CPT 99929 (Occurrences: 10)


In [21]:
#Verifying above result

filename = 'claims_synthetic_dataset(200000try).csv'

cpt_codes = ['99912','99916','99982','99960']

matching_row_count = 0

with open(filename, 'r') as file:
    reader = csv.DictReader(file)
    for row in reader:
        if all(cpt_code in row.values() for cpt_code in cpt_codes):
            matching_row_count += 1

print(f"Number of rows containing CPT codes {', '.join(cpt_codes)}: {matching_row_count}")

Number of rows containing CPT codes 99912, 99916, 99982, 99960: 10
