Make sure you have the MIMIC-IV datasets downloaded, in particular discharge.csv and diagnoses_icd.csv (the two files this notebook reads).

In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict
import random

In [None]:
cd ~/Downloads

In [None]:
N = 5000

# Load MIMIC-IV data (clinical notes and diagnoses); keep ICD-10 rows only.
discharge_df = pd.read_csv('discharge.csv')
diagnoses_df = pd.read_csv('diagnoses_icd.csv')
diagnoses_df = diagnoses_df.query("icd_version == 10")

# Draw N distinct admissions that carry ICD-10 codes and keep only their notes.
# NOTE: `random` is not seeded here, so a different set of admissions is drawn
# on every run.
icd_10_hadm_ids = random.sample(list(set(diagnoses_df["hadm_id"].values.tolist())), N)
discharge_df = discharge_df[discharge_df["hadm_id"].isin(icd_10_hadm_ids)]

# Map every admission to the subject_id of its first diagnosis row.
# drop_duplicates keeps the first occurrence, which matches a row-by-row scan
# without iterating over the whole table in Python.
hadm_to_subject_id = (
    diagnoses_df.drop_duplicates("hadm_id")
    .set_index("hadm_id")["subject_id"]
    .to_dict()
)
icd_10_subject_ids = [hadm_to_subject_id[hadm_id] for hadm_id in icd_10_hadm_ids]
query_df = pd.DataFrame()
query_df["hadm_id"] = icd_10_hadm_ids
query_df["subject_id"] = icd_10_subject_ids

# Collect each sampled admission's codes as one space-delimited string.
# A single groupby replaces one full-table scan per sampled admission;
# sort=False and the in-group order keep the codes in their original row order.
sampled_diagnoses = diagnoses_df[diagnoses_df["hadm_id"].isin(icd_10_hadm_ids)]
hadm_to_icd = {
    hadm_id: " ".join(codes).strip()
    for hadm_id, codes in sampled_diagnoses.groupby("hadm_id", sort=False)["icd_code"]
}
icd_10_codes = [hadm_to_icd[hadm_id] for hadm_id in icd_10_hadm_ids]
query_df["icd_codes"] = icd_10_codes

# (text, ICD-10 codes) pairs for evaluation; the inner merge drops sampled
# admissions that have no discharge note.
merged_df = pd.merge(discharge_df, query_df, on='hadm_id')
eval_df = merged_df[['text', 'icd_codes']]

In [None]:
# Given OAI costs, we are using a subset, 500 randomly sampled examples

# We also restrict the notes to roughly average length; the average word count
# of the sample is 1754.476 (computed in a later cell below).
import random

# Word counter used to filter notes by length
def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    tokens = text.split()
    return len(tokens)

# Keep only the notes that are shorter than 2000 words
note_word_counts = eval_df['text'].apply(count_words)
filtered_df = eval_df[note_word_counts < 2000]

# Draw a fixed-seed random sample of 500 notes
sampled_df = filtered_df.sample(n=500, random_state=42)

# Show the sample
sampled_df

## Determine longest texts for LLM context window purposes

In [None]:
# Rank the sampled notes by character length, longest first
text_lengths = sampled_df['text'].str.len()
sorted_df = sampled_df.assign(text_length=text_lengths).sort_values('text_length', ascending=False)

# Ten longest notes (by characters) and their row labels
longest_by_chars = sorted_df.head(10)
top_10_longest_texts = longest_by_chars['text'].tolist()
top_10_longest_texts_indices = longest_by_chars.index.tolist()

# Add the word count on a copy so the column assignment does not trigger
# SettingWithCopyWarning
sampled_df_copy = sampled_df.copy()
sampled_df_copy['word_count'] = sampled_df_copy['text'].apply(lambda x: len(str(x).split()))

# Mean number of words per note
average_word_count = sampled_df_copy['word_count'].mean()
print(f"Average word count: {average_word_count}")

# From here on sampled_df is the copy sorted by word count (longest first),
# so positional access such as sampled_df.iloc[0] returns the longest note.
sampled_df = sampled_df_copy.sort_values('word_count', ascending=False)

# Ten highest word counts and their row labels
longest_by_words = sampled_df.head(10)
top_10_word_counts = longest_by_words['word_count'].tolist()
top_10_word_counts_indices = longest_by_words.index.tolist()

print("Top 10 highest word counts:")
for i, (index, word_count) in enumerate(zip(top_10_word_counts_indices, top_10_word_counts)):
    print(f"Index: {index}, Text {i+1}: {word_count} words")


## Determine 50 most common ICD-10 codes in dataset, misc. dataset stats

In [None]:
# vvv supresses output for conciseness vvv
%%capture

from collections import Counter

# Split the space-delimited ICD-10 codes into a list
codes_list = sampled_df['icd_codes'].str.split()

# Flatten the list of ICD-10 codes
flattened_codes = [code for sublist in codes_list for code in sublist]

# Calculate the number of medical codes per text
num_codes_per_text = codes_list.apply(len)

# Calculate the average number of medical codes per text
average_codes_per_text = num_codes_per_text.mean()

# Print the average number of medical codes per text
print(f"Average number of medical codes per text: {average_codes_per_text:.2f}")

# Count the occurrences of each ICD-10 code
code_counter = Counter(flattened_codes)

# Get the 50 most common ICD-10 codes
top_50_common_codes = code_counter.most_common(50)

# Create an ordered list of codes
ordered_codes = [code for code, _ in top_50_common_codes]

# Create a dictionary mapping code to frequency count
code_freq_dict = {code: count for code, count in top_50_common_codes}

# Print the ordered list of codes
print("Ordered List of Codes:")
for i, code in enumerate(ordered_codes):
    print(f"{i+1}. {code}")

# Print the dictionary mapping code to frequency count
print("\nCode to Frequency Count Dictionary:")
for code, count in code_freq_dict.items():
    print(f"{code}: {count}")

## Baseline: Predict top 16 codes 

In [None]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# Baseline prediction: the first 16 entries of ordered_codes, which lists
# codes from most to least frequent in the sample
top_16 = ordered_codes[:16]

# Turn one space-delimited code string into a set of codes
def process_codes(codes):
    """Return the distinct whitespace-separated codes found in ``codes``."""
    unique_codes = set()
    unique_codes.update(codes.split())
    return unique_codes

# Binarize the true codes of every note; the binarizer is fitted on all codes
# seen in the sample plus the top_16 baseline codes.
true_code_lists = [codes.split() for codes in sampled_df['icd_codes']]
mlb = MultiLabelBinarizer()
mlb.fit(true_code_lists + [top_16])

y_true = mlb.transform(true_code_lists)
# The baseline predicts the same top_16 codes for every note, so repeat that
# single row once per note instead of hard-coding the sample size.
y_pred = np.repeat(mlb.transform([top_16]), len(true_code_lists), axis=0)

# Calculate metrics
micro_auc = roc_auc_score(y_true, y_pred, average='micro')
macro_auc = roc_auc_score(y_true, y_pred, average='macro')
micro_f1 = f1_score(y_true, y_pred, average='micro')
macro_f1 = f1_score(y_true, y_pred, average='macro')


def precision_at_k(y_true, y_pred, k):
    """Return the mean precision of the first ``k`` predictions over all samples.

    Args:
        y_true: Iterable with one collection of true codes per sample.
        y_pred: Either a flat sequence of code strings, used as the shared
            ranked prediction for every sample, or a sequence holding one
            collection of predicted codes per sample (aligned with ``y_true``).
        k: Maximum number of predictions considered per sample.

    Returns:
        The mean, over samples, of (correct predictions among the first k) /
        (number of predictions considered). A sample without predictions
        contributes 0.0, and 0.0 is returned when there are no samples.

    Raises:
        ValueError: If per-sample predictions are given and their number
            differs from the number of samples.
    """
    y_true = list(y_true)
    y_pred = list(y_pred)

    # A flat list of strings is one ranking shared by all samples; anything
    # else is treated as one prediction collection per sample. Slicing the
    # outer list in that case would pick whole samples rather than codes.
    shared_ranking = all(isinstance(pred, str) for pred in y_pred)
    if shared_ranking:
        per_sample_preds = [y_pred] * len(y_true)
    else:
        if len(y_pred) != len(y_true):
            raise ValueError(
                f"y_pred has {len(y_pred)} entries but y_true has {len(y_true)}"
            )
        # Sets carry no ranking; sort them so the chosen k are deterministic.
        per_sample_preds = [
            sorted(preds) if isinstance(preds, (set, frozenset)) else list(preds)
            for preds in y_pred
        ]

    precisions = []
    for true_codes, preds in zip(y_true, per_sample_preds):
        top_k_preds = preds[:k]  # Get the top k predictions
        if not top_k_preds:
            # No predictions for this sample: avoid dividing by zero.
            precisions.append(0.0)
            continue
        correct_preds = sum(1 for code in top_k_preds if code in true_codes)
        precisions.append(correct_preds / len(top_k_preds))
    return np.mean(precisions) if precisions else 0.0


# Recover each note's true codes as a set from the binarized matrix
y_true_sets = list(map(set, mlb.inverse_transform(y_true)))

# From here on y_pred is the ranked list of baseline codes, not the matrix
y_pred = top_16

precision_at_5 = precision_at_k(y_true_sets, y_pred, k=5)

# Report all baseline metrics
for label, value in (
    ('Micro AUC:', micro_auc),
    ('Macro AUC:', macro_auc),
    ('Micro F1:', micro_f1),
    ('Macro F1:', macro_f1),
    ('Precision P@5:', precision_at_5),
):
    print(label, value)


In [None]:
# Display the 16 baseline codes
top_16

## Baseline: GPT 3.5 eval (turbo)

In [None]:
# standard GPT-3.5/4

# for Azure
# openai.api_type = "azure"
# openai.api_key = "..."
# openai.api_base = "https://example-endpoint.openai.azure.com"
# openai.api_version = "2023-03-15-preview"

def call_model(prompt, model="gpt-3.5-turbo", max_tokens=200):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text (instructions followed by the clinical note).
        model: Chat model name; the default is the model this cell always used.
        max_tokens: Upper bound on generated tokens; longer replies are cut off.

    Returns:
        The message content of the first (and only) completion choice.
    """
    # NOTE: this uses the legacy `openai.ChatCompletion` interface of the
    # pre-1.0 openai package.
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,    # Maximum number of tokens in the generated text. Longer responses are cut off.
        temperature=1,            # Controls randomness. Higher values make output more random, lower values (closer to 0) more deterministic.
        top_p=1,                  # Nucleus sampling threshold; an alternative to temperature for controlling diversity.
        frequency_penalty=0.0,    # Penalizes tokens by how often they already appear in the text so far. Ranges from -2.0 to 2.0.
        presence_penalty=0.0,     # Penalizes tokens that have already appeared in the text so far. Ranges from -2.0 to 2.0.
        n=1,                      # Number of completions to generate; only the first one is used below.
        stream=False,             # If true, generate the response as a stream to reduce latency.
        stop=None,                # A sequence (or list of sequences) where the API will stop generating further tokens.
    )
    return completion.choices[0].message.content

import openai
# Placeholder: supply an OpenAI API key here before running the cells below
openai.api_key = ''

In [None]:
import json
# Read the JSON file of valid codes; predictions are later checked for
# membership in legit_codes
with open('/Users/roshanswaroop/rema/rema/codemCodes.json', 'r') as f:
    legit_codes = json.loads(f.read())

In [None]:
# Zero-shot instruction that is prepended to every clinical note; it ends with
# a newline so the note starts on its own line.
prompt ='You are an experienced medical coder. You must identify all correct ICD-10 codes for the following clinical note. Pay attention to areas describing present illness, chart review, imaging, discharge labs, active issues, medications, chief complaint, major surgery/procedure, etc. Return your answer in the following format, but note that the actual correct codes may vary greatly from these: I10, E78.5, Z87.891\n'

print(prompt)

In [None]:
def get_code_descriptions(index):
    """Return the ICD codes of one ``eval_df`` row and their descriptions.

    Args:
        index: Row label in ``eval_df`` (looked up with ``.loc``, so it is a
            label, not a position within ``sampled_df``).

    Returns:
        A ``(code_list, description_list)`` pair of parallel lists. A code
        missing from ``icd_code_descriptions`` gets the description
        "No description available".
    """
    # icd_codes is a space-delimited string
    code_list = eval_df.loc[index]['icd_codes'].split()
    description_list = [
        icd_code_descriptions.get(code, "No description available")
        for code in code_list
    ]
    return code_list, description_list

def print_code_descriptions(index):
    """Print the ICD codes of one ``eval_df`` row and their descriptions.

    Args:
        index: Row label in ``eval_df``, passed through to
            ``get_code_descriptions``.
    """
    # Reuse the lookup helper instead of repeating its body here.
    code_list, description_list = get_code_descriptions(index)
    print("Code:", code_list)
    print("Description:", description_list)

In [None]:
import pandas as pd

# Reference codes and model predictions for each evaluated note
results = {'Original Codes': [], 'Predicted Codes': []}

for i in range(5):
    # Take the note and its reference codes from the same sampled_df row.
    # Looking the codes up by eval_df label `i` would pair the note with the
    # codes of an unrelated admission.
    row = sampled_df.iloc[i]
    note = row['text']
    inference = call_model(prompt + note)

    # Reference codes are stored as a space-delimited string
    original_codes = row['icd_codes'].split()
    results['Original Codes'].append(original_codes)

    # The model answers with comma-separated codes; split them into a list
    predicted_codes = inference.replace(",", " ").split()

    # Keep only predictions that are known codes; periods are removed for the
    # lookup only, the stored prediction keeps its original form
    legit_predicted_codes = [code for code in predicted_codes if code.replace(".", "") in legit_codes]
    results['Predicted Codes'].append(legit_predicted_codes)

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

In [None]:
# Display reference vs. predicted codes for the evaluated notes
results_df

In [None]:
# Truncate every code to its first three characters
original_codes_simplified = results_df['Original Codes'].apply(lambda full_codes: [c[:3] for c in full_codes])
predicted_codes_simplified = results_df['Predicted Codes'].apply(lambda full_codes: [c[:3] for c in full_codes])

# Fit the binarizer on every truncated code that occurs as a reference or a prediction
all_codes = set().union(*original_codes_simplified, *predicted_codes_simplified)
mlb = MultiLabelBinarizer()
mlb.fit([all_codes])

# Binarize reference and predicted labels
print(original_codes_simplified, predicted_codes_simplified)
y_true = mlb.transform(list(original_codes_simplified))
y_pred = mlb.transform(predicted_codes_simplified)

# Micro AUC over all labels; macro AUC is averaged by hand so that a class
# for which AUC is undefined counts as 0.5 (chance level)
micro_auc = roc_auc_score(y_true, y_pred, average='micro')
auc_scores = []
for true_column, pred_column in zip(y_true.T, y_pred.T):
    try:
        auc_scores.append(roc_auc_score(true_column, pred_column))
    except ValueError:
        auc_scores.append(0.5)
macro_auc = np.mean(auc_scores)
micro_f1 = f1_score(y_true, y_pred, average='micro')
macro_f1 = f1_score(y_true, y_pred, average='macro')

# Back to per-note sets of codes for precision at 5
y_true_sets = list(map(set, mlb.inverse_transform(y_true)))
y_pred_sets = list(map(set, mlb.inverse_transform(y_pred)))
precision_at_5 = precision_at_k(y_true_sets, y_pred_sets, 5)

# Report the metrics
for label, value in (
    ('Micro AUC:', micro_auc),
    ('Macro AUC:', macro_auc),
    ('Micro F1:', micro_f1),
    ('Macro F1:', macro_f1),
    ('Precision P@5:', precision_at_5),
):
    print(label, value)

In [None]:
# Number of notes in which each class is positive
column_sums = np.sum(y_true, axis=0)
# A class is degenerate when it is positive in every note or in none
has_single_valued_class = np.any((column_sums == len(y_true)) | (column_sums == 0))
if not has_single_valued_class:
    print('Every class has at least one positive and one negative instance.')
else:
    print('There is a class with only one type of instance.')

In [None]:
# Show full cell contents instead of truncating long code lists
pd.set_option('display.max_colwidth', None)
# Display the truncated reference codes
pd.DataFrame(original_codes_simplified)

In [None]:
# Display the truncated predicted codes
pd.DataFrame(predicted_codes_simplified)

In [None]:
# Print the reference codes and their descriptions for the eval_df row with
# label 0 (the clinical text itself is not printed)
print_code_descriptions(0)

## GPT-4 zero-shot

In [None]:
import openai
# Placeholder: supply an OpenAI API key here before running the cells below
openai.api_key = ''

def call_model(prompt, model="gpt-4", max_tokens=200):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text (instructions followed by the clinical note).
        model: Chat model name; the default is the model this cell always used.
        max_tokens: Upper bound on generated tokens; longer replies are cut off.

    Returns:
        The message content of the first (and only) completion choice.
    """
    # NOTE: this uses the legacy `openai.ChatCompletion` interface of the
    # pre-1.0 openai package.
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,    # Maximum number of tokens in the generated text. Longer responses are cut off.
        temperature=1,            # Controls randomness. Higher values make output more random, lower values (closer to 0) more deterministic.
        top_p=1,                  # Nucleus sampling threshold; an alternative to temperature for controlling diversity.
        frequency_penalty=0.0,    # Penalizes tokens by how often they already appear in the text so far. Ranges from -2.0 to 2.0.
        presence_penalty=0.0,     # Penalizes tokens that have already appeared in the text so far. Ranges from -2.0 to 2.0.
        n=1,                      # Number of completions to generate; only the first one is used below.
        stream=False,             # If true, generate the response as a stream to reduce latency.
        stop=None,                # A sequence (or list of sequences) where the API will stop generating further tokens.
    )
    return completion.choices[0].message.content

# Zero-shot instruction that is prepended to every clinical note; it ends with
# a newline so the note starts on its own line.
prompt ='You are an experienced medical coder. You must identify all correct ICD-10 codes for the following clinical note. Pay attention to areas describing present illness, chart review, imaging, discharge labs, active issues, medications, chief complaint, major surgery/procedure, etc. Return your answer in the following format, but note that the actual correct codes may vary greatly from these: I10, E78.5, Z87.891\n'

print(prompt)

def get_code_descriptions(index):
    """Return the ICD codes of one ``eval_df`` row and their descriptions.

    Args:
        index: Row label in ``eval_df`` (looked up with ``.loc``, so it is a
            label, not a position within ``sampled_df``).

    Returns:
        A ``(code_list, description_list)`` pair of parallel lists. A code
        missing from ``icd_code_descriptions`` gets the description
        "No description available".
    """
    # icd_codes is a space-delimited string
    code_list = eval_df.loc[index]['icd_codes'].split()
    description_list = [
        icd_code_descriptions.get(code, "No description available")
        for code in code_list
    ]
    return code_list, description_list

import pandas as pd

# Reference codes and model predictions for each evaluated note
results = {'Original Codes': [], 'Predicted Codes': []}

for i in range(5):
    # Take the note and its reference codes from the same sampled_df row.
    # Looking the codes up by eval_df label `i` would pair the note with the
    # codes of an unrelated admission.
    row = sampled_df.iloc[i]
    note = row['text']
    inference = call_model(prompt + note)

    # Reference codes are stored as a space-delimited string
    original_codes = row['icd_codes'].split()
    results['Original Codes'].append(original_codes)

    # The model answers with comma-separated codes; split them into a list
    predicted_codes = inference.replace(",", " ").split()

    # Keep only predictions that are known codes; periods are removed for the
    # lookup only, the stored prediction keeps its original form
    legit_predicted_codes = [code for code in predicted_codes if code.replace(".", "") in legit_codes]
    results['Predicted Codes'].append(legit_predicted_codes)

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

In [None]:
# Display reference vs. predicted codes for the GPT-4 zero-shot run
results_df

In [None]:
 def evaluate(results_df):  
    # Get the simplified codes
    original_codes_simplified = results_df['Original Codes'].apply(lambda codes: [code[:3] for code in codes])
    predicted_codes_simplified = results_df['Predicted Codes'].apply(lambda codes: [code[:3] for code in codes])

    # Create binary representations for each code
    mlb = MultiLabelBinarizer()

    # Get all unique codes in the dataset
    all_codes = set()
    for codes in original_codes_simplified:
        all_codes.update(codes)
    for codes in predicted_codes_simplified:
        all_codes.update(codes)

    #print(all_codes)
    # Fit the binarizer on all unique codes
    mlb.fit([all_codes])

    # Now transform your labels
    #print(original_codes_simplified, predicted_codes_simplified)
    y_true = []
    for line in original_codes_simplified:
        y_true.append(line)
    #print(y_true)
    y_true = mlb.transform(y_true)
    y_pred = mlb.transform(predicted_codes_simplified)

    #print(y_true)


    # Calculate metrics
    micro_auc = roc_auc_score(y_true, y_pred, average='micro')
    auc_scores = []
    for class_index in range(y_true.shape[1]):
        try:
            class_auc = roc_auc_score(y_true[:, class_index], y_pred[:, class_index])
            auc_scores.append(class_auc)
        except ValueError:
            auc_scores.append(0.5)  # means it's as good as random for that class instance
    macro_auc = np.mean(auc_scores)
    micro_f1 = f1_score(y_true, y_pred, average='micro')
    macro_f1 = f1_score(y_true, y_pred, average='macro')

    # Convert y_true back to the set representation
    y_true_sets = [set(codes) for codes in mlb.inverse_transform(y_true)]
    y_pred_sets = [set(codes) for codes in mlb.inverse_transform(y_pred)]

    # Calculate precision at 5
    precision_at_5 = precision_at_k(y_true_sets, y_pred_sets, 5)

    # Print the results
    print('Micro AUC:', micro_auc)
    print('Macro AUC:', macro_auc)
    print('Micro F1:', micro_f1)
    print('Macro F1:', macro_f1)
    print('Precision P@5:', precision_at_5)

In [None]:
def print_code_descriptions(results_df, index):
    """Print the reference and predicted codes of one row with descriptions.

    Args:
        results_df: DataFrame with 'Original Codes' and 'Predicted Codes'
            columns holding lists of code strings.
        index: Row label in ``results_df`` (looked up with ``.loc``).
    """
    row = results_df.loc[index]
    labelled_codes = (
        ('Original', row['Original Codes']),
        ('Predicted', row['Predicted Codes']),
    )

    for label, code_list in labelled_codes:
        print(f"{label} Codes and Descriptions:")
        for code in code_list:
            # Descriptions are looked up without periods in the code
            description = icd_code_descriptions.get(code.replace(".", ""), "No description available")
            print("Code:", code)
            print("Description:", description)
        print("\n")

# Show reference and predicted codes with descriptions for the row labelled 1
print_code_descriptions(results_df, 1)

In [None]:
# Print the metrics for the GPT-4 zero-shot predictions
evaluate(results_df)

## GPT 4 arbitrary few-shot

In [None]:
# Few-shot instruction: the zero-shot wording followed by three example
# vignettes, each followed by its codes.
# NOTE(review): the first example (alcohol dependence with withdrawal symptoms)
# is labelled R10.819, which looks unrelated to the vignette; confirm the
# intended code.
# NOTE(review): the string does not end with a newline, so a note appended to
# it follows the last example's codes directly; confirm this is intended.
few_shot = 'You are an experienced medical coder. You must identify all correct ICD-10 codes for the following clinical note. Pay attention to areas describing present illness, chart review, imaging, discharge labs, active issues, medications, chief complaint, major surgery/procedure. Return your answer in the following format, but note that the actual correct codes may vary greatly from these: I10, E78.5, Z87.891. Here are a few examples: \n Patient admits to a history of alcohol dependence. Consuming 5 – 6 beers per day now, down from 10 – 12 per day 6 months ago. States that he has nausea and sweating with “the shakes” when he does not drink. \n R10.819 \n Patient stopped taking olmesartan medoxomil due to side effects, including a headache that began after starting the medication and still exists, and tiredness. \n T46.5X6A, Z91.128 \n  38 year old established female seen one week ago for decreased exercise tolerance and general malaise over the past four weeks when doing her daily aerobics class. Labs were ordered on that visit. She presents today with pale skin, weakness, and epigastric pain; symptoms are unchanged since previous visit. Laboratory studies reviewed today are as follows: HGB 8.5 gm/dL, HCT 27%, platelets 300,000/mm3, reticulocytes 0.24%, MCV 75, serum iron 41 mcg/dL, serum ferritin 9 ng/ml, TIBC 457 mcg/dL; Fecal occult blood test is positive. She takes Esomeprazole daily for GERD with esophagitis and reports taking OTC antacids at bedtime for epigastric pain for the past three months. She also uses ibuprofen as needed for headaches. \n D50.0, K21.0'

print(few_shot)

In [None]:
# Reference codes and few-shot predictions for each evaluated note
results2 = {'Original Codes': [], 'Predicted Codes': []}


for i in range(5):
    # Take the note and its reference codes from the same sampled_df row.
    # Looking the codes up by eval_df label `i` would pair the note with the
    # codes of an unrelated admission.
    row = sampled_df.iloc[i]
    note = row['text']
    inference = call_model(few_shot + note)

    # Reference codes are stored as a space-delimited string
    original_codes = row['icd_codes'].split()
    results2['Original Codes'].append(original_codes)

    # The model answers with comma-separated codes; split them into a list
    predicted_codes = inference.replace(",", " ").split()

    # Keep only predictions that are known codes; periods are removed for the
    # lookup only, the stored prediction keeps its original form
    legit_predicted_codes = [code for code in predicted_codes if code.replace(".", "") in legit_codes]
    results2['Predicted Codes'].append(legit_predicted_codes)

# Build the frame from results2 (this run); `results` holds the zero-shot run
results_df2 = pd.DataFrame(results2)

In [None]:
# Print the metrics for the last five rows of results_df2
evaluate(results_df2.tail(5))

In [None]:
# Display the last five rows of results_df2
results_df2.tail(5)

In [None]:
# Hand-copied (reference codes, predicted codes) pairs, one tuple per note
data = [
    (['G3183', 'F0280', 'R441', 'R296', 'E785', 'Z8546'], ['I25.10', 'N18.9', 'E78.5', 'I10', 'K57.90', 'K21.9', 'E66.01', 'M19.90', 'C61', 'H25.9', 'Z48.813', 'Z95.1', 'Z87.891']),
    (['C675', 'I10', 'D259', 'Z87891', 'E785', 'E890'], ['I49.8', 'I48.91', 'I25.10', 'I25.5', 'I47.2', 'G47.30']),
    (['J441', 'N179', 'Z9981', 'I4891', 'D649', 'I10', 'E785', 'G5622', 'I2510', 'M1990', 'Z96649', 'Z87891', 'J45909', 'F419', 'G4700', 'R040', 'I739'], ['D62', 'D50.8', 'I73.9']),
    (['K31811', 'B1910', 'S0990XA', 'G629', 'D62', 'F1120', 'I452', 'I6523', 'G40909', 'I951', 'F319', 'Q2733', 'I10', 'W01198A', 'Y92008', 'I701', 'M5416', 'E039', 'E785', 'J449', 'K219', 'Z86718', 'Z87891', 'K2270', 'R110', 'T402X5A', 'Y929', 'I739', 'I69398', 'R531', 'R42', 'N3090', 'R079', 'I459', 'K5900'], ['O80', 'Z37.0', 'O82', 'Z3A.37', 'N47.0']),
    (['T8453XA', 'D62', 'N179', 'D709', 'B9562', 'D696', 'I10', 'E785', 'I2510', 'E860', 'H409', 'B9689', 'N400', 'Z951', 'Z8673', 'Z96652', 'Z954', 'Y792', 'Y929'], ['O09.212', 'Z33.1', 'J45.909', 'Z87.01', 'A54.9', 'O60.00'])
]

# Relative difference in code count (right vs. left list), in percent, per pair
percent_diffs = [
    ((len(right) - len(left)) / len(left)) * 100
    for left, right in data
]

avg_percent_diff = sum(percent_diffs) / len(percent_diffs)

# Report the mean over all pairs
print(f"Average Percentage Difference: {avg_percent_diff:.2f}%")


## GPT3.5 to structure/clean data -> GPT-4 inference

In [None]:
# Two-stage pipeline prompts: structure_prompt asks the first model to keep only
# the coding-relevant text of a note; prompt is the zero-shot coding instruction
# that is applied to that condensed text.
structure_prompt = 'You are a medical coder. Return only clinically important text from this documentation that may influence ICD-10 coding. Pay special attention to areas describing present illness, chart review, imaging, discharge labs, active issues, medications, chief complaint, major surgery/procedure, etc.:'
prompt ='You are an experienced medical coder. You must identify all correct ICD-10 codes for the following clinical note. Pay attention to areas describing present illness, chart review, imaging, discharge labs, active issues, medications, chief complaint, major surgery/procedure, etc. Return your answer in the following format, but note that the actual correct codes may vary greatly from these: I10, E78.5, Z87.891\n'

print(prompt)
print(structure_prompt)

In [None]:
import openai
# Placeholder: supply an OpenAI API key here before running the code below
openai.api_key = ''

def call_turbo(prompt, model="gpt-3.5-turbo", max_tokens=600):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text (instructions followed by the clinical note).
        model: Chat model name; the default is the model this function always used.
        max_tokens: Upper bound on generated tokens; longer replies are cut off.

    Returns:
        The message content of the first (and only) completion choice.
    """
    # NOTE: this uses the legacy `openai.ChatCompletion` interface of the
    # pre-1.0 openai package.
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,    # Maximum number of tokens in the generated text. Longer responses are cut off.
        temperature=1,            # Controls randomness. Higher values make output more random, lower values (closer to 0) more deterministic.
        top_p=1,                  # Nucleus sampling threshold; an alternative to temperature for controlling diversity.
        frequency_penalty=0.0,    # Penalizes tokens by how often they already appear in the text so far. Ranges from -2.0 to 2.0.
        presence_penalty=0.0,     # Penalizes tokens that have already appeared in the text so far. Ranges from -2.0 to 2.0.
        n=1,                      # Number of completions to generate; only the first one is used below.
        stream=False,             # If true, generate the response as a stream to reduce latency.
        stop=None,                # A sequence (or list of sequences) where the API will stop generating further tokens.
    )
    return completion.choices[0].message.content


# Reference codes and predictions of the two-stage (structure -> code) pipeline
results3 = {'Original Codes': [], 'Predicted Codes': []}

for i in range(5):
    # Take the note and its reference codes from the same sampled_df row.
    # Looking the codes up by eval_df label `i` would pair the note with the
    # codes of an unrelated admission.
    row = sampled_df.iloc[i]
    note = row['text']

    # Stage 1 condenses the note; stage 2 codes the condensed text
    processed_note = call_turbo(structure_prompt + note)
    inference = call_model(prompt + processed_note)

    # Reference codes are stored as a space-delimited string
    original_codes = row['icd_codes'].split()
    results3['Original Codes'].append(original_codes)

    # The model answers with comma-separated codes; split them into a list
    predicted_codes = inference.replace(",", " ").split()

    # Keep only predictions that are known codes; periods are removed for the
    # lookup only, the stored prediction keeps its original form
    legit_predicted_codes = [code for code in predicted_codes if code.replace(".", "") in legit_codes]
    results3['Predicted Codes'].append(legit_predicted_codes)

# Convert results to a DataFrame and print its metrics
results_df3 = pd.DataFrame(results3)
evaluate(results_df3)


In [None]:
# Relative difference (in percent) between the number of predicted and
# reference codes per note, computed from and stored on results_df3 itself
results_df3["Percentage Difference"] = ((results_df3["Predicted Codes"].apply(len) - results_df3["Original Codes"].apply(len)) / results_df3["Original Codes"].apply(len)) * 100

In [None]:
# Number of reference and predicted codes per note
original_counts = results_df3["Original Codes"].apply(len)
predicted_counts = results_df3["Predicted Codes"].apply(len)
results_df3["Original Codes Count"] = original_counts
results_df3["Predicted Codes Count"] = predicted_counts

# Relative difference between predicted and reference counts, in percent
results_df3["Percentage Difference"] = ((predicted_counts - original_counts) / original_counts) * 100

# Mean of the per-note differences
avg_percent_diff = results_df3["Percentage Difference"].mean()

print(f"Average Percentage Difference: {avg_percent_diff:.2f}%")

## DSP, KNN + GPT-4

In [None]:
import dsp

import os

# Set KMP_DUPLICATE_LIB_OK for this process.
# NOTE(review): presumably this avoids the abort raised when more than one
# OpenMP runtime gets loaded by the retrieval libraries; confirm it is needed.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
def getpreferredencoding(do_setlocale = True):
    """Return "UTF-8" regardless of ``do_setlocale``."""
    forced_encoding = "UTF-8"
    return forced_encoding

locale.getpreferredencoding = getpreferredencoding
pip install faiss-cpu
pip install sentence-transformers

# Configure DSP to use its sentence-transformers vectorizer
dsp.settings.configure(vectorizer=dsp.SentenceTransformersVectorizer())
# NOTE(review): squad_train is not defined in the visible cells; confirm where
# the examples for the KNN index come from.
knn_func = dsp.knn(squad_train)
# NOTE(review): openai_key is not defined in the visible cells; confirm.
lm = dsp.GPT4(model='gpt-4', api_key=openai_key)

@dsp.transformation
def inf(example, k=3):
    """Attach KNN demos and retrieved context to ``example`` and generate.

    Args:
        example: DSP example; its ``question`` attribute is used for retrieval.
        k: Number of demos requested from ``knn_func``.

    Returns:
        The completions produced by the 'qa' generation stage.
    """
    # k nearest demos for this example
    example.demos = knn_func(example, k)
    # Two retrieved passages for the example's question
    example.context = dsp.retrieve(example.question, k=2)
    generate_answer = dsp.generate(qa_template_with_passages, temperature=0.6)
    example, completions = generate_answer(example, stage='qa')
    return completions