In [None]:
import json
import os
import re

import pandas as pd

In [None]:
# Load the transcripts from JSON. JSON text is UTF-8 by specification, so the
# encoding is named explicitly instead of relying on the platform's locale
# default (which is not UTF-8 everywhere).
with open("translated_transcripts.json", 'r', encoding="utf-8") as file:
    data = json.load(file)

# Matches any single character that is not an ASCII letter, a digit or whitespace.
special_char_pattern = re.compile(r'[^a-zA-Z0-9\s]')

# Every distinct character matched by the pattern in any transcript value.
special_characters = {
    char
    for text in data.values()
    for char in special_char_pattern.findall(text)
}

def has_error(word):
    """Return True when `word` looks malformed.

    A word is flagged if it contains both a digit and a letter, or if any of
    its characters is a member of the module-level `special_characters` set.
    """
    contains_digit = any(ch.isdigit() for ch in word)
    contains_letter = any(ch.isalpha() for ch in word)
    if contains_digit and contains_letter:
        return True
    # Fall back to the special-character membership check.
    return any(ch in special_characters for ch in word)

def contains_errors_based_on_criteria(transcript):
    """Return True if any word in `transcript` is flagged by `has_error`.

    Words are the matches of the regex ``\\b\\w+\\b``; a transcript with no
    such match yields False.
    """
    for token in re.finditer(r'\b\w+\b', transcript):
        if has_error(token.group()):
            return True
    return False

In [None]:
# IDs (keys of `data`) whose transcript is flagged by the error heuristic,
# in the mapping's iteration order.
transcript_ids_with_errors = [
    tid
    for tid in data
    if contains_errors_based_on_criteria(data[tid])
]

In [None]:
# Number of transcripts flagged by the error heuristic.
len(transcript_ids_with_errors)

In [None]:
# Display the full list of flagged transcript IDs.
transcript_ids_with_errors

In [None]:
# Split the mapping into two parallel lists: its keys and the matching values.
ids = list(data)
transcripts = [data[key] for key in ids]

In [None]:
# One row per transcript, then keep only the flagged IDs and renumber the rows
# from 0 without retaining the old index as a column.
df = pd.DataFrame({"IDs": ids, "Transcripts": transcripts})
flagged_mask = df["IDs"].isin(transcript_ids_with_errors)
df = df[flagged_mask].reset_index(drop=True)

def no_n(x):
    """Return `x` with every newline character replaced by a single space."""
    return re.sub(r'\n', ' ', x)

In [None]:
# Replace newlines with spaces in every transcript (see no_n).
df["Transcripts"] = df["Transcripts"].apply(no_n)

In [None]:
# First batch: rows 0-17 of the flagged transcripts. `.copy()` makes df_1 an
# independent frame, so later writes to it neither raise
# SettingWithCopyWarning nor risk writing through to `df`.
df_1 = df.iloc[0:18].copy()

In [None]:
# Replacement transcript text for the row with IDs == "608" (used in the assignment cell below).
t_608 = "D: Hi Mr. Adair, I understand that you have been diagnosed with hepatitis A. How have you been feeling lately? P: Well, Doc, I've been feeling pretty bad - like, real bad. I've got nausea all the time, and my stomach hurts. I've also been having diarrhea, and my joints ache, you know? D: I see. It's not surprising that you're feeling this way given your condition. Hepatitis A is a viral infection that affects the liver and can cause a range of symptoms, including the ones you mentioned. P: Is there anything I can do to make myself feel better? D: First and foremost, you should consult the nearest hospital and follow their advice. In addition, make sure to wash your hands thoroughly with soap and water, avoid fatty and spicy foods, and take medication as prescribed by the hospital. P: Okay, I'll do that. Thank you for your help, Doctor. D: You're welcome, Mr. Adair. It's important that you take care of yourself and follow the doctor's advice. Please come back if you have any further concerns."

In [None]:
# Replacement transcript text for the row with IDs == "2287" (used in the assignment cell below).
t_2287 = "During the visit, I assessed Jackie Phillips, a 74-year-old patient, who presented with symptoms of hepatitis E, including joint pain, abdominal pain, and high fever. After evaluating her condition, I determined that she had hepatitis E. I advised her to stop consuming alcohol, rest, and take no medication. Since her symptoms were not severe, I did not prescribe any medication. I recommended that she follow up with her primary care physician for further management of her condition."

In [None]:
# Replacement transcript text for the row with IDs == "1766" (used in the assignment cell below).
t_1766 =  "D:Good morning, Cynthia. How can I help you today? P:Hi, Doctor. I've been experiencing dizziness, back pain, and neck pain recently. D:I see, Cynthia. Based on your symptoms and your age, I believe you may have cervical spondylosis. P:What is that, Doctor? D:Cervical spondylosis is a condition that affects the neck, especially the discs and joints in the neck. It's common in older adults and can cause pain and stiffness. P:Oh, I see. Is there anything I can do to feel better? D:Yes, there are some precautions you can take. You can use a heating pad or cold pack to relieve the pain, exercise to keep your neck flexible, and take over-the-counter pain relievers like ibuprofen. P:Ok, I will try those. Thank you, Doctor. D:You're welcome, Cynthia. Let's schedule a follow-up appointment in a few weeks to see how you're doing."

In [None]:
# Replacement transcript text for the row with IDs == "4381" (used in the assignment cell below).
t_4381 = "During the visit, I, the doctor, evaluated Lucy Houser's symptoms and medical history and determined that she has arthritis. Lucy complained of stiffness in her movement, muscle weakness, and a stiff neck. I advised Lucy on some precautions to help manage her arthritis symptoms. These precautions included engaging in regular exercise, using hot and cold therapy, trying acupuncture, and massage. As her symptoms were mainly due to the arthritis, I did not prescribe any medication. Instead, I recommended that Lucy continues with her regular medication for arthritis and follows up with me for a check-up in two weeks."

In [None]:
# Replacement transcript text for the row with IDs == "4623" (used in the assignment cell below).
t_4623 = "During today's visit, I, the doctor, diagnosed Mr. Kyle Acuff, age 60, with hypothyroidism. The patient was experiencing symptoms such as fatigue, weight gain, and brittle nails. I advised Mr. Acuff to reduce stress, exercise regularly, eat healthily, and get proper sleep. I also prescribed him the medication Tirosint-Sol to manage his condition. Overall, I emphasized the importance of lifestyle changes and medication adherence for his recovery."

In [None]:
# Replacement transcript text for the row with IDs == "3288" (used in the assignment cell below).
t_3288 = "D: Hello David, I'm the doctor. You have been diagnosed with gastroenteritis. P: Oh, okay. I've been feeling really sick with vomiting, sunken eyes, and diarrhea. D: That sounds like gastroenteritis. It's common and usually goes away on its own. P: Is there anything I can do to feel better? D: Yes, I would advise you to stop eating solid food for a while, try taking small sips of water, rest, and ease back into eating when you feel better. P: Ok, thanks. D: No problem. If your symptoms worsen or don't improve within a week, please come back for a follow-up appointment. In the meantime, make sure to stay hydrated and get plenty of rest. P: Thank you, doctor. I will do that."

In [None]:
# First 6: overwrite these transcripts with their replacement texts.
#
# The previous form, df_1[mask]["Transcripts"] = value, is chained indexing:
# df_1[mask] produces a temporary frame, the assignment lands on that
# temporary, and df_1 itself is left unchanged. A single .loc[mask, column]
# assignment writes into df_1 directly.
df_1 = df_1.copy()  # own the data so the writes below cannot alias `df`
replacements = {
    "608": t_608,
    "2287": t_2287,
    "1766": t_1766,
    "4381": t_4381,
    "4623": t_4623,
    "3288": t_3288,
}
for transcript_id, replacement_text in replacements.items():
    df_1.loc[df_1["IDs"] == transcript_id, "Transcripts"] = replacement_text

In [None]:
# Write this batch to CSV without the index column.
df_1.to_csv("clean1_csv.csv", index = False)

In [None]:
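# Prerequisite: install the OpenAI client, e.g. `%pip install openai`.
# The cells below call `openai.Completion.create`, which is only available in
# openai releases before 1.0.

# OpenAI API key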

In [None]:
import openai

In [None]:
# Read the key from the environment instead of hard-coding it in the notebook,
# so it is never saved or committed with the file. Falls back to the previous
# empty string when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [None]:
def get_gpt_response(trans, engine="text-davinci-002", temperature=0.2, max_tokens=200):
    """Ask an OpenAI completion model to clean a transcript and return its text.

    Parameters
    ----------
    trans
        Transcript text; interpolated at the start of the prompt, followed by
        the instruction "Clean this text."
    engine, temperature, max_tokens
        Forwarded unchanged to ``openai.Completion.create``. The defaults are
        the values that were previously hard-coded, so existing calls behave
        as before.

    Returns
    -------
    The ``text`` attribute of the first choice in the response.
    """
    # NOTE: the second prompt line keeps its leading indentation inside the
    # string, exactly as before, so the prompt sent to the model is unchanged.
    prompt = f'''{trans}.
    Clean this text.'''
    response = openai.Completion.create(
        engine=engine,
        prompt=prompt,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return response.choices[0].text

In [None]:
def check_prompt(trans):
    """Return the correction prompt with `trans` appended after the instruction."""
    return f'''Correct the transcript. Do not add content that is not there originally: {trans}'''

In [None]:
# clean1 is read with its header row; the other two files are read with
# header=None, so their first line is treated as data.
clean1 = pd.read_csv("clean1_csv.csv")
clean2, clean3 = (
    pd.read_csv(path, header=None)
    for path in ("cleaned.csv", "cleaned_3.csv")
)

In [None]:
# Convert clean1's IDs to strings, then give all three frames the same column names.
clean1["IDs"] = clean1["IDs"].map(str)
for frame in (clean1, clean2, clean3):
    frame.columns = ["IDs", "Transcripts"]

In [None]:
# Stack the three batches and write them to one CSV without the index column.
combined = pd.concat([clean1, clean2, clean3])
combined.to_csv("Cleaned_54.csv", index=False)