# Processing Transcription Data

## Setup

### Installing Packages

In [0]:
!pip install langchain==0.0.344
!pip install openai==1.3.6
%pip install --force-reinstall typing-extensions==4.8.0
%pip install tiktoken
#!pip install langchain-community
#!pip install langchain-openai

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting langchain==0.0.344
  Downloading langchain-0.0.344-py3-none-any.whl (1.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/1.9 MB 20.4 MB/s eta 0:00:00
Installing collected packages: langchain
  Attempting uninstall: langchain
    Found existing installation: langchain 0.0.348
    Not uninstalling langchain at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-dd3d826e-f34a-4a17-b9fc-86179a047023
    Can't uninstall 'langchain'. No files were found to uninstall.
Successfully installed langchain-0.0.344
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting openai==1.3.6
  Downloading openai-1.3.6-py3-none-any.whl (220 kB

In [0]:
# Restart the Python process so the packages installed in the previous cell are
# picked up (the pip output above recommends exactly this call).
dbutils.library.restartPython()

### Importing Packages

In [0]:
### OpenAI
#from langchain.chat_models import AzureChatOpenAI
from langchain.chat_models.azure_openai import AzureChatOpenAI
#from langchain_community.chat_models import ChatOpenAI
from langchain.llms import AzureOpenAI
from langchain.llms import OpenAI

### Callbacks
from langchain.callbacks import get_openai_callback

### Text Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

### Prompt Templates
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.schema.messages import SystemMessage

### Python
import pandas as pd
import time
import os
import re
import datetime

### Setting Environment Variables 

In [0]:
# Non-secret Azure OpenAI settings: endpoint, API version and deployment/model names.
os.environ['OPENAI_EMBEDDING_DEPLOYMENT_NAME'] = "t-e-ada-002"
os.environ['OPENAI_EMBEDDING_MODEL_NAME'] = "text-embedding-ada-002"
os.environ['OPENAI_API_BASE'] = "https://gsk-ds.openai.azure.com/"
os.environ['OPENAI_API_TYPE'] = "Azure"
os.environ['OPENAI_API_VERSION'] = "2023-03-15-preview"
os.environ['OPENAI_DEPLOYMENT_NAME'] = "ds-gpt-4"
os.environ['OPENAI_MODEL_NAME'] = "gpt-4"
os.environ['OPENAI_DEPLOYMENT_NAME_3_5'] = "ds-gpt-35"
os.environ['OPENAI_MODEL_NAME_3_5'] = "gpt-35-turbo"

# SECURITY: the API key is a credential and must never be written into the notebook.
# It has to be supplied from outside (e.g. a cluster environment variable, or a
# Databricks secret copied into os.environ['OPENAI_API_KEY'] before this cell runs).
# Any key that was previously committed in this notebook should be rotated.
if not os.environ.get('OPENAI_API_KEY'):
    raise EnvironmentError(
        "OPENAI_API_KEY is not set. Provide it through a cluster environment variable "
        "or a secret store (e.g. dbutils.secrets.get) instead of hardcoding it."
    )

## Instantiating Generative AI Components

### Instantiating LLMs

In [0]:
# Chat-completion client: this is the model object that llm_data_process calls
# for every transcript chunk. All connection settings come from the environment
# variables configured earlier in the notebook.
chat_llm = AzureChatOpenAI(
    temperature=0,
    verbose=True,
    openai_api_type="azure",
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
    openai_api_base=os.environ.get("OPENAI_API_BASE"),
    openai_api_version=os.environ.get("OPENAI_API_VERSION"),
    deployment_name=os.environ.get("OPENAI_DEPLOYMENT_NAME"),
)

# Completion-style client bound to the same deployment.
# NOTE(review): `llm` is not referenced anywhere else in the visible part of this
# notebook — confirm whether it is still needed.
llm = AzureOpenAI(
    temperature=0,
    model_name=os.environ.get("OPENAI_MODEL_NAME"),
    deployment_name=os.environ.get("OPENAI_DEPLOYMENT_NAME"),
)



### Creating Text Splitter

In [0]:
# Splitter used by llm_data_process to cut each transcript into chunks before
# sending them to the chat model; chunks do not overlap (chunk_overlap=0).
# NOTE(review): the 4000 limit is presumably chosen to fit the model context
# window together with the system prompt — confirm before changing it.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_overlap=0, chunk_size=4000)

## Functions

### Extraction Functions

In [0]:
def extract_data(transcript, pattern, keys):
    """Extract every regex match from ``transcript`` as a dict of named fields.

    Args:
        transcript: Text to search.
        pattern: Regular expression whose capturing groups are the fields.
        keys: Field names, in the same order as the capturing groups.

    Returns:
        A list with one dict per match, mapping each key to the text captured
        by the corresponding group. Groups that did not take part in a match
        map to ``""``. If there are more keys than groups (or vice versa) the
        extras are dropped, as with ``zip``.
    """
    # Match.groups() always yields a tuple, even for a single capturing group.
    # re.findall would return bare strings in that case, and zipping the keys
    # with a string pairs them with individual characters instead of the field.
    return [
        dict(zip(keys, found.groups(default="")))
        for found in re.finditer(pattern, transcript)
    ]
  

def extract_timestamp_speaker_text(transcript):
    """Parse raw transcript lines into start/end time, speaker and text.

    Each matching line has the form
    ``(HH:MM:SS.mmm - HH:MM:SS.mmm) Speaker [NAME] : TEXT``.

    Returns:
        A list of dicts with the keys ``start_time``, ``end_time``,
        ``speaker`` and ``text`` (one dict per matching line).
    """
    return extract_data(
        transcript,
        r"\((\d{2}:\d{2}:\d{2}\.\d{2,3}) - (\d{2}:\d{2}:\d{2}\.\d{2,3})\) Speaker \[([^\]]+)\] : (.+)",
        ['start_time', 'end_time', 'speaker', 'text'],
    )


def extract_index_speaker(transcript):
    """Parse indexed transcript lines of the form ``(INDEX) [SPEAKER]: TEXT``.

    Returns:
        A list of dicts with the keys ``index``, ``speaker`` and ``text``
        (one dict per matching line).
    """
    # The speaker group is limited to characters other than "]" so that it ends
    # at the first closing bracket. A greedy ".+" would run on to the LAST
    # "]: " on the line and swallow part of the spoken text whenever the text
    # itself contains a bracketed fragment followed by a colon.
    pattern = r"\((\d+)\) \[([^\]]+)\]: (.+)"
    keys = ['index', 'speaker', 'text']

    return extract_data(transcript, pattern, keys)


def extract_index_timestamp(transcript):
    """Parse indexed timestamp lines of the form ``(INDEX) (START - END)``.

    Returns:
        A list of dicts with the keys ``index``, ``start_time`` and
        ``end_time`` (one dict per matching line).
    """
    return extract_data(
        transcript,
        r"\((\d+)\) \((\d{2}:\d{2}:\d{2}\.\d{2,3}) - (\d{2}:\d{2}:\d{2}\.\d{2,3})\)",
        ['index', 'start_time', 'end_time'],
    )

### Transcript Processing

In [0]:
def split_timestamps_and_transcript(input_relative_path, transcript_relative_path, timestamp_relative_path):
    """Split each raw transcript into an indexed text file and an indexed timestamp file.

    Only files whose name contains one of the entries of ``CALL_RECORDINGS``
    are processed. For an input file with stem ``NAME`` two files are written:

    * ``{transcript_relative_path}/NAME_cut.txt`` with lines ``(i) [speaker]: text``
    * ``{timestamp_relative_path}/NAME_timestamps.txt`` with lines ``(i) (start - end)``

    The index ``i`` is the position of the line among the parsed transcript
    lines, so the two files can be re-joined later.

    Args:
        input_relative_path: Directory containing the raw transcripts.
        transcript_relative_path: Output directory for the ``_cut.txt`` files.
        timestamp_relative_path: Output directory for the ``_timestamps.txt`` files.
    """
    for file_name in sorted(os.listdir(input_relative_path)):
        if not any(recording in file_name for recording in CALL_RECORDINGS):
            continue

        name = os.path.splitext(file_name)[0]

        # Read the input completely and close it before any output is written,
        # so no file handle is shadowed or kept open longer than needed.
        with open(f"{input_relative_path}/{file_name}", "r") as source_file:
            transcript = source_file.read()

        sections = extract_timestamp_speaker_text(transcript)
        reordered_transcript = [
            f"({i}) [{section['speaker']}]: {section['text']}"
            for i, section in enumerate(sections)
        ]
        timestamps = [
            f"({i}) ({section['start_time']} - {section['end_time']})"
            for i, section in enumerate(sections)
        ]

        with open(f"{transcript_relative_path}/{name}_cut.txt", "w") as cut_file:
            cut_file.write("\n".join(reordered_transcript))

        with open(f"{timestamp_relative_path}/{name}_timestamps.txt", "w") as timestamp_file:
            timestamp_file.write("\n".join(timestamps))

### LLM Processing

In [0]:
def llm_data_process(input_relative_path, output_relative_path, prompt_template, output_file_suffix, extra_info=None):
    """Run every matching transcript through the chat model, chunk by chunk.

    Files in ``input_relative_path`` whose name contains an entry of
    ``CALL_RECORDINGS`` are split with ``text_splitter``; each chunk is
    formatted into ``prompt_template`` (as ``text``) and sent to ``chat_llm``.
    The responses are joined with newlines and written to
    ``{output_relative_path}/{recording}_{output_file_suffix}.txt`` where
    ``recording`` is the matching ``CALL_RECORDINGS`` entry. Token usage,
    cost and elapsed time are printed per file.

    Args:
        input_relative_path: Directory with the input text files.
        output_relative_path: Directory for the model output files.
        prompt_template: Chat prompt template exposing a ``text`` variable.
        output_file_suffix: Suffix appended to the recording name in the output file name.
        extra_info: Optional dict of additional keyword arguments passed to
            ``prompt_template.format_messages``.
    """
    usage_fields = (
        "total_tokens",
        "prompt_tokens",
        "completion_tokens",
        "successful_requests",
        "total_cost",
    )
    # A missing/empty extra_info simply contributes no additional template arguments.
    template_kwargs = extra_info if extra_info else {}

    for file_name in sorted(os.listdir(input_relative_path)):
        # The walrus keeps the matching CALL_RECORDINGS entry; it names the output file.
        if not any((recording := candidate) in file_name for candidate in CALL_RECORDINGS):
            continue

        start_time = time.time()
        with open(f"{input_relative_path}/{file_name}", "r") as f:
            transcript = f.read()

        name = os.path.splitext(file_name)[0]
        print(f"Processing {name}")

        usage = dict.fromkeys(usage_fields, 0)
        output_list = []
        for chunk in text_splitter.create_documents([transcript]):
            with get_openai_callback() as cb:
                messages = prompt_template.format_messages(text=chunk.page_content, **template_kwargs)
                output = chat_llm(messages)
                output_list.append(output.content)

                # Accumulate the per-request usage figures reported by the callback
                for field in usage_fields:
                    usage[field] += getattr(cb, field)

        print(
            f"Tokens Used: {usage['total_tokens']}, Prompt Tokens: {usage['prompt_tokens']}, Completion Tokens: {usage['completion_tokens']}, Successful Requests: {usage['successful_requests']} \nTotal Cost (USD): ${round(usage['total_cost'], 2)}"
        )
        print(f"Time Taken: {round(time.time() - start_time, 2)} seconds")

        with open(f"{output_relative_path}/{recording}_{output_file_suffix}.txt", "w") as f:
            f.write("\n".join(output_list))

### Combining Functions

In [0]:
def combine_timestamp_and_transcript(timestamp_relative_path, transcript_relative_path, text_relative_path, csv_relative_path):
    """Re-attach timestamps to the speaker-labelled transcript lines.

    For every file in ``timestamp_relative_path`` whose name contains an entry
    of ``CALL_RECORDINGS`` (the matching entry is called ``recording`` below):

    * the timestamp file itself is parsed with ``extract_index_timestamp``;
    * ``{transcript_relative_path}/{recording}_speaker_assigned.txt`` is parsed
      with ``extract_index_speaker``;
    * the two are joined on their numeric line index;
    * the result is written to ``{csv_relative_path}/{recording}_sov.csv`` and,
      as text lines ``(start - end) Speaker [speaker] : text``, to
      ``{text_relative_path}/{recording}_combined.txt``.

    Timestamp entries without a transcript line of the same index are skipped
    and reported, rather than being paired with an unrelated line.

    Args:
        timestamp_relative_path: Directory with the ``*_timestamps.txt`` files.
        transcript_relative_path: Directory with the ``*_speaker_assigned.txt`` files.
        text_relative_path: Output directory for the combined text files.
        csv_relative_path: Output directory for the CSV files.
    """
    columns = ["start_time", "end_time", "speaker", "text"]

    for file_name in sorted(os.listdir(timestamp_relative_path)):
        if not any((recording := candidate) in file_name for candidate in CALL_RECORDINGS):
            continue

        start_time = time.time()

        # Open the file that was actually listed. Rebuilding the name from the
        # recording id fails when the stored stem carries extra parts
        # (e.g. "<recording>_transcript_timestamps.txt").
        with open(f"{timestamp_relative_path}/{file_name}", "r") as f:
            timestamp_entries = extract_index_timestamp(f.read())

        with open(f"{transcript_relative_path}/{recording}_speaker_assigned.txt", "r") as f:
            transcript_entries = extract_index_speaker(f.read())

        name = os.path.splitext(file_name)[0]
        print(f"Processing {name}")

        # Join on the line index instead of list position, so a dropped or
        # reordered transcript line cannot shift every following timestamp.
        transcript_by_index = {int(entry["index"]): entry for entry in transcript_entries}

        merged_data = {}
        skipped = 0
        for timestamp_entry in timestamp_entries:
            transcript_entry = transcript_by_index.get(int(timestamp_entry["index"]))
            if transcript_entry is None:
                skipped += 1
                continue
            merged_data[timestamp_entry["index"]] = {
                "start_time": timestamp_entry["start_time"],
                "end_time": timestamp_entry["end_time"],
                "speaker": transcript_entry["speaker"],
                "text": transcript_entry["text"],
            }

        if skipped:
            print(f"Warning: {skipped} timestamp line(s) had no transcript line with the same index")

        # Explicit columns keep the CSV header stable even when nothing matched
        transcript_df = pd.DataFrame(list(merged_data.values()), columns=columns)
        transcript_df.to_csv(f"{csv_relative_path}/{recording}_sov.csv", index=False)

        result = [
            f"({data['start_time']} - {data['end_time']}) Speaker [{data['speaker']}] : {data['text']}"
            for data in merged_data.values()
        ]

        with open(f"{text_relative_path}/{recording}_combined.txt", "w") as f:
            f.write("\n".join(result))

        print(f"Time Taken: {round(time.time() - start_time, 2)} seconds")

In [0]:
def combine_lines_with_same_speaker(text):
    """Merge consecutive transcript lines spoken by the same speaker.

    Lines are expected in the form ``(INDEX) [SPEAKER]: TEXT``. A run of
    consecutive lines with the same speaker becomes a single line
    ``(i, j, ...) [SPEAKER]: text_i text_j ...`` whose indexes are listed in
    order and whose texts are joined with single spaces. Indexes are
    normalised through ``int`` (so ``(07)`` is emitted as ``(7)``).

    Lines that do not match the expected format are kept exactly as they are
    and end the current run, so the same speaker appearing before and after
    such a line produces two separate output lines.

    Args:
        text: Newline-separated transcript text.

    Returns:
        The merged transcript as a newline-separated string.
    """
    line_pattern = re.compile(r"\((\d+)\)\s+\[(\w+)\]:\s(.+)")

    # Each entry is (indexes, speaker, fragments) for a speaker run, or
    # (None, None, raw_line) for a line that did not match the format.
    entries = []

    for line in text.split("\n"):
        found = line_pattern.match(line)
        if found is None:
            entries.append((None, None, line))
            continue

        line_index, speaker, text_content = found.groups()
        line_index = int(line_index)

        previous = entries[-1] if entries else None
        # Only extend the run when the immediately preceding entry is a speaker
        # run of the same speaker. A raw line in between starts a new run, which
        # also avoids touching a run that has already been closed.
        if previous is not None and previous[0] is not None and previous[1] == speaker:
            previous[0].append(line_index)
            previous[2].append(text_content)
        else:
            entries.append(([line_index], speaker, [text_content]))

    output_lines = []
    for indexes, speaker, content in entries:
        if indexes is None:
            # Unrecognised line: emit unchanged
            output_lines.append(content)
        else:
            joined_indexes = ", ".join(map(str, indexes))
            joined_text = " ".join(content)
            output_lines.append(f"({joined_indexes}) [{speaker}]: {joined_text}")

    return "\n".join(output_lines)

In [0]:
def join_sections(input_relative_path, output_relative_path):
    """Merge same-speaker runs in every matching transcript file.

    Each file in ``input_relative_path`` whose name contains an entry of
    ``CALL_RECORDINGS`` is passed through ``combine_lines_with_same_speaker``
    and the result is written to
    ``{output_relative_path}/{recording}_sectioned.txt``, where ``recording``
    is the matching ``CALL_RECORDINGS`` entry. Progress and elapsed time are
    printed per file.

    Args:
        input_relative_path: Directory with the speaker-labelled transcripts.
        output_relative_path: Directory for the ``_sectioned.txt`` files.
    """
    for file_name in sorted(os.listdir(input_relative_path)):
        # The walrus keeps the matching CALL_RECORDINGS entry; it names the output file.
        if not any((recording := candidate) in file_name for candidate in CALL_RECORDINGS):
            continue

        start_time = time.time()
        with open(f"{input_relative_path}/{file_name}", "r") as source_file:
            transcript = source_file.read()

        name = os.path.splitext(file_name)[0]
        print(f"Processing {name}")

        output_text = combine_lines_with_same_speaker(transcript)
        elapsed = round(time.time() - start_time, 2)
        print(f"Time Taken: {elapsed} seconds")

        with open(f"{output_relative_path}/{recording}_sectioned.txt", "w") as target_file:
            target_file.write(output_text)

In [0]:
def order_by_suffix(df, column):
    """Return ``df`` sorted by the integer after the last underscore in ``column``.

    Args:
        df: DataFrame to sort (not modified).
        column: Name of a string column whose values end in ``_<integer>``.

    Returns:
        A new DataFrame sorted ascending by that integer, with a fresh
        0..n-1 index. The temporary ``Index`` helper column is removed again.
    """
    suffix_numbers = df[column].str.split("_").str[-1].astype(int)
    with_sort_key = df.assign(Index=suffix_numbers)
    ordered = with_sort_key.sort_values(by=["Index"])
    without_sort_key = ordered.drop(columns=["Index"])
    return without_sort_key.reset_index(drop=True)

## Defining Chat Prompts

### Transcript Cleaning

In [0]:
# Prompt for the PII-removal step: a fixed system instruction followed by the
# transcript chunk, which is substituted for {text} when llm_data_process calls
# format_messages(text=...). The system text contains no template variables.
pii_cleanup = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content=(
                """
                You are a helpful assistant that is an expert at removing PII from transcripts.You will remove PII such as:Names,Addresses,Phone Numbers,Email Addresses,Dates.Do not remove PII such as:Product Names,Company Names,Surgery Names.Use the information given in the conversation to remove these PII mentions and replace them with **PII**.IMPORTANT : Return the EXACT transcript as input with the PII mentions removed and replaced with **PII**
                """
            )
        ),
        HumanMessagePromptTemplate.from_template("{text}"),
    ]
)


# Prompt for the medical-vocabulary correction step. Template variables:
#   {product} - product name the conversation is about (passed via extra_info)
#   {text}    - transcript chunk to correct
# The system message is declared as a ("system", template) pair on purpose:
# a ready-made SystemMessage instance is passed through by ChatPromptTemplate
# without formatting, which would send the literal text "{product}" to the model.
medical_word_cleanup = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
                You are a helpful assistant that is an expert at correcting transcripts with misrepresented words
                Look at the conversation below it talks about {product}
                The transcription is sometimes wrong and misrepresents medical words relating to {product} or its competitors
                Use the information given in the conversation to correct these errors
                IMPORTANT : Return the EXACT text as input  with the medical words substituted where they have been misrepresented
                """,
        ),
        HumanMessagePromptTemplate.from_template("{text}"),
    ]
)


# Prompt for the speaker-labelling step: a fixed system instruction followed by
# the transcript chunk, which is substituted for {text} when llm_data_process
# calls format_messages(text=...). The requested output format,
# "(INDEX) [SPEAKER]: TEXT", is the line format parsed by extract_index_speaker
# and combine_lines_with_same_speaker.
find_speaker_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content=(
                """
                You are a helpful assistant that helps label speakers in a conversation.
                Look at the conversation below and label the speakers.
                The three label types are Surveyor ,Health Care Professional and Patient, use the following format to label the speakers [Surveyor],[HCP] or [Patient]
                Use the information given in the conversation to label the speakers.
                IMPORTANT : Return the EXACT transcript as input  with the speakers labeled where it currently has square brackets.
                FORMAT : (INDEX) [SPEAKER]: TEXT
                """
            )
        ),
        HumanMessagePromptTemplate.from_template("{text}"),
    ]
)

## Processing

In [0]:
# Location of the audio files as seen by dbutils.fs, and the sub-folder name
# appended to every pipeline stage directory in the cells below.
base_path = "/FileStore/Viiv_audio"
folder_path = "test"

# CALL_RECORDINGS holds the extension-less name of every file (directories are
# skipped) directly under base_path. The processing functions only handle files
# whose name contains one of these entries.
CALL_RECORDINGS = []
for file in dbutils.fs.ls(base_path):
  if not file.isDir():
    CALL_RECORDINGS.append(os.path.splitext(file.name)[0])
    print(file.name)

09_27_2023_PA_PrEP_Consumer_Typing_Research_HCP_2.mp4
09_27_2023_PCP_PrEP_Consumer_Typing_Research_HCP_1.mp4
09_27_2023_PCP_PrEP_Consumer_Typing_Research_HCP_3.mp4
09_29_2023_PA_PrEP_Consumer_Typing_Research_HCP__4.m4a
09_29_2023_PA_PrEP_Consumer_Typing_Research_HCP__5.m4a
10_02_2023_ObGyn_PrEP_Consumer_Typing_Research_HCP__7.m4a
10_02_2023_ObGyn_PrEP_Consumer_Typing_Research__9.m4a
10_02_2023_PA_PrEP_Consumer_Typing_Research_HCP__8.m4a
10_02_2023_PCP_PrEP_Consumer_Typing_Research_HCP__10.m4a
10_03_2023_ObGyn_PrEP_Consumer_Typing_Research__12.m4a
10_03_2023_PCP_PrEP_Consumer_Typing_Research__13.m4a
10_04_2023_NP_PrEP_Consumer_Typing_Research__15.m4a
10_04_2023_ObGyn_PrEP_Consumer_Typing_Research__14.m4a
10_05_2023_NP_PrEP_Consumer_Typing_Research__17.m4a
10_05_2023_PCP_PrEP_Consumer_Typing_Research__16.m4a
Vaporetto_E_US_NAVIGATOR_5_Rosita_17Nov_5pmET_PatientAdvocate.wav
Vaporetto_E_US_PAT11_Rosita_8Nov_12pmET_Patient.wav
Vaporetto_E_US_PAT17_Rosita_13Nov_11amET_Patient.wav
Vaporetto_E

### Splitting Timestamps and Transcript

In [0]:
# NOTE: base_path is rebound here with a "/dbfs" prefix; every later cell builds
# its directories from this value, and the processing functions access them with
# os.listdir/open.
base_path="/dbfs/FileStore/Viiv_audio"
# Stage 00 -> 01: raw transcripts in, indexed text and indexed timestamps out
input_relative_path = f"{base_path}/00_transcripts/{folder_path}"
transcript_relative_path = f"{base_path}/01_cut/{folder_path}"
timestamp_relative_path = f"{base_path}/01_timestamps/{folder_path}"

split_timestamps_and_transcript(input_relative_path, transcript_relative_path, timestamp_relative_path)

### Removing PII

In [0]:
# Stage 01 -> 02: send the indexed transcripts through the PII-removal prompt;
# outputs are written with the "pii_removed" suffix.
input_relative_path = f"{base_path}/01_cut/{folder_path}"
output_relative_path = f"{base_path}/02_pii/{folder_path}"

llm_data_process(input_relative_path, output_relative_path, pii_cleanup, "pii_removed")

Processing Vaporetto_E_US_PAT11_Rosita_8Nov_12pmET_Patient_transcript_cut


com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:103)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$2(SequenceExecutionState.scala:103)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$2$adapted(SequenceExecutionState.scala:100)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:100)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:714)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:430)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:430)
	at com.databricks.spark.chauffeur.ChauffeurState.cancelExecutio

### Correcting Misrepresented Medical Words

In [0]:
# Stage 02 -> 03: send the PII-cleaned transcripts through the medical-word
# correction prompt; "product" is supplied as an extra template argument.
input_relative_path = f"{base_path}/02_pii/{folder_path}"
output_relative_path = f"{base_path}/03_medical/{folder_path}"

llm_data_process(input_relative_path, output_relative_path, medical_word_cleanup, "medically_clean", {"product":"Apretude"})

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:429)
	at com.databricks.spark.chauffeur.ChauffeurState.cancelExecution(ChauffeurState.scala:1225)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:958)
	at com.databricks.logging.UsageLogging.$anonfun$recordOperation$1(UsageLogging.scala:573)
	at com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:669)
	at com.databricks.logging.UsageLogging.$anonfun$recordOperationWithResultTags$4(UsageLogging.scala:687)
	at com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:426)
	at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
	at com.databricks.logging.AttributionContext$.withValue(AttributionContext.scala:216)
	at com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:424)
	at com.databricks.logging.Usag

### Identifying Speaker

In [0]:
# Stage 03 -> 04: send the corrected transcripts through the speaker-labelling
# prompt; outputs are written with the "speaker_assigned" suffix, which is the
# file name combine_timestamp_and_transcript looks for.
input_relative_path = f"{base_path}/03_medical/{folder_path}"
output_relative_path = f"{base_path}/04_speaker/{folder_path}"

llm_data_process(input_relative_path, output_relative_path, find_speaker_template, "speaker_assigned")

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:429)
	at com.databricks.spark.chauffeur.ChauffeurState.cancelExecution(ChauffeurState.scala:1225)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:958)
	at com.databricks.logging.UsageLogging.$anonfun$recordOperation$1(UsageLogging.scala:573)
	at com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:669)
	at com.databricks.logging.UsageLogging.$anonfun$recordOperationWithResultTags$4(UsageLogging.scala:687)
	at com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:426)
	at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
	at com.databricks.logging.AttributionContext$.withValue(AttributionContext.scala:216)
	at com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:424)
	at com.databricks.logging.Usag

### Recombining Timestamps and Transcript

In [0]:
# Stage 04 + 01 -> 05: re-attach the timestamps saved in stage 01 to the
# speaker-labelled transcripts; writes a combined text file and a CSV per recording.
transcript_relative_path = f"{base_path}/04_speaker/{folder_path}"
timestamp_relative_path = f"{base_path}/01_timestamps/{folder_path}"
text_relative_path = f"{base_path}/05_combined/{folder_path}"
csv_relative_path = f"{base_path}/05_share_of_voice/{folder_path}"

combine_timestamp_and_transcript(timestamp_relative_path, transcript_relative_path, text_relative_path, csv_relative_path)

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:429)
	at com.databricks.spark.chauffeur.ChauffeurState.cancelExecution(ChauffeurState.scala:1225)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:958)
	at com.databricks.logging.UsageLogging.$anonfun$recordOperation$1(UsageLogging.scala:573)
	at com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:669)
	at com.databricks.logging.UsageLogging.$anonfun$recordOperationWithResultTags$4(UsageLogging.scala:687)
	at com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:426)
	at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
	at com.databricks.logging.AttributionContext$.withValue(AttributionContext.scala:216)
	at com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:424)
	at com.databricks.logging.Usag

### Joining together sections

In [0]:
# Stage 04 -> 06: merge consecutive lines of the same speaker in the
# speaker-labelled transcripts.
input_relative_path = f"{base_path}/04_speaker/{folder_path}"
output_relative_path = f"{base_path}/06_section/{folder_path}"

join_sections(input_relative_path, output_relative_path)

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:429)
	at com.databricks.spark.chauffeur.ChauffeurState.cancelExecution(ChauffeurState.scala:1225)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:958)
	at com.databricks.logging.UsageLogging.$anonfun$recordOperation$1(UsageLogging.scala:573)
	at com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:669)
	at com.databricks.logging.UsageLogging.$anonfun$recordOperationWithResultTags$4(UsageLogging.scala:687)
	at com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:426)
	at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
	at com.databricks.logging.AttributionContext$.withValue(AttributionContext.scala:216)
	at com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:424)
	at com.databricks.logging.Usag