## Install Required Python Libraries

In [1]:
# !pip install python-docx
# !pip install openai
# !pip install google-generativeai

## Import libraries

In [2]:
# Builtin libraries
import os
import sys
import re
from typing import Union
from pathlib import Path

# For reading `*.docx` documents (Word)
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

# LLM libraries (Gen-AI)
import openai
import google.generativeai as genai

  from .autonotebook import tqdm as notebook_tqdm


# Constants

In [3]:
# PATHS AND DOMAINS
FILE_LIST = os.listdir('./data')
"""List all the files in the `data` directory"""

DOC_FILE_PATH = Path("./data") / FILE_LIST[2]
"""Path to `*.docx` file"""

ORG_DOMAIN = "intelliswift.com"
"""Organisation domain (for email)"""


# API KEYS AND TOKEN CONFIG
GEMINI_API_KEY = os.environ['GEMINI_API_KEY']
"""Gemini API key"""

OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
"""OpenAI API key"""

MAX_TOKEN_LENGTH = 10_000
"""Maximum length of tokens (for chunking/tokenization)"""

PROMPT_TOKEN_LIMIT = 4090
"""Token limit for the input prompt"""


# GPT CONFIG
GPT_MODEL_PARAMS = {
    'model': "gpt-3.5-turbo",
    'temperature': 0.1
}
"""Chat Completion model parameters for GPT response"""

openai.api_key = OPENAI_API_KEY


# GEMINI CONFIG
GEMINI_SAFETY_SETTINGS = [
    {
        "category": "HARM_CATEGORY_HARASSMENT",
        "threshold": "BLOCK_ONLY_HIGH"
    },
    {
        "category": "HARM_CATEGORY_HATE_SPEECH",
        "threshold": "BLOCK_ONLY_HIGH"
    },
    {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_ONLY_HIGH"
    },
    {
        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
        "threshold": "BLOCK_ONLY_HIGH"
    }
]
"""Safety settings for content generation in Gemini"""

GEMINI_GENERATION_CONFIG = {
    "temperature": 0.25,
    "top_p": 1,
    "top_k": 1
}
"""Chat completion model parameters for Gemini"""

GEMINI_MODEL_PARAMS = {
    "model_name": "gemini-1.0-pro",
    "generation_config": genai.GenerationConfig(**GEMINI_GENERATION_CONFIG),
    "safety_settings": GEMINI_SAFETY_SETTINGS
}

genai.configure(api_key=GEMINI_API_KEY)
MODEL_GEMINI = genai.GenerativeModel(**GEMINI_MODEL_PARAMS)


# PROMPTS
LLM_PROMPT = """Instructions:
1. Read the transcript and write the summary in 2-3 lines and action items in not more than 2 points.
2. Do not include any greeting related messages in `Summarization` and `Action Items`.
3. Remove Thanks, or any other greetings related conversations wherever necessary.
4. Do not alter any `Action Item` and provide correct action item based on data and map correct User.
5. Provide summary in paragraphs instead of points.

Generate the text in the following format, strictly:
Summary: <text>
Action Items: <points>

Transcript:
{transcript_text}
"""
"""Prompt for fetching `Summary` and `Action Items`"""

'Prompt for fetching `Summary` and `Action Items`'

## Reading the Transcript `(*.docx)`

In [4]:
def read_docx(file_path: Union[Path, str]) -> str:
    """
     Function to read word documents and return as string.

     :param file_path: (Path | str) Path to Word file

     :return: (str) Document text
    """
    if isinstance(file_path, Path):
        file_path = file_path.absolute().as_posix()

    doc = Document(file_path)
    text = []

    for paragraph in doc.paragraphs:
        text.append(paragraph.text)

    return '\n'.join(text)


# Read file
try:
    doc_text = read_docx(DOC_FILE_PATH)
    print(doc_text)
except FileNotFoundError as e:
    print("File not found.", e)
except IOError as e:
    print("Error reading the file.", e)


0:0:0.0 --> 0:0:0.490
Ruchita Raut
IQ.
0:0:9.680 --> 0:0:10.570
Vivek Kadoo
Alright, good morning all.
0:0:12.990 --> 0:0:13.260
Neha Kumari
Yeah.
0:0:13.300 --> 0:0:13.530
Ruchita Raut
Yeah.
0:0:13.270 --> 0:0:13.690
Neha Kumari
Good morning.
0:0:14.310 --> 0:0:14.730
Ruchita Raut
Good morning.
0:0:14.530 --> 0:0:14.990
Piyush Jadhav
Good morning.
0:0:20.360 --> 0:0:22.210
Gangadhar Agre
Yeah, let's wait for other to join.
0:0:50.890 --> 0:0:51.700
Akash Panwar
Hello, everyone.
0:0:51.710 --> 0:0:52.140
Akash Panwar
Good morning.
0:0:53.410 --> 0:0:54.550
Gangadhar Agre
Hey, good morning, Akash.
0:1:2.430 --> 0:1:2.560
Sameer Panda
Hey.
0:1:4.670 --> 0:1:4.860
Sameer Panda
Yeah.
0:1:4.870 --> 0:1:5.190
Sameer Panda
Good morning.
0:1:1.330 --> 0:1:5.330
Gangadhar Agre
Yeah, I think we can get started 3 prisoner.
0:1:6.110 --> 0:1:6.830
Sameer Panda
Hi, good morning.
0:1:8.490 --> 0:1:8.630
Gangadhar Agre
Yeah.
0:1:8.890 --> 0:1:9.650
Sameer Panda
Yeah, let's start.
0:1:11.400 --> 0:1:1

## Cleaning the Transcript

In [5]:
# Split the text into lines
lines = doc_text.split('\n')

new_lines = []
new_lines_user_messages = []
users_list = []

for i, line in enumerate(lines):
    # Regular expression pattern with wildcard for numbers
    time_format_pattern = r'\d+:\d+:\d+\.\d+ --> \d+:\d+:\d+\.\d+'

    # Find lines matching the time format pattern
    matching_lines = re.match(time_format_pattern, line)
    if matching_lines:
        indices_to_remove = [
            i for i, line in enumerate(lines)
            if line == matching_lines.group()
        ]
        users_list.append(
            lines[indices_to_remove[0]:indices_to_remove[0] + 3][1]
        )

        if len(
            lines[indices_to_remove[0]:indices_to_remove[0] + 3][2]
            .split()
        ) >= 3:
            new_lines_user_messages.append(
                str(lines[indices_to_remove[0] + 1]) +
                " : " + 
                str(lines[indices_to_remove[0] + 2])
            )


cleaned_text = '\n'.join(new_lines_user_messages)
print(cleaned_text)

Vivek Kadoo : Alright, good morning all.
Gangadhar Agre : Yeah, let's wait for other to join.
Gangadhar Agre : Hey, good morning, Akash.
Gangadhar Agre : Yeah, I think we can get started 3 prisoner.
Sameer Panda : Hi, good morning.
Sameer Panda : Yeah, let's start.
Gangadhar Agre : So guys, today the Akash is like last working day, OK, all the very best Akash for your future journey.
Akash Panwar : One of them.
Sameer Panda : All the best, all the best talkers.
Gangadhar Agre : Yeah, it it was great working with you because whatever like from the taking your interview to the whatever we work, it was a great stuff we did, yeah.
Gangadhar Agre : And I think training also you given like fantastic and whatnot from Vivek and some make also.
Akash Panwar : I hope I was a I was of some little bit help to them.
Gangadhar Agre : Yeah, right, Vivek, your thought?
Vivek Kadoo : Yeah, it was a great session and apart from that training session, I would say working with Akash he's a very knowledgea

## Get All Participants' Names From the Transcript

In [6]:
unique_users_list = list(set(users_list))
unique_users_list

['Yogesh Rajgure',
 'Piyush Jadhav',
 'Kanchan Sharma',
 'Gangadhar Agre',
 'Vivek Kadoo',
 'Ruchita Raut',
 'Akash Panwar',
 'Dhruvesh Dilipbhai Kalathiya',
 'Sameer Panda',
 'Sri Harsha Vempaati',
 'Aniket Jaware',
 'Neha Kumari']

## Extracting Email ID's from Unique names in Transcript

In [7]:
formatted_names = []
email_addresses = []

for name in unique_users_list:
    parts = name.split()
    plen = len(parts)

    if plen > 1:
        first_name = ''.join(parts[:plen - 1])
        last_name = parts[-1]
        splitter = '.'
    else:
        first_name = parts[0]
        last_name, splitter = '', ''
    
    email_addr = f"{first_name}{splitter}{last_name}@{ORG_DOMAIN}"
    email_addresses.append(email_addr)

email_addresses

['Yogesh.Rajgure@intelliswift.com',
 'Piyush.Jadhav@intelliswift.com',
 'Kanchan.Sharma@intelliswift.com',
 'Gangadhar.Agre@intelliswift.com',
 'Vivek.Kadoo@intelliswift.com',
 'Ruchita.Raut@intelliswift.com',
 'Akash.Panwar@intelliswift.com',
 'DhruveshDilipbhai.Kalathiya@intelliswift.com',
 'Sameer.Panda@intelliswift.com',
 'SriHarsha.Vempaati@intelliswift.com',
 'Aniket.Jaware@intelliswift.com',
 'Neha.Kumari@intelliswift.com']

### Convert messages `(GPT -> Gemini)` and `(Gemini -> GPT)`

In [8]:
def convert_messages_to_gemini(convo_dicts: list[dict]) -> list[dict]:
    """
    Convert message history into Gemini input format.

    :param convo_dicts: (list[dict]) list of conversation dicts

    :return: (list[dict]) list of converted conversation dicts
    """
    gemini_convo_dict = []
    for convo in convo_dicts:
        try:
            if convo.get('parts'):
                continue

            gemini_convo_dict.append({
                'role': convo['role'] if convo['role'] == 'user' else 'model',
                'parts': [convo['content']]
            })
        except KeyError:
            gemini_convo_dict.append(convo)
    return gemini_convo_dict


def convert_messages_to_openai(convo_dicts: list[dict]) -> list[dict]:
    """
    Convert message history into GPT input format.

    :param convo_dicts: (list[dict]) list of conversation dicts

    :return: (list[dict]) list of converted conversation dicts
    """
    openai_convo_dict = []
    for convo in convo_dicts:
        try:
            if convo.get('content'):
                openai_convo_dict.append(convo)
                continue

            openai_convo_dict.append({
                'role': convo['role'] if convo['role'] == 'user' else 'assistant',
                'content': convo['parts'][0]
            })
        except KeyError:
            openai_convo_dict.append(convo)
    return openai_convo_dict

## Function to get response from LLM

In [9]:
def get_completion_from_messages(messages: list, use_gpt=False, **model_kwargs: dict[str, Union[str, int]]) -> str:
    """
    Fetches GPT response from the provided model, history and kwargs.

    Default:
        ```
        model = gpt-3.5-turbo
        temperature = 0.1
        ```

    :param messages: (list) List of messages (or history)
    :param openai_kwargs: (str: str | int) GPT model parameters

    :return: (str) chat completion response
    """
    if not model_kwargs and use_gpt is True:
        # model_kwargs = GPT_MODEL_PARAMS if use_gpt else GEMINI_MODEL_PARAMS
        model_kwargs = GPT_MODEL_PARAMS

    if use_gpt:
        messages = convert_messages_to_openai(messages)
        response = openai.chat.completions.create(messages=messages, **model_kwargs)
        return response.choices[0].message["content"]

    else:
        messages = convert_messages_to_gemini(messages)
        response = MODEL_GEMINI.generate_content(messages, **model_kwargs)
        return response.text

### Break larger text into smaller chunks

In [10]:
def split_text(text: str, max_token_length: int) -> list[str]:
    """
    Breaks large texts into smaller chunks.

    :param text: (str) the complete text, to be chunked
    :param max_token_length: (int) maximum length of each chunk

    :return: (list[str]) list of tokens from the chunked `text`
    """
    parts = []
    current_part = ""
    tokens = text.split()

    for token in tokens:
        if len(current_part) + len(token) + 1 <= max_token_length:
            current_part += token + " "
        else:
            parts.append(current_part.strip())
            current_part = token + " "

    if current_part:
        parts.append(current_part.strip())

    return parts

### Print smaller chunks `Summary` and `Action Items` separately

In [11]:
def clean_summary_and_points(summaries: list, action_items: list) -> tuple[list[str]]:
    """
    Cleans the `Summaries` and `Action Items`. Removes redundant newline
    characters and number list points.

    :param summaries: (list) a list of summaries
    :param action_items: (list) a list of action items

    :return: (tuple[list[str]]) cleaned summaries and action items
    """

    def clean_new_point(s: str) -> str:
        pattern = r'\n\d{1,2}'
        replacement = '\n'

        processed_text = re.sub(pattern, replacement, s)
        output_str = re.sub('\n+', '\n', processed_text)

        return output_str[:-1]

    clean_new_lines = lambda s: s.replace('\n', '')

    return (
        list(map(clean_new_lines, summaries)),
        list(map(clean_new_point, action_items))
    )

### Merging all into single `Summary` and `Action Items`

In [12]:
prompt = LLM_PROMPT.format(transcript_text=cleaned_text)
total_tokens = len(prompt.split())
print("Total prompt tokens (input):", total_tokens)

if total_tokens <= PROMPT_TOKEN_LIMIT:
    context = [{
        'role':'user',
        'content': prompt
    }]
    minutes_of_meeting = get_completion_from_messages(context)
    print("Minutes of Meeting:\n", minutes_of_meeting)

else:
    print(f"Prompt too large. Getting outputs per {MAX_TOKEN_LENGTH} chunks...", file=sys.stderr)
    response_list = []

    ct_parts = split_text(
        text=cleaned_text,
        max_token_length=MAX_TOKEN_LENGTH
    )
    len_ct_parts = len(ct_parts)

    # Print the split parts
    for i, part in enumerate(ct_parts):
        prompt = LLM_PROMPT.format(transcript_text=part)
        context = [{
            'role': 'user',
            'content': prompt
        }]
        responses = get_completion_from_messages(context)
        print(f"Iteration: {i + 1}/{len_ct_parts}")
        response_list.append(responses)

    # Merge responses into a single string
    merged_response = '\n\n'.join(response_list)

    summaries = []
    action_items = []

    detail_text = merged_response.split("Summary:")
    for i, block in enumerate(detail_text):
        if len(block) < 10:
            continue
        else:
            part = block.split('Action Items:')
            summaries.append(part[0].strip())
            action_items.append(part[1])


    summaries_clean, action_items_clean = clean_summary_and_points(summaries, action_items)

    # Merge all summaries
    summary_merged = '\n'.join(summaries_clean)

    # Merge all action items
    action_items_merged = ''.join(action_items_clean)

    # Enumerate action items
    numbered_action_items = ""
    action_items_list = action_items_merged.split('\n')
    for i, item in enumerate(action_items_list[1:], 1):
        # Remove leading dots and spaces
        item = item.strip('.').strip(' ')
        numbered_action_items += f"{i}. {item}\n"

    summary_formatted = f"Summary:\n{summary_merged}"
    action_items_formatted = f"Action Items:\n{numbered_action_items}"

    minutes_of_meeting = summary_formatted + '\n\n' + action_items_formatted
    print("\nMinutes of Meeting:\n", minutes_of_meeting)

Total prompt tokens (input): 5675


Prompt too large. Getting outputs per 10000 chunks...


Iteration: 1/3
Iteration: 2/3
Iteration: 3/3

Minutes of Meeting:
 Summary:
The team held a farewell meeting for Akash, who was leaving the company. They expressed their appreciation for his contributions, particularly in the areas of dashboarding and visualization. The team also discussed ongoing projects, including the development of a storyline for the data center CPU and the automation of resume formatting using NLP.
The team discussed the development of a tool to assist HR in screening resumes by matching them to job descriptions and extracting relevant skills. The tool aims to reduce the manual effort involved in resume screening and improve the accuracy of candidate selection. The team also discussed the progress of various projects, including training initiatives, data center visualization, and carbon footprint calculation.
The team discussed the development of APIs for extracting data from resumes, including basic details and specific information like name, email, and ID. They

## Writing the Output in a word Document

In [13]:
document = Document()
document.add_heading('Meeting Minutes\n', level=1).alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
document.add_paragraph(minutes_of_meeting)  # Combined summary and action items
original_file_name = "_".join(DOC_FILE_PATH.name.split(".")[0:-1])
document.save(f'./output/{original_file_name}-meeting_minutes.docx')