<a href="https://colab.research.google.com/github/sammsc/meeting-notes-ai/blob/main/temi2summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up environment

In [1]:
!pip install transformers
import re
from pathlib import Path
import pandas as pd
from transformers import pipeline
from nltk.tokenize import RegexpTokenizer
import nltk.data

nltk.download('punkt')

Collecting transformers
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 4.0 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 43.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 54.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 501 kB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 33.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

True

# Set up Google Drive

In [2]:
# Mount Google Drive at /gdrive so that files under '/gdrive/My Drive' can be
# read and written by the cells below. `google.colab` is only importable when
# the notebook runs inside Colab.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


# Read in transcript

In [3]:
# Folder on the mounted drive that holds the transcript and receives the results.
path_root = Path('/gdrive/My Drive/meeting_notes_ai')
file_txt = 'transcript_with_speaker.txt'

# Load the transcript as a list of raw lines (trailing newlines are kept).
with (path_root / file_txt).open() as fp:
    text_all = list(fp)

# Set up summarization model
### The model can be swapped for a custom one here

In [None]:
# Build the Hugging Face summarization pipeline. `summarizer` is the
# notebook-level callable that split_summarize() invokes for every text chunk;
# change the `model` argument to use a different summarization model.
summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")

Downloading:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/300 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

# Parse transcript into dataframe by speaker segment

In [4]:
# One row per speaker segment, filled in by fill_df() while the transcript is parsed.
df_transcript = pd.DataFrame(columns=['speaker', 'time', 'text', 'summary'])

def fill_df():
    """Append the current speaker segment as a new row of ``df_transcript``.

    Takes no arguments: it reads the notebook-level globals ``speaker_id``,
    ``speaker_time`` and ``txt_orig``, which the parsing loop sets before each
    call. The ``summary`` column is always written as an empty string;
    per-segment summarization is not performed here.

    Raises:
        NameError: if any of the three globals has not been assigned yet.
    """
    txt_summary = ''
    # DataFrame.append() was removed in pandas 2.0. Assigning to the next
    # integer label with .loc adds the row in place and keeps the 0..n-1 index.
    df_transcript.loc[len(df_transcript)] = [
        speaker_id, speaker_time, txt_orig, txt_summary]

In [5]:
# Split the raw transcript lines into speaker segments.
#
# A header line looks like "Speaker 1 (00:15):" (or "Speaker 1 (1:02:15):" once
# the timestamp includes hours). Every non-blank line after a header belongs to
# that speaker until the next header. Each finished segment is handed to
# fill_df(), which reads the globals speaker_id / speaker_time / txt_orig.
speaker_id = None      # stays None until the first header line is seen
speaker_time = None
txt_orig = ''
txt_parts = []         # text lines collected for the segment in progress

for line_cur in text_all:
    if not line_cur.strip():
        continue  # blank lines carry no content

    m_res = re.search(
        r'^Speaker (?P<id>\d+) \((?P<time>\d+(?::\d+)+)\):\s+\Z', line_cur)
    if m_res:
        # A new header closes the previous segment, if there was one.
        if speaker_id is not None:
            txt_orig = ' '.join(txt_parts)
            fill_df()
        speaker_id = m_res.group('id')
        speaker_time = m_res.group('time')
        txt_parts = []
    elif speaker_id is not None:
        # Wrapped lines are joined with a space so words do not run together.
        txt_parts.append(line_cur.strip())
    # Text that appears before the first header has no speaker and is skipped.

# Flush the last segment; nothing to do if no header was ever found.
if speaker_id is not None:
    txt_orig = ' '.join(txt_parts)
    fill_df()

# Group text by speaker

In [6]:
# All text spoken by each speaker, concatenated in transcript order.
df_by_speaker = df_transcript.groupby(['speaker'])['text'].apply(' '.join).reset_index()

# The whole transcript as one string. Joining the column directly does not
# depend on every row sharing the same 'summary' value and also works when
# the transcript frame is empty.
txt_all = ' '.join(df_transcript['text'])

# Kept so that code still referring to df_txt_all finds the same table as before.
df_txt_all = df_transcript.groupby(['summary'])['text'].apply(' '.join).reset_index()

# Calculate speaker stats

In [7]:
# Word-level tokenizer (runs of word characters). It has its own name so it
# never overwrites the global `tokenizer` that split_summarize() uses for
# sentence splitting, whatever order the cells are run in.
word_tokenizer = RegexpTokenizer(r'\w+')

# Number of words spoken by each speaker, and their share of all words in percent.
word_count = [len(word_tokenizer.tokenize(text)) for text in df_by_speaker['text']]
df_by_speaker['word_count'] = word_count
df_by_speaker['word_percent'] = (df_by_speaker['word_count'] /
                  df_by_speaker['word_count'].sum()) * 100

# Number of separate segments (turns) per speaker.
df_speak_counts = df_transcript.groupby(['speaker']).size().reset_index(name='speak_counts')

# Drop a 'speak_counts' column left by an earlier run of this cell so that the
# merge does not produce speak_counts_x / speak_counts_y duplicates.
df_by_speaker = (
    df_by_speaker
    .drop(columns=['speak_counts'], errors='ignore')
    .merge(df_speak_counts, how='outer', on='speaker')
)

# Set up NLTK tokenizer for splitting text by sentence

In [None]:
# Load the English Punkt sentence tokenizer from NLTK's data files.
# split_summarize() reads the global name `tokenizer`, so this cell must be
# run before split_summarize() is called.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Function to split text into chunks and summarize each chunk

In [None]:
def split_summarize(txt_input, char_lim=1000, frac_overlap=0.0,
                    max_length=90, min_length=25):
    """Summarize a long text chunk by chunk and return the joined summaries.

    The text is split into sentences with the notebook-level ``tokenizer`` and
    the sentences are grouped into chunks, working from the end of the text
    towards the beginning. A chunk is closed as soon as adding the next
    sentence would push its character count above ``char_lim``. Sentences are
    never split, so chunks vary in length; only sentence characters are counted
    (not the joining spaces), and a single sentence longer than ``char_lim``
    still forms a chunk. Each chunk is passed to the notebook-level
    ``summarizer`` and the chunk summaries are returned in original text order,
    joined by single spaces.

    Args:
        txt_input: the text to summarize, as a single string.
        char_lim: upper limit of characters per chunk; negative values are
            treated as 0.
        frac_overlap: fraction of ``char_lim`` that successive chunks may
            share. Clamped to the range 0 (no overlap) to 0.8.
        max_length: ``max_length`` forwarded to the summarizer for each chunk.
        min_length: ``min_length`` forwarded to the summarizer for each chunk.

    Returns:
        The concatenated chunk summaries, or an empty string when the input
        is empty or contains no sentences (the summarizer is not called then).
    """
    char_lim = max(char_lim, 0)
    frac_overlap = min(max(frac_overlap, 0), 0.8)
    char_overlap = char_lim * frac_overlap

    # Nothing to summarize: do not send an empty string to the model.
    if not txt_input or not txt_input.strip():
        return ''

    sentence_list_rev = tokenizer.tokenize(txt_input)[::-1]
    sentence_ttl = len(sentence_list_rev)
    if sentence_ttl == 0:
        return ''

    char_ct = 0              # characters currently held in temp_sentence_rev
    temp_sentence_rev = []   # sentences of the open chunk, last-in-text first
    summary_output = []      # chunk summaries, last-in-text chunk first

    def get_summary(list_sentence_rev):
        # Restore reading order before handing the chunk to the model.
        cur_input = ' '.join(reversed(list_sentence_rev))
        cur_output = summarizer(cur_input, max_length=max_length,
                                min_length=min_length, do_sample=False)
        summary_output.append(cur_output[0]['summary_text'])

    for sen_idx, sen_cur in enumerate(sentence_list_rev):
        temp_sentence_rev.append(sen_cur)
        char_ct += len(sen_cur)

        has_next = (sen_idx + 1) < sentence_ttl
        if has_next and (char_ct + len(sentence_list_rev[sen_idx + 1])) > char_lim:
            get_summary(temp_sentence_rev)
            # Discard sentences from the end-of-text side until what remains
            # fits into the overlap budget; the remainder starts the next chunk.
            while temp_sentence_rev and char_ct > char_overlap:
                char_ct -= len(temp_sentence_rev.pop(0))

    # The sentences still held (always at least the first one of the text)
    # form the final chunk.
    get_summary(temp_sentence_rev)
    return ' '.join(reversed(summary_output))

# Get summary for each speaker

In [None]:
# Summarize each speaker's combined text: 2000-character chunks with 10 % overlap.
speaker_summary = [
    split_summarize(speaker_text, char_lim=2000, frac_overlap=0.1)
    for speaker_text in df_by_speaker['text']
]
df_by_speaker['summary'] = speaker_summary

df_by_speaker.head()

Your max_length is set to 90, but you input_length is only 73. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)


# Get summary for entire transcript

In [None]:
# Summarize the whole transcript with the same chunking settings used for the
# per-speaker summaries (2000-character chunks, 10 % overlap).
summary_all = split_summarize(txt_all, char_lim=2000, frac_overlap=0.1)

# Show the combined summary text in the cell output.
print(summary_all)

There was a product marketing meeting. There will be some seats for Forrester and Gartner. There is an OKR epic somewhere, and if anybody can find it and link it, they can add their thoughts on OKRs there. Cormac and Cindy use logins to log in and grab a report whenever they want. Forrester has an inquiry analytics tool and a data product for forest. Gartner has a search channel tool. Some of the people on the call today have a license for Forrester and three for Gartner. They get broader access to the core data and a buyers to journey tool. Some of the tools aren't interactive, so someone could log in there and check it out. Ryan wants to get a search analytics license for his team. He would like to be able to see a report or hear a summary of what someone else had logged in and done. Product managers have access to search analytics. Komack and Komack were on Parker's social call and talked about the definition of enterprise. Komack removed the term enterprise from the theme in the re

# Save results to file

In [None]:
# Per-segment table: CSV for inspection, pickle for reloading into pandas.
df_transcript.to_csv(path_root / 'df_transcript.csv', index=False)
df_transcript.to_pickle(path_root / 'df_transcript.pkl')

# Per-speaker table with stats and summaries.
df_by_speaker.to_csv(path_root / 'df_by_speaker.csv', index=False)

# Whole-transcript summary as plain text.
(path_root / "summary.txt").write_text(summary_all)