<a href="https://colab.research.google.com/github/sergiomar73/nlp-google-colab/blob/main/qc_nlp_004_transcription_classifier_with_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence_transformers

In [None]:
!pip install gradio

In [None]:
!pip install sentence_transformers

In [5]:
import gradio as gr
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd
import spacy
import torch
import plotly.express as px
from functools import cmp_to_key

In [None]:
# Sentence-embedding model shared by the cells below; its `encode` method is
# used to embed both the category example phrases and the transcript sentences.
model = SentenceTransformer('paraphrase-distilroberta-base-v2')

In [7]:
# spaCy pipeline used by `transcript_to_sentences` to split text into sentences.
nlp = spacy.load("en_core_web_sm")

In [8]:
def transcript_to_sentences(transcript):
  """Split ``transcript`` into sentences with the module-level spaCy pipeline.

  Args:
    transcript: text to segment.

  Returns:
    A list with the text of every sentence span found by ``nlp``, in
    document order.
  """
  parsed = nlp(transcript)
  return [span.text for span in parsed.sents]

In [9]:
def calculate_embeddings_with_roberta(text):
  """Embed ``text`` with the module-level sentence-transformer ``model``.

  Args:
    text: the input forwarded unchanged to ``model.encode``.

  Returns:
    Whatever ``model.encode`` returns when called with
    ``convert_to_tensor=True``.
  """
  return model.encode(text, convert_to_tensor=True)

In [32]:
def process_categories(categories, verbose=False):
    """Parse category definitions and embed every example phrase.

    ``categories`` is a multi-line string with one category per line, written
    as ``label=example 1;example 2;...``. Blank lines are ignored. Whitespace
    around the label and around each example is stripped, and empty examples
    (for instance after a trailing ``;``) are skipped.

    Args:
        categories: the category definitions, one ``label=examples`` per line.
        verbose: when true, print a progress line before each embedding is
            calculated.

    Returns:
        A list of DataFrames, one per distinct label, ordered by the position
        of the category line in ``categories``. Each frame has the columns
        ``order``, ``category``, ``label``, ``example`` and ``embeddings``.
        An input without any category lines yields an empty list.

    Raises:
        ValueError: if a non-blank line does not contain an ``=`` separator.
    """
    lines = [s.strip() for s in categories.splitlines() if s.strip()]

    # Collect plain records first and build the frame once: DataFrame.append
    # no longer exists in pandas 2.x and growing a frame row by row is quadratic.
    records = []
    for order, line in enumerate(lines, start=1):
        # Split on the first '=' only, so examples may contain '=' themselves.
        label, separator, examples = line.partition('=')
        if not separator:
            raise ValueError(
                f"Category line {order} has no '=' separator: {line!r}"
            )
        label = label.strip()
        for example in examples.split(';'):
            example = example.strip()
            if not example:
                continue
            records.append({
                'order': order,
                'category': label,
                'label': label,
                'example': example
            })
    df_phrases = pd.DataFrame(records, columns=['order', 'category', 'label', 'example'])

    # Object column filled cell by cell so each cell holds one embedding object.
    df_phrases["embeddings"] = None
    for i, row in df_phrases.iterrows():
        if verbose:
          print(f'Calculating embedding for [{ row["label"] }] {row["example"]}...')
        df_phrases.at[i, "embeddings"] = calculate_embeddings_with_roberta(row["example"])

    # Split Phrases by Category
    # groupby yields the groups sorted by label, so they are re-ordered below
    # by the 'order' value of each group's first row to follow the input order.
    df_category_list = [
        group
        for _, group in df_phrases.sort_values(['order', 'label'], ascending=False).groupby('label')
    ]
    df_category_list.sort(key=lambda group: group['order'].iloc[0])
    return df_category_list

In [38]:
categories = """Hello=Hello, how are you doing today?;Hi, everybody;Hi;My name's Johnny
What=most advanced conversation intelligence and AI powered coaching platform;a software platform that helps people reach their potential;for communicating and connecting;empowered by behavioral science;uses artificial intelligence;drives performance outcomes for customer facing teams;help them sell more;help them deliver better experiences
How=integrated into video conference platforms;record yourself;prompts with fun and realistic experiences;That's all we need you to do;have conversations like the types you have every day;we've built technology;looks at the words that you say;what you do with your voice;your face;your gestures;simple input;create benchmarking;scoring feedback;personalized guidance;understand how you come across;relay to you how you're doing;how you can get better;coach you using artificial intelligence;as if the world's best communication coach was sitting there with you in every one of your conversations;telling you how to get better;telling you how to optimize your behavior
Who=everyone is having conversations;if you're an entry-level person;If you’re the most senior executive;the most powerful group that we can help are customer facing teams;sales teams, customer service teams, customer support, and customer success teams;anyone who is talking to the customer as a core part of their day-to-day job;words that they use, the way that they present themselves, how they come across;has an impact on the performance outcomes of the organization;giving coaching to them;coaching them at scale that is standardized and scientific;It's hard to find time for the managers to coach;It's hard for us to give feedback;It's hard for people to feel empowered, to work on something that is personal and private in a safe space
Impact=customer experience;communicating with the customer;spending time;spending a lot of time communicating;With other members of your team;internal communication and external communication;we want to make you remarkably better;we want to make you extraordinary at that behavior;meant to be learned along the way;No one at work can stop you for six months, 12 months, 18 months helping you become exceptional;your leadership journey;this is the most important skill right now for you;they don't have the time and they don't know how to do it;we can provide intentional guidance;deliver that on a personal basis for every single member of your organization that faces the customer;30% better at connecting with other people;measured by evidence-based research;the impact that that could have on individual performance;your individual growth as a leader;collective performance of the teams in the organization;make you be perceived as trustworthy, authentic, credible;improve how you connect
Bye=Bye;And I look forward to talking to you again soon. Thank you very much"""
# Parse the category definitions above into one DataFrame per category and
# show the first three rows of the first category as a quick sanity check.
categories_list = process_categories(categories)
print(categories_list[0][:3])

  order category  label                          example  \
0     1    Hello  Hello  Hello, how are you doing today?   
1     1    Hello  Hello                    Hi, everybody   
2     1    Hello  Hello                               Hi   

                                          embeddings  
0  [tensor(0.4046), tensor(0.9384), tensor(0.0146...  
1  [tensor(0.2691), tensor(0.8238), tensor(0.1047...  
2  [tensor(0.9531), tensor(0.8997), tensor(-0.060...  


In [193]:
def compare_text(transcript, categories, threshold):
    """Score a transcript against the example phrases of every category.

    The transcript is split into sentences, each sentence is embedded with
    ``model`` and compared (cosine similarity) with the embedded example
    phrases returned by ``process_categories``.

    Args:
        transcript: the text to analyse.
        categories: category definitions in the format accepted by
            ``process_categories``.
        threshold: minimum cosine similarity for a sentence/phrase pair to be
            reported as a match and for a category to be flagged as ok.

    Returns:
        A tuple ``(res, fig, df_results)`` where

        * ``res`` maps each category label to the highest similarity found
          between any sentence and any of that category's phrases,
        * ``fig`` is a plotly bar chart of those maxima with a dotted line
          drawn at ``threshold``,
        * ``df_results`` lists every sentence/phrase pair whose similarity is
          at least ``threshold`` (columns ``line``, ``sentence``, ``phrase``,
          ``category``, ``similarity``), sorted by line and then by
          descending similarity.
    """
    # Sentences
    sentences = transcript_to_sentences(transcript)
    embeddings = model.encode(sentences, convert_to_tensor=True)
    # Categories
    df_category_list = process_categories(categories)

    best_scores = {}  # label -> best similarity per sentence
    matches = []      # one record per sentence/phrase pair reaching the threshold
    for df_category in df_category_list:
        df_category = df_category.reset_index(drop=True)
        # Each frame holds a single label (the frames are grouped by label).
        label = df_category['label'].iloc[0]
        examples = df_category['example'].tolist()
        phrases = torch.stack(df_category["embeddings"].values.tolist())
        # Move to the CPU first: .numpy() raises for tensors living on a GPU.
        cosine_scores = util.cos_sim(embeddings, phrases).cpu().numpy()
        best_scores[label] = np.max(cosine_scores, axis=1)
        # np.nonzero walks the matrix row by row, i.e. sentence by sentence
        # and, within a sentence, phrase by phrase.
        for num_sentence, num_phrase in zip(*np.nonzero(cosine_scores >= threshold)):
            matches.append({
                'line': num_sentence + 1,
                'sentence': sentences[num_sentence],
                'phrase': examples[num_phrase],
                'category': label,
                'similarity': float(cosine_scores[num_sentence, num_phrase])
            })

    # Build both frames in one go; DataFrame.append no longer exists in pandas 2.x.
    df_cosines = pd.DataFrame(best_scores)
    df_results = pd.DataFrame(matches, columns=['line', 'sentence', 'phrase', 'category', 'similarity'])
    df_results = df_results.sort_values(['line','similarity'],ascending=[True,False])

    df_summary = df_cosines.max(numeric_only=True).to_frame(name='similarity')
    # Same comparison as the match filter above, so a category that produced a
    # match is never flagged as failed.
    df_summary['ok'] = df_summary['similarity'] >= threshold

    fig = px.bar(
        df_summary,
        y='similarity',
        color='ok',
        color_discrete_map={ True: px.colors.qualitative.Plotly[2], False: px.colors.qualitative.Set2[7] },
        text='similarity',
        text_auto='.3f',
        labels={'similarity': 'Similarity'},
        title = f"{transcript[:200]}..."
    )
    fig.add_shape(
        # add a horizontal "target" line
        type="line", line_color="salmon", line_width=3, opacity=1, line_dash="dot",
        x0=0, x1=1, xref="paper", y0=threshold, y1=threshold, yref="y"
    )
    fig.update_traces(textfont_size=24, textangle=0, textposition="inside", cliponaxis=False)
    # The bars are keyed by the frame's index (the category labels), so the
    # x axis is titled explicitly.
    fig.update_xaxes(title_text="Category")
    fig.update_yaxes(range=[0, 1])

    res = df_summary['similarity'].to_dict()
    return res, fig, df_results

In [194]:
# Demo run: compare a sample transcription with the categories defined earlier.
# Pairs with a similarity of at least `threshold` are reported in `details`;
# the figure returned by compare_text is discarded here.
threshold = 0.5
transcription = "Hello, how are you doing today? I'm here to tell you a little bit about, uh, quantified communications and the quantified platform and how it impacts organizations, who it helps and how it works. So I'll get started off by telling you just a little bit about a high level about, um, the quantified platform. Oh, so the quantified platform is one of the most advanced communication intelligence in AI powered coaching systems."
res, _, details = compare_text(transcription,categories,threshold)

In [195]:
res

{'Hello': 1.0000001192092896,
 'What': 0.7428328990936279,
 'How': 0.5351963043212891,
 'Who': 0.3737223148345947,
 'Impact': 0.3667849600315094,
 'Bye': 0.44503074884414673}

In [196]:
details

Unnamed: 0,line,sentence,phrase,category,similarity
0,1,"Hello, how are you doing today?","Hello, how are you doing today?",Hello,1.0
1,1,"Hello, how are you doing today?","Hi, everybody",Hello,0.518189
4,1,"Hello, how are you doing today?",relay to you how you're doing,How,0.517791
2,4,"Oh, so the quantified platform is one of the m...",most advanced conversation intelligence and AI...,What,0.742833
5,4,"Oh, so the quantified platform is one of the m...",coach you using artificial intelligence,How,0.535196
3,4,"Oh, so the quantified platform is one of the m...",uses artificial intelligence,What,0.518865
