# LOADING / INSTALLING / IMPORTING

In [None]:
from google.colab import drive
from transformers import pipeline
from scipy import spatial
import numpy as np
import torch
import pandas as pd
#drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Reference: Gradio guide on real-time speech recognition — https://www.gradio.app/guides/real-time-speech-recognition

In [None]:
# Install/upgrade sentence-transformers, which provides the SentenceTransformer class imported in the model-loading cell.
! pip install -U sentence-transformers



In [None]:
# Quiet install of gradio (imported later as `gr`).
# NOTE(review): gradio is also listed in the following install cell, so this cell looks redundant.
! pip install gradio -q

In [None]:
# Quiet install of the LangChain packages, python-dotenv and gradio used by the cells below.
# NOTE(review): `text-generation` and `langchainhub` are not imported in the visible cells - confirm they are still needed.
! pip install text-generation langchainhub langchain_community langchain python-dotenv gradio -q

In [None]:
import os
from dotenv import load_dotenv, find_dotenv
# Load environment variables from the file located by find_dotenv("token.env").
_ = load_dotenv(find_dotenv("token.env")) # read local .env file
# Indexing os.environ raises KeyError when the token is missing, so a misconfigured
# environment fails here rather than later. `hf_api_key` itself is not referenced
# again in the visible cells.
hf_api_key = os.environ['HUGGINGFACEHUB_API_TOKEN']
# NOTE(review): `transformers` is already imported in the first cell of this notebook;
# the cache directory is presumably resolved when the library is imported, so setting
# this variable here may have no effect - confirm, or move it above that import.
os.environ['TRANSFORMERS_CACHE'] = './cache/'

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login; renders the login widget shown in this cell's output.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from langchain_community.llms import HuggingFaceHub
from langchain.schema import (
    HumanMessage,
    SystemMessage,
    AIMessage
)
from langchain_community.chat_models.huggingface import ChatHuggingFace


# Generation settings forwarded to the hosted model through `model_kwargs`.
_llm_generation_kwargs = dict(
    max_new_tokens=512,
    top_k=30,
    temperature=0.1,
    repetition_penalty=1.03,
)

# Hosted text-generation LLM; wrapped as a chat model in the model-loading cell.
llm = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    task="text-generation",
    model_kwargs=_llm_generation_kwargs,
)

# LOADING MODELS

In [None]:
import gradio as gr
from sentence_transformers import SentenceTransformer

# Speech-to-text model, used when the comment is supplied as audio instead of text.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

# Sentence-embedding model used to find comments similar to the input.
sentence_transformer_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Emotion classification model; runs on the first GPU when available, otherwise on CPU.
# Alternative checkpoint noted by the author: bert_large_uncased_goemotions_3_epochs (may be better).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
classification_model = pipeline("text-classification", model="bert_large_uncased_goemotions", return_all_scores=True, device=-1 if device.type=="cpu" else 0)

# Comments that serve as the retrieval corpus for similar-comment search.
df = pd.read_csv("data/train.tsv", sep="\t", names=["text", "labels", "id"])
comments_list = df["text"].tolist() #comments

# Embeddings of the corpus are expensive to compute, so they are cached on disk and only
# (re)built when the cache file is missing. This keeps "Restart & Run All" working on a
# fresh checkout. Delete the file manually if data/train.tsv changes.
COMMENT_VECTORS_PATH = "comment_vectors.npy"
if not os.path.exists(COMMENT_VECTORS_PATH):
    embeddging_array = sentence_transformer_model.encode(comments_list, show_progress_bar=True)
    np.save(COMMENT_VECTORS_PATH, embeddging_array)

# Chat wrapper around the hosted LLM, used for summarization.
summarization_model = ChatHuggingFace(llm=llm)



# FUNCTIONS

In [None]:
def predict_emotions(comment_text_, threshold=0.35):
    """Predict the emotions expressed in a comment with the fine-tuned classifier.

    Args:
        comment_text_: Text of the comment to classify.
        threshold: Minimum score (exclusive) a label needs to be reported.

    Returns:
        List of emotion names whose score is above ``threshold``, in the order the
        classifier returned them. When no label passes the threshold, a single-item
        list holding the highest-scoring emotion is returned instead.
    """
    all_classes_scored = classification_model(comment_text_)

    # Line i of emotions.txt is the readable name of label id i.
    with open("data/emotions.txt", "r") as labels_file:
        label_names = labels_file.read().splitlines()

    # The pipeline returns one list of {"label", "score"} dicts per input text.
    scores = all_classes_scored[0]

    selected_labels = [pred["label"] for pred in scores if pred["score"] > threshold]
    if not selected_labels:
        # Threshold too strict for this comment: fall back to the single best label.
        best = max(scores, key=lambda pred: pred["score"])
        selected_labels = [best["label"]]

    return [label_names[int(label)] for label in selected_labels]



def get_similar_comments(comment_text_, top_n_comments=6, use_threshold=True, threshold=0.25):
    """Retrieve the corpus comments most similar to a given comment text.

    Similarity is the cosine similarity between the sentence embedding of
    ``comment_text_`` and the pre-computed embeddings stored in
    ``comment_vectors.npy``. Row ``i`` of that file is assumed to be the embedding
    of ``comments_list[i]``.

    Args:
        comment_text_: Text to find similar comments for.
        top_n_comments: Maximum number of comments to return.
        use_threshold: When True, discard comments whose similarity is below ``threshold``.
        threshold: Minimum cosine similarity (inclusive) kept when ``use_threshold`` is True.

    Returns:
        List of dicts ordered by decreasing similarity, each with the keys
        ``"text"`` (stripped comment), ``"emotions"`` (output of ``predict_emotions``)
        and ``"similarity_score"``.
    """
    # Pre-computed sentence embeddings of the corpus comments.
    embeddging_array = np.load("comment_vectors.npy")

    # Embed the query and compare it with every corpus embedding.
    input_embedding = sentence_transformer_model.encode(comment_text_)
    distances = spatial.distance.cdist([input_embedding], embeddging_array, "cosine")[0]
    similarity_scores = 1 - distances

    if use_threshold:
        candidate_indices = np.flatnonzero(similarity_scores >= threshold)
    else:
        candidate_indices = np.arange(len(similarity_scores))

    # Stable sort on the negated scores: highest similarity first, ties keep corpus order.
    ranking = np.argsort(-similarity_scores[candidate_indices], kind="stable")
    top_indices = [int(i) for i in candidate_indices[ranking][:top_n_comments]]

    most_similar_comments = [{
        "text": comments_list[i].strip(),
        "emotions": predict_emotions(comments_list[i]),
        "similarity_score": similarity_scores[i]
    } for i in top_indices]

    return most_similar_comments



def get_summary(top_n_similar_comments_):
    """Zero-shot summary of a group of similar comments.

    Asks the chat model to describe the overall topic and the general emotions of
    the given comments and returns its answer as plain text.

    Args:
        top_n_similar_comments_: Iterable of dicts that each carry a ``'text'`` entry.

    Returns:
        The model reply with everything up to the last ``INST]`` marker removed
        and surrounding whitespace stripped.
    """
    joined_comments = "\n".join(entry['text'] for entry in top_n_similar_comments_)

    prompt_template = '''
    The following are a set of similar comments. Summarize the main points and describe the general emotions expressed in these comments.

    Comments:
    %s

    Your summary should be concise and capture the overall sentiment and key points discussed in the comments.
    '''

    # A short greeting exchange precedes the actual request.
    conversation = [
        HumanMessage(content="Hello"),
        AIMessage(content="How can I help you?"),
        HumanMessage(content=prompt_template % joined_comments),
    ]
    reply = summarization_model.invoke(conversation)

    # Keep only what follows the last instruction marker echoed back by the model.
    _, _, answer = reply.content.rpartition("INST]")
    return answer.strip()



# Known issue: the summary sometimes echoed phrases from the examples (e.g. "There are no
# issues or negativity in these comments") because the model learned them from the few-shot
# examples. This is addressed by the wording of the prompt below.
def get_summary_few_shot(top_n_similar_comments_):
    """Few-shot summary of a group of similar comments.

    Builds a prompt made of worked comment/summary examples followed by the actual
    comments and the list of detectable emotions, sends it to the chat model and
    returns its answer as plain text.

    Args:
        top_n_similar_comments_: Iterable of dicts that each carry a ``'text'`` entry.

    Returns:
        The model reply with everything up to the last ``INST]`` marker removed
        and surrounding whitespace stripped.
    """
    joined_comments = "\n".join(entry['text'] for entry in top_n_similar_comments_)

    # Worked examples shown to the model before the real task.
    few_shot_examples = [
        {
            "comments": [
                "I love this product! It works perfectly and exceeds my expectations.",
                "This is the best purchase I've made in a long time. Highly recommend it!",
                "Fantastic quality and great value for money. Very satisfied."
            ],
            "summary": "The comments are very positive, they show emotions like excitement or surprise, highlighting satisfaction with the product's performance, quality, and value for money."
        },
        {
            "comments": [
                "I'm disappointed with the service. The delivery was late and the package was damaged.",
                "Terrible experience. Will not be ordering from here again.",
                "Customer support was unhelpful and rude. Very dissatisfied."
            ],
            "summary": "The comments express dissatisfaction with the service, citing issues with delivery, package condition, and customer support. We can also see a sentiment of anger and disappointment due to the poorness of the sevice."
        },
        {
            "comments": [
                "I just found out that my best friend is moving away. I'm heartbroken.",
                "It's been a rough day, everything seems to be going wrong.",
                "I can't believe my pet is sick. I don't know what to do."
            ],
            "summary": "The comments convey sadness and grief, with individuals expressing heartbreak over a friend's move, a generally bad day, and concern for a sick pet."
        },
        {
            "comments": [
                "I saw a great documentary on climate change today. It was very informative.",
                "Just finished reading a book about the history of technology. Fascinating stuff.",
                "Spent the afternoon learning about ancient civilizations. Really interesting!"
            ],
            "summary": "The comments are neutral and informative, showing curiosity and interest in topics such as climate change, the history of technology, and ancient civilizations."
        }
    ]

    # One "Comments / Summary" section per example, each closed by a blank line.
    example_sections = [
        "Comments:\n" + "\n".join(shot["comments"]) + "\n"
        + "Summary: " + shot["summary"] + "\n\n"
        for shot in few_shot_examples
    ]
    prompt_header = "Here are some examples of comments and their summaries:\n\n"

    task_template = '''
    These summaries are just examples of some set of specific comments.
    Your job now is to summarize the main points and
    describe the general emotions expressed in the following comments.

    Comments:
    %s

    The possible emotions you can detect are the following: ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
    'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness',
    'surprise', 'neutral']. Your summary should be concise and capture the overall sentiment, emotions and key points discussed in the comments.
    '''

    full_prompt = prompt_header + "".join(example_sections) + task_template % joined_comments

    # A short greeting exchange precedes the actual request.
    conversation = [
        HumanMessage(content="Hello"),
        AIMessage(content="How can I help you?"),
        HumanMessage(content=full_prompt),
    ]
    reply = summarization_model.invoke(conversation)

    # Keep only what follows the last instruction marker echoed back by the model.
    _, _, answer = reply.content.rpartition("INST]")
    return answer.strip()




def analyze_comment(stream, new_chunk, input_text, zero_shot=False):
    """Run the full analysis pipeline on a typed or spoken comment.

    Args:
        stream: Audio accumulated so far (1-D float array) or None.
        new_chunk: ``(sample_rate, samples)`` tuple with newly recorded audio, or None.
        input_text: Typed comment. When non-empty it is used directly and the
            audio is not transcribed.
        zero_shot: Use the zero-shot summary prompt when True, the few-shot
            prompt otherwise.

    Returns:
        Tuple ``(stream, comment_text, emotions_as_text, similar_comments_as_text,
        generated_summary)``.

    Raises:
        ValueError: If neither a typed comment nor new audio to transcribe is provided.
    """
    sr = None
    if new_chunk:
        sr, y = new_chunk
        y = y.astype(np.float32)
        if y.ndim > 1:
            # Multi-channel recording, assumed (samples, channels): downmix to mono,
            # the transcriber is fed a single 1-D signal.
            y = y.mean(axis=1)
        # Peak-normalize; skip for empty or silent audio to avoid dividing by zero.
        peak = np.max(np.abs(y)) if y.size else 0.0
        if peak > 0:
            y /= peak

        stream = y if stream is None else np.concatenate([stream, y])

    # If input_text is filled in, the audio transcriber is not applied.
    if input_text:
        comment_text = input_text
    else:
        if stream is None or sr is None:
            # Without a new chunk there is no sample rate to transcribe with.
            raise ValueError("Provide either a text comment or an audio recording.")
        comment_text = transcriber({"sampling_rate": sr, "raw": stream})["text"]

    emotions = predict_emotions(comment_text)
    emotions_as_text = ", ".join(emotions)

    # Author's note: with top 5 the summary may focus too much on the content of the
    # comments; with top 10 it tends to ramble with extra info from them.
    similar_comments = get_similar_comments(comment_text, top_n_comments=7)

    if zero_shot:
        generated_summary = get_summary(similar_comments)
    else:
        generated_summary = get_summary_few_shot(similar_comments)

    similar_comments_as_text = "\n\n".join(
        "\nText: %s \nEmotions: %s \nSimilarity score: %.2f"
        % (comment["text"], ", ".join(comment["emotions"]), comment["similarity_score"])
        for comment in similar_comments
    )

    return stream, comment_text, emotions_as_text, similar_comments_as_text, generated_summary


# EXAMPLE ZERO-SHOT

In [None]:
# Zero-shot example: zero_shot=True must be passed explicitly, because analyze_comment
# defaults to the few-shot prompt.
stream, comment_text, emotions_as_text, similar_comments_as_text, generated_summary = analyze_comment(
    stream=None, new_chunk=None, input_text="was he here?", zero_shot=True
)

#printing results with line breaks
print("Comment Text:", comment_text)
print("Emotions:", emotions_as_text)
print("\nSimilar Comments:\n", similar_comments_as_text)
print("\n\nSummary:", generated_summary)

Comment Text: was he here?
Emotions: neutral, curiosity

Similar Comments:
 
Text: Did he died? 
Emotions: curiosity 
Similarity score: 0.63


Text: was....was he not? 
Emotions: neutral 
Similarity score: 0.57


Text: Is this him? 
Emotions: curiosity 
Similarity score: 0.57


Text: This guys still alive? 
Emotions: curiosity 
Similarity score: 0.52


Text: Was he in your butt? 
Emotions: neutral, curiosity 
Similarity score: 0.52


Text: Hahaha was this the guy, OP? 
Emotions: amusement, curiosity 
Similarity score: 0.49


Text: Was he good back then? 
Emotions: curiosity 
Similarity score: 0.48


Summary: The comments express a high level of confusion and curiosity, as individuals question the identity and status of a person mentioned in the post. There is also a hint of amusement due to the unusual nature of the questions. Overall, the comments do not convey any strong positive or negative emotions.


# EXAMPLE FEW-SHOT

In [None]:
# Few-shot example: analyze a typed comment using the few-shot summary prompt.
few_shot_call_kwargs = dict(
    stream=None,
    new_chunk=None,
    input_text="I'm cooking eggs today",
    zero_shot=False,
)
(stream,
 comment_text,
 emotions_as_text,
 similar_comments_as_text,
 generated_summary) = analyze_comment(**few_shot_call_kwargs)

# Print every part of the result under its heading, with line breaks between sections.
for heading, value in (
    ("Comment Text:", comment_text),
    ("Emotions:", emotions_as_text),
    ("\nSimilar Comments:\n", similar_comments_as_text),
    ("\n\nSummary:", generated_summary),
):
    print(heading, value)

Comment Text: I'm cooking eggs today
Emotions: neutral

Similar Comments:
 
Text: Fried Egg is my #1 since cricket cafe stopped doing breakfast sandwiches. 
Emotions: neutral 
Similarity score: 0.59


Text: Fry a egg with that bad boy. And you got yourself a nice breakfast. 
Emotions: neutral 
Similarity score: 0.57


Text: I've never decided to have just one late night drunk egg 
Emotions: disapproval 
Similarity score: 0.57


Text: eggs last forever 
Emotions: neutral 
Similarity score: 0.56


Text: You can pry my eggs from my cold, dead, artery-clogged hands. 
Emotions: neutral 
Similarity score: 0.56


Text: We had a great thread of egg jokes in Slack this afternoon, after the whole "can this egg get more likes than [NAME]?" thing. 
Emotions: amusement 
Similarity score: 0.55


Text: Look at this fancy guy being able to afford eggs. 
Emotions: neutral 
Similarity score: 0.55


Summary: The comments express a strong affection and admiration towards eggs, with individuals sharing the

# INTERFACE

In [None]:
# Inputs follow analyze_comment's first three parameters: state, audio chunk, typed text.
interface_inputs = [
    "state",
    gr.Audio(sources=["microphone"], streaming=False),
    "text",
]

# Outputs follow analyze_comment's return tuple: state first, then one textbox per text result.
interface_outputs = ["state"] + [
    gr.Textbox(label=caption)
    for caption in ("Test comment", "Predicted emotions", "Similar comments", "Summary")
]

demo = gr.Interface(
    fn=analyze_comment,
    inputs=interface_inputs,
    outputs=interface_outputs,
    live=False,
)

demo.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://541d9c8dc08bcccde6.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [None]:
# NOTE(review): `y` is not assigned at notebook scope in the visible cells (it only exists
# as a local inside analyze_comment), so on a fresh kernel this cell presumably raises
# NameError - looks like leftover debugging; remove it or define `y` first.
y