In [None]:
# Download spaCy's small English pipeline; a later cell loads it with spacy.load("en_core_web_sm").
# NOTE(review): this cell runs before the cell that pip-installs spacy, so on an
# environment where spaCy is not already present it would fail — consider running
# it after the install cell.
!python -m spacy download en_core_web_sm

In [None]:
# Install the third-party packages for this notebook (versions are not pinned).
# NOTE(review): tensorflow, tensorflow_hub and speechbrain are not imported by any
# cell visible here — confirm they are still required.
!pip install transformers torch librosa soundfile spacy pydub gradio tensorflow tensorflow_hub speechbrain torchaudio

In [None]:
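# Optional: uncomment the two lines below to send signal 9 to the current kernel
# process, killing it (presumably to force a runtime restart so the packages
# installed above are picked up).
# import os
# os.kill(os.getpid(), 9)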

In [None]:
# Fetch the YAMNet class-map CSV from the TensorFlow models repository.
# NOTE(review): no cell visible here reads yamnet_class_map.csv — confirm it is still needed.
!wget https://raw.githubusercontent.com/tensorflow/models/master/research/audioset/yamnet/yamnet_class_map.csv
# Whisper is installed from PyPI and then again from the GitHub repository.
# NOTE(review): both lines appear to install the same package — confirm which source is intended.
!pip install openai-whisper
!pip install git+https://github.com/openai/whisper.git
# Install SpeechBrain from its develop branch (speechbrain is also listed in the earlier pip install cell).
!pip install git+https://github.com/speechbrain/speechbrain.git@develop
# !pip install torch

In [None]:
import whisper

# Load the Whisper "base" model used by get_text_from_audio below.
# NOTE(review): a later cell calls whisper.load_model("base") again and rebinds
# whisper_model, so on a full run the model is loaded twice.
whisper_model = whisper.load_model("base")

def get_text_from_audio(audio_file):
    """Transcribe an entire audio file in a single Whisper call.

    Args:
        audio_file: Path to the audio file to transcribe.

    Returns:
        The ``'text'`` entry of the Whisper transcription result.
    """
    # Whisper is handed a WAV path, so other formats are converted first.
    wav_path = convert_to_wav(audio_file)
    return whisper_model.transcribe(wav_path)['text']

100%|███████████████████████████████████████| 139M/139M [00:02<00:00, 55.0MiB/s]
  checkpoint = torch.load(fp, map_location=device)


In [None]:
# Translation table from the classifier's raw label ids to readable emotion
# names; classify_text_emotion looks predictions up here.
emotion_mapping = dict(
    LABEL_0="sadness",
    LABEL_1="joy",
    LABEL_2="love",
    LABEL_3="anger",
    LABEL_4="fear",
    LABEL_5="surprise",
)

In [None]:
import gradio as gr
import torch
import librosa
import soundfile as sf
import torchaudio
from transformers import pipeline
import whisper
import spacy
import numpy as np
from pydub import AudioSegment

# Load models
# Choose the pipeline device once: a CUDA index when a GPU is available,
# otherwise -1 (CPU). Hard-coding device=0 fails on machines without a GPU.
PIPELINE_DEVICE = 0 if torch.cuda.is_available() else -1

whisper_model = whisper.load_model("base")
nlp = spacy.load("en_core_web_sm")

summarizer = pipeline("summarization", model="t5-base", device=PIPELINE_DEVICE)
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=PIPELINE_DEVICE)
# NOTE(review): this checkpoint's name indicates a T5 (text-to-text) model; confirm
# that the text-classification pipeline really yields LABEL_0..LABEL_5 for it,
# because emotion_mapping relies on exactly those labels.
emotion_recognizer = pipeline("text-classification", model="mrm8488/t5-base-finetuned-emotion", device=PIPELINE_DEVICE)

def convert_to_wav(file_path):
    """Return the path of a WAV version of ``file_path``.

    A path that already ends in ``.wav`` is returned unchanged. The extension
    is compared case-insensitively, so ``CLIP.WAV`` is treated as WAV too
    instead of being decoded and re-encoded. Any other file is read with pydub
    and exported to ``converted_temp.wav`` in the current working directory,
    overwriting a file of that name left by an earlier call.

    Args:
        file_path: Path of the input audio file, as a string.

    Returns:
        ``file_path`` itself for WAV inputs, otherwise ``"converted_temp.wav"``.
    """
    if file_path.lower().endswith(".wav"):
        return file_path

    wav_path = "converted_temp.wav"
    audio = AudioSegment.from_file(file_path)
    audio.export(wav_path, format="wav")
    return wav_path

def split_audio_into_chunks(file_path, chunk_length=30):
    """Cut a WAV file into consecutive pieces of ``chunk_length`` seconds.

    Args:
        file_path: Path to a WAV file.
        chunk_length: Length of each piece in seconds (default 30).

    Returns:
        A list of slices of the loaded audio, in order; the final slice
        covers whatever remains.
    """
    recording = AudioSegment.from_wav(file_path)
    step_ms = chunk_length * 1000  # seconds -> milliseconds
    pieces = []
    for start_ms in range(0, len(recording), step_ms):
        pieces.append(recording[start_ms:start_ms + step_ms])
    return pieces

def transcribe_chunk(chunk):
    """Transcribe one audio chunk with the module-level Whisper model.

    The chunk is exported as WAV into a private temporary directory that is
    deleted as soon as transcription finishes. This avoids leaving a
    ``temp_chunk.wav`` behind in the working directory and keeps overlapping
    calls from overwriting each other's file.

    Args:
        chunk: Audio segment exposing ``export(path, format=...)``.

    Returns:
        The ``'text'`` entry of the Whisper transcription result.
    """
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as scratch_dir:
        temp_path = os.path.join(scratch_dir, "temp_chunk.wav")
        # export() returns the open output file; close it explicitly so the
        # data is flushed and the directory can be removed afterwards.
        chunk.export(temp_path, format="wav").close()
        result = whisper_model.transcribe(temp_path)
    return result['text']

def get_combined_transcription(audio_file, chunk_length=30):
    """Transcribe an audio file piece by piece and stitch the text together.

    Args:
        audio_file: Path to the audio file.
        chunk_length: Length of each piece in seconds (default 30).

    Returns:
        The per-chunk transcriptions joined by single spaces, with leading
        and trailing whitespace removed.
    """
    wav_path = convert_to_wav(audio_file)
    pieces = [
        transcribe_chunk(segment)
        for segment in split_audio_into_chunks(wav_path, chunk_length=chunk_length)
    ]
    return " ".join(pieces).strip()

def classify_text_emotion(text):
    """Return a readable emotion name for ``text``.

    Runs the module-level emotion classifier, takes the label of its first
    prediction (e.g. "LABEL_1") and looks it up in ``emotion_mapping``.

    Returns:
        The mapped emotion name, or "unknown emotion" when the label has no
        entry in the mapping.
    """
    top_prediction = emotion_recognizer(text)[0]
    return emotion_mapping.get(top_prediction["label"], "unknown emotion")

def generate_descriptive_insights(transcription, audio_emotion):
    """Describe the speaker's mood, emotion and topic in one paragraph.

    Only the first 512 characters of the transcription are analysed. That
    excerpt is summarized and sentiment-classified, and the results are
    combined with ``audio_emotion`` into a single sentence sequence.

    Args:
        transcription: Transcribed speech.
        audio_emotion: Emotion name already derived for the transcription.

    Returns:
        A descriptive string. When the excerpt contains no words, a fixed
        message saying that nothing could be analysed is returned and neither
        model is called.
    """
    # Keep the first 512 characters (characters, not tokens) so the
    # summarizer and the sentiment model both receive a bounded input.
    truncated_transcription = transcription[:512]

    word_count = len(truncated_transcription.split())
    if word_count == 0:
        # With no words the summarizer would be asked for max_length=0,
        # which cannot produce a meaningful summary.
        return "No speech could be transcribed from the audio, so no mood or topic insights are available."

    # Bound the requested summary length by the number of input words so a
    # short transcription is not asked for a longer summary than itself.
    max_length = min(word_count, 80)
    min_length = min(word_count, 25)

    # Step 1: Summarize the content
    summary = summarizer(truncated_transcription, max_length=max_length, min_length=min_length, do_sample=False)
    summarized_text = summary[0]['summary_text']

    # Step 2: Analyze the sentiment
    sentiment = sentiment_analyzer(truncated_transcription)
    mood = sentiment[0]['label'].lower()

    # Step 3: Combine sentiment, emotion and summary into the description
    mood_description = f"Based on the transcription, the speaker seems to have a {mood} mood. "
    mood_description += f"The text analysis indicates the speaker's emotion is {audio_emotion}. "
    mood_description += f"Topic discussed: {summarized_text}"

    return mood_description

def process_audio_and_text(audio, text=None):
    """Transcribe ``audio`` and answer the optional question in ``text``.

    Args:
        audio: Path to the audio file. An empty value (e.g. ``None`` when
            nothing was provided) yields an error payload instead of a crash.
        text: Optional free-text question. Keywords are matched
            case-insensitively:
            "mood" / "insight" / "saying" / "implying" -> descriptive insight;
            "genre" -> fixed placeholder genre;
            anything else -> entities and key phrases of the transcription.

    Returns:
        A ``(transcription, insights)`` tuple, where ``insights`` is a dict.
        Without a question the dict holds the detected emotion and the text
        insights.
    """
    if not audio:
        # Without a file path the conversion step would fail on a missing
        # value, so report the problem in the insights payload instead.
        return "", {"Error": "No audio file was provided."}

    # Step 1: Transcribe the audio file in chunks and combine the transcriptions
    transcription = get_combined_transcription(audio)

    # Step 2: Handle optional text input (if provided). Emotion analysis is
    # run only in the branches that actually report it.
    question = text.strip().lower() if text else ""
    if question:
        if any(keyword in question for keyword in ("mood", "insight", "saying", "implying")):
            audio_emotion = classify_text_emotion(transcription)
            descriptive_insights = generate_descriptive_insights(transcription, audio_emotion)
            return transcription, {"Descriptive Insight": descriptive_insights}
        if "genre" in question:
            return transcription, {"Genre": "Music"}  # Simplified for now
        return transcription, derive_insights_from_text(transcription)

    # Default behavior when no text is provided
    audio_emotion = classify_text_emotion(transcription)
    insights = derive_insights_from_text(transcription)
    return transcription, {"Audio Emotion": audio_emotion, "Insights": insights}

def derive_insights_from_text(text):
    """Extract named entities and noun-chunk key phrases from ``text``.

    Returns:
        A dict with "Extracted Entities" (a list of ``(text, label)`` pairs
        taken from ``doc.ents``) and "Key Phrases" (a list of the texts of
        ``doc.noun_chunks``).
    """
    parsed = nlp(text)
    return {
        "Extracted Entities": [(span.text, span.label_) for span in parsed.ents],
        "Key Phrases": [phrase.text for phrase in parsed.noun_chunks],
    }

# Gradio Interface
# Two inputs (an audio file and an optional question) are passed to
# process_audio_and_text; its two return values fill the transcription box
# and the JSON panel.
with gr.Blocks() as demo:
    # Input row: the audio file and the optional free-text question.
    with gr.Row():
        # type="filepath": process_audio_and_text treats this value as a path on disk.
        audio_input = gr.Audio(type="filepath", label="Upload Audio File")
        optional_text = gr.Textbox(lines=2, label="Optional Text Input (e.g., 'What is the mood?' or 'What is the genre? or the insights')")

    # Output row: plain-text transcription next to the insights dict.
    with gr.Row():
        audio_transcription = gr.Textbox(label="Transcription", placeholder="Audio transcription will appear here.")
        output_insights = gr.JSON(label="Derived Insights or Answer")

    audio_submit = gr.Button("Process Audio and Text")

    # The order of inputs/outputs matches the callback's parameters and return tuple.
    audio_submit.click(fn=process_audio_and_text, inputs=[audio_input, optional_text], outputs=[audio_transcription, output_insights])

# Start the app.
# NOTE(review): debug=True presumably keeps this cell running and shows callback
# errors in the cell output — confirm against the Gradio documentation.
demo.launch(debug=True)