In [None]:
import nltk
import re
import os
from pydub import AudioSegment
import whisper
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# NLTK resources live in a project-local folder: register that folder on
# NLTK's search path, then fetch each required package into it. The download
# call runs on every execution of this cell.
NLTK_DATA_DIR = 'C:/Users/ssromerogon/Documents/vscode_working_dir/meeting_minutes_processing/'
nltk.data.path.append(NLTK_DATA_DIR)
for nltk_package in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_package, download_dir=NLTK_DATA_DIR)

# 1. Convert MP3 to WAV using pydub
audio_file = "scdc_11_13_24.mp3"
wav_file = "scdc_11_13_24.wav"

# Decode the MP3 and write the same audio back out as a WAV file.
audio = AudioSegment.from_mp3(audio_file)
audio.export(wav_file, format="wav")

# 2. Speech-to-Text Conversion using Whisper
# Model size is a speed/accuracy trade-off: "tiny", "small", or "large"
# depending on what the machine can handle.
model = whisper.load_model("large")
# Transcribe the WAV produced in step 1. Previously the original MP3 was
# passed here, which left the converted file unused.
result = model.transcribe(wav_file)
# The transcription result is a mapping; its "text" entry is the transcript.
text = result["text"]

# Check if the transcription was successful
if not text.strip():
    print("Whisper model could not transcribe the audio. The text is empty.")
else:
    try:
        # Clean the text. Non-ASCII runs are replaced with a space FIRST and
        # whitespace is collapsed/stripped AFTERWARDS; doing it the other way
        # round lets the replacement reintroduce double spaces and
        # leading/trailing blanks into the "cleaned" transcript.
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Replace non-ASCII runs with a space
        text = re.sub(r'\s+', ' ', text).strip()  # Collapse whitespace runs, trim both ends

        # 3. Natural Language Processing (NLP)
        # Tokenization: sentence-level and word-level views of the transcript
        sentences = sent_tokenize(text)
        words = word_tokenize(text)

        # Stop Word Removal (case-insensitive comparison against the English list)
        stop_words = set(stopwords.words('english'))
        filtered_words = [word for word in words if word.lower() not in stop_words]

        # Stemming
        # NOTE(review): filtered_words / stemmed_words are not consumed by the
        # later steps visible in this cell; kept in case other cells use them.
        stemmer = PorterStemmer()
        stemmed_words = [stemmer.stem(word) for word in filtered_words]

        # 4. Enhanced Meeting Minute Generation
        def generate_minutes(sentences):
            """Return the sentences that mention any tracked keyword or phrase.

            Each sentence is lower-cased and kept when at least one trigger
            below occurs in it as a plain substring (so "plan" also matches
            "explanation"). Input order is preserved and a sentence is
            emitted at most once, however many triggers it contains.

            Args:
                sentences: iterable of sentence strings.

            Returns:
                A list with the matching sentences, unmodified.
            """
            # Triggers grouped by theme; the grouping is for readability only,
            # every group leads to the same outcome (the sentence is kept).
            action_keywords = [
                "action item", "decision", "next steps", "plan", "task", "should do",
                "must do", "follow up", "agenda", "to do", "next task", "experiment"
            ]
            decision_keywords = [
                "decide", "decision", "conclude", "agree", "determine", "should", "feel confident",
                "ready to launch", "launch", "plan the big experiment", "recruiting"
            ]
            experiment_related_phrases = [
                "experiment", "preliminary", "differential expression", "differential variability",
                "spatial domain", "biology is changing", "exciting ways", "cells", "cell type",
                "annotation"
            ]
            collaboration_phrases = [
                "shed light", "help us", "ready to launch", "prepared", "recruiting", "launch into"
            ]
            triggers = (
                action_keywords
                + decision_keywords
                + experiment_related_phrases
                + collaboration_phrases
            )

            def mentions_trigger(sentence):
                # Lower-case once per sentence, then scan every trigger.
                lowered = sentence.lower()
                return any(trigger in lowered for trigger in triggers)

            return [sentence for sentence in sentences if mentions_trigger(sentence)]

        # Select the minute-worthy sentences from the tokenized transcript.
        meeting_minutes = generate_minutes(sentences)

        # 5. Write the cleaned transcript and the minutes to text files
        raw_text_file = "raw_transcribed_text.txt"
        with open(raw_text_file, "w") as raw_out:
            raw_out.write(text)

        # Minutes file: one selected sentence per line.
        output_file = "meeting_minutes.txt"
        with open(output_file, "w") as minutes_out:
            minutes_out.writelines(line + "\n" for line in meeting_minutes)

        # Report where both files were written.
        print("Raw text and meeting minutes saved.")
        print(f"Raw text saved to {raw_text_file}")
        print(f"Meeting minutes saved to {output_file}")

    except Exception as e:
        print("Error during tokenization or NLP processing:", e)

[nltk_data] Downloading package punkt to C:/Users/ssromerogon/Document
[nltk_data]     s/vscode_working_dir/meeting_minutes_processing/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:/Users/ssromerogon/Docu
[nltk_data]     ments/vscode_working_dir/meeting_minutes_processing/..
[nltk_data]     .
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:/Users/ssromerogon/Docu
[nltk_data]     ments/vscode_working_dir/meeting_minutes_processing/..
[nltk_data]     .
[nltk_data]   Package punkt_tab is already up-to-date!
100%|█████████████████████████████████████| 2.88G/2.88G [03:31<00:00, 14.6MiB/s]
  checkpoint = torch.load(fp, map_location=device)
