# Optimized Speech-to-Text Model with NLP Tasks
This notebook transcribes speech with a pretrained Wav2Vec2 model and then runs NLP tasks on the transcript (sentiment analysis, keyword-based topic classification, and summarization) while keeping RAM usage low through half-precision weights and chunked inference.

In [None]:
!pip install torch torchaudio transformers datasets librosa nltk textblob scikit-learn

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [None]:
import re

import librosa
import numpy as np
import torch
from textblob import TextBlob
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import pipeline

In [None]:
# Pretrained English Wav2Vec2 checkpoint shared by the processor and the model.
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-english"

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)

# Load the weights directly in half precision to reduce RAM usage, instead of
# loading them as float32 and converting afterwards. The resulting model is
# fp16, exactly as a later `.half()` call would leave it.
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID, torch_dtype=torch.float16)

# Inference only. The trailing semicolon stops the cell from echoing the
# entire module tree as its output.
model.eval();

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [None]:
# Define your file path (REPLACE with actual path)
audio_path = "/content/sample_data/M4 Macbook Air Review： Too Easy!.mp3"

SAMPLE_RATE = 16000      # Hz; the rate the audio is resampled to and the processor is told about
chunk_duration = 30      # seconds of audio sent through the model per forward pass
# The model's convolutional feature encoder (kernels 10,3,3,3,3,2,2 with strides
# 5,2,2,2,2,2,2) needs at least 400 input samples to emit a single frame, so a
# shorter tail cannot be transcribed.
MIN_CHUNK_SAMPLES = 400

# Load the audio, resampled to SAMPLE_RATE
speech, sr = librosa.load(audio_path, sr=SAMPLE_RATE)

total_duration = len(speech) / sr
chunk_samples = chunk_duration * sr
transcriptions = []

# Step through the waveform by sample index rather than by whole seconds, so a
# fractional tail (e.g. the last 0.5 s of a 30.5 s file) and clips shorter than
# one second are not silently dropped.
for start in range(0, len(speech), chunk_samples):
    chunk = speech[start:start + chunk_samples]
    if len(chunk) < MIN_CHUNK_SAMPLES:
        continue  # under 25 ms of audio: too short for the feature encoder

    features = processor(chunk, return_tensors="pt", sampling_rate=sr)
    # Follow whatever device/dtype the model currently has instead of assuming fp16.
    input_values = features.input_values.to(device=model.device, dtype=model.dtype)

    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: most likely token per frame, collapsed by the processor.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcriptions.append(processor.batch_decode(predicted_ids)[0])

    start_s = start / sr
    end_s = (start + len(chunk)) / sr
    print(f"Processed {start_s:g}-{end_s:g} seconds")

# Join all transcriptions
transcription = " ".join(transcriptions)
print("Final Transcription:", transcription)


Processed 0-30 seconds


In [None]:

# Score the whole transcript; the sign of the polarity decides the label.
sentiment = TextBlob(transcription).sentiment.polarity

if sentiment > 0:
    sentiment_label = "Positive"
elif sentiment < 0:
    sentiment_label = "Negative"
else:
    sentiment_label = "Neutral"

print(f"Sentiment: {sentiment_label} (Score: {sentiment})")


In [None]:

# Topic -> trigger keywords. Dictionary order is the match priority: the first
# category with a hit wins.
categories = {
    "Technology": ["AI", "machine learning", "computer", "software"],
    "Automotive": ["car", "engine", "vehicle", "Tesla"],
    "Sports": ["football", "basketball", "tennis"],
    "Entertainment": ["movie", "music", "show"]
}

def classify_text(text):
    """Return the first category that has a keyword occurring in ``text``.

    Keywords are matched case-insensitively and as whole words (an optional
    plural "s" is accepted), so mixed-case keywords such as "AI" or "Tesla"
    can match and "car" does not fire on "scar".

    Args:
        text: The text to classify. ``None`` or an empty string is allowed.

    Returns:
        The name of the first matching category in ``categories`` order, or
        "Unknown" when nothing matches or ``text`` is empty.
    """
    if not text:
        return "Unknown"
    for category, keywords in categories.items():
        for keyword in keywords:
            # \b anchors keep the keyword from matching inside a longer word.
            pattern = rf"\b{re.escape(keyword)}s?\b"
            if re.search(pattern, text, flags=re.IGNORECASE):
                return category
    return "Unknown"

# Classify the full transcript by keyword and print the resulting category.
classification = classify_text(transcription)
print(f"Classification: {classification}")


In [None]:

# Name the checkpoint explicitly (this is the one the summarization pipeline
# falls back to when no model is given) so the result does not depend on an
# implicit library default.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

if transcription.strip():
    # truncation=True clips a transcript that exceeds the model's input window
    # instead of failing on it; in that case only the leading part is summarized.
    summary = summarizer(
        transcription,
        max_length=50,
        min_length=10,
        do_sample=False,
        truncation=True,
    )[0]["summary_text"]
else:
    # Nothing was transcribed, so there is nothing to summarize.
    summary = ""

print("Summary:", summary)


In [None]:

RESULTS_PATH = "results.txt"

# Assemble the whole report first so the file is written in a single call.
report = (
    f"Transcription:\n{transcription}\n\n"
    f"Sentiment: {sentiment_label} (Score: {sentiment})\n"
    f"Classification: {classification}\n"
    f"Summary:\n{summary}\n"
)

# Explicit UTF-8 so the output does not depend on the platform's default
# encoding if the transcript or summary contains non-ASCII characters.
with open(RESULTS_PATH, "w", encoding="utf-8") as f:
    f.write(report)

print(f"Results saved to {RESULTS_PATH}")
