In [None]:
# Install FastAPI, Uvicorn (ASGI server), and pyngrok
!pip install fastapi uvicorn pyngrok python-multipart


Collecting uvicorn
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.2-py3-none-any.whl.metadata (8.4 kB)
Collecting python-multipart
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Downloading uvicorn-0.34.0-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.2-py3-none-any.whl (22 kB)
Downloading python_multipart-0.0.20-py3-none-any.whl (24 kB)
Installing collected packages: uvicorn, python-multipart, pyngrok
Successfully installed pyngrok-7.2.2 python-multipart-0.0.20 uvicorn-0.34.0


In [None]:
# Save an ngrok authtoken to ngrok's configuration file.
# NOTE(review): no token follows the command in this saved copy (it looks like it was
# stripped before sharing) — supply your own token after `add-authtoken` before running.
!ngrok config add-authtoken 

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from fastapi import FastAPI, File, UploadFile, Form
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from g2p_en import G2p
from rapidfuzz import fuzz
import torch
import librosa
import io
import nltk
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
import ffmpeg
import tempfile
import os
from openai import OpenAI


# Fetch the NLTK tagger resources (both the legacy and the "_eng" package names).
# NOTE(review): presumably required by g2p_en at conversion time — confirm.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
app = FastAPI()

# NOTE(review): wildcard origins together with allow_credentials=True is very permissive;
# restrict allow_origins to the real frontend origin(s) before exposing this publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_credentials=True,
    allow_methods=['*'],
    allow_headers=['*'],
)

# Read the xAI key from the environment instead of embedding it in the notebook.
# Falls back to the previous empty-string value when XAI_API_KEY is not set.
XAI_API_KEY = os.environ.get("XAI_API_KEY", "")
client = OpenAI(
    api_key=XAI_API_KEY,
    base_url="https://api.x.ai/v1",
)


def convert_webm_to_wav(webm_blob: bytes) -> bytes:
    """
    Converts a WebM audio blob to a WAV audio blob.

    The blob is written into a private scratch directory, transcoded by FFmpeg,
    and the resulting WAV file is read back into memory. The scratch directory
    and everything in it is removed when the function exits, whether or not
    the conversion succeeded.

    Args:
        webm_blob (bytes): Audio data in WebM format.

    Returns:
        bytes: Audio data in WAV format.

    Raises:
        RuntimeError: If FFmpeg fails to convert the input.
    """
    # A TemporaryDirectory guarantees cleanup of both files in every exit path,
    # including failures that happen before the output file exists.
    with tempfile.TemporaryDirectory() as work_dir:
        input_path = os.path.join(work_dir, "input.webm")
        output_path = os.path.join(work_dir, "output.wav")

        # The file is closed on leaving this `with`, so FFmpeg can open it afterwards
        with open(input_path, "wb") as input_file:
            input_file.write(webm_blob)

        try:
            # Use FFmpeg to convert WebM to WAV
            (
                ffmpeg
                .input(input_path)
                .output(output_path, format='wav')
                .run(overwrite_output=True, quiet=True)
            )
        except ffmpeg.Error as e:
            # stderr is captured as bytes; decode it so the log is readable
            stderr = e.stderr
            if isinstance(stderr, bytes):
                stderr = stderr.decode("utf-8", errors="replace")
            print(f"FFmpeg error: {stderr}")
            raise RuntimeError("Failed to convert WebM to WAV") from e

        # Read the WAV blob before the scratch directory is deleted
        with open(output_path, 'rb') as f:
            return f.read()




# Checkpoint name shared by the Wav2Vec2 processor and the CTC model
_WAV2VEC2_CHECKPOINT = "facebook/wav2vec2-large-960h"

# Both objects are created once at import time and reused by transcribe_audio
processor = Wav2Vec2Processor.from_pretrained(_WAV2VEC2_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(_WAV2VEC2_CHECKPOINT)

# Text-to-phoneme converter used by text_to_phonemes
g2p = G2p()

# Helper function to transcribe audio
def transcribe_audio(audio_data):
    """Transcribe in-memory audio bytes and return the text in lowercase.

    The bytes are decoded with librosa at a 16 kHz sample rate, passed through
    the module-level Wav2Vec2 ``processor`` and ``model``, and the highest
    scoring token id at each position is decoded back to text.
    """
    # Decode the audio bytes (WAV expected) at a 16 kHz target rate
    waveform, sample_rate = librosa.load(io.BytesIO(audio_data), sr=16000)
    encoded = processor(
        waveform,
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: no gradients are needed
    with torch.no_grad():
        frame_logits = model(encoded.input_values).logits
    best_ids = frame_logits.argmax(dim=-1)

    # Only the first (and only) item of the batch is decoded
    text = processor.decode(best_ids[0])
    return text.lower()

# Helper function to convert text to phonemes
def text_to_phonemes(text):
    """Convert ``text`` with the module-level ``g2p`` object and return its output items joined by single spaces."""
    return " ".join(g2p(text))

# Helper function to calculate similarity
def phoneme_similarity(reference, user_output):
    """Return ``fuzz.ratio`` of the two strings divided by 100, i.e. rescaled to the range [0, 1]."""
    percent_match = fuzz.ratio(reference, user_output)
    return percent_match / 100.0

# Generate feedback based on similarity score
def generate_feedback(reference_text, transcription, reference_phonemes, user_phonemes, similarity_score):
    """Request written pronunciation feedback from the chat model behind ``client``.

    All five arguments are interpolated verbatim into the user prompt. The
    function returns the text content of the first choice in the completion.
    """
    # User message: the analysis data followed by the required answer layout
    user_prompt = f"""
    You are an AI language expert providing pronunciation feedback. Analyze the following data:

    - Reference Text: {reference_text}
    - Transcription: {transcription}
    - Reference Phonemes: {reference_phonemes}
    - User Phonemes: {user_phonemes}
    - Similarity Score: {similarity_score}

    Provide feedback in this format:
    1. Overall Feedback (based on similarity score)
    2. Specific Phoneme Feedback (highlight mismatched phonemes, if any, and give improvement suggestions)
    3. Encouragement (to motivate the user)
    """

    chat_messages = [
        {"role": "system", "content": "You are an AI pronunciation coach."},
        {"role": "user", "content": user_prompt},
    ]

    # Chat completion through the OpenAI-compatible client configured at module level
    response = client.chat.completions.create(
        model="grok-2-1212",
        messages=chat_messages,
        max_tokens=200,
        temperature=0.5,
    )

    first_choice = response.choices[0]
    return first_choice.message.content

# Endpoint to analyze pronunciation
@app.post("/analyze-pronunciation/")
async def analyze_pronunciation(file: UploadFile = File(...), reference_text: str = Form(...)):
    """
    Score an uploaded recording against a reference sentence.

    Args:
        file (UploadFile): Uploaded audio; its bytes are passed to
            convert_webm_to_wav, so WebM input is expected.
        reference_text (str): The sentence the speaker was asked to say.

    Returns:
        dict: On success, the keys "reference_text", "transcription",
        "reference_phonemes", "user_phonemes", "similarity_score"
        (a percentage rounded to two decimals) and "feedback".
        On any failure, a dict with a single "error" key holding the
        exception message.
    """
    try:
        # Step 1: Read the uploaded audio file
        webm_blob = await file.read()

        # Step 2: Convert to WAV and transcribe the audio
        audio_data = convert_webm_to_wav(webm_blob)
        transcription = transcribe_audio(audio_data)

        # Step 3: Convert reference and transcription to phonemes
        reference_phonemes = text_to_phonemes(reference_text)
        user_phonemes = text_to_phonemes(transcription)

        # Step 4: Compare phonemes and express the similarity as a percentage
        similarity = phoneme_similarity(reference_phonemes, user_phonemes)
        similarity_score = round(similarity * 100, 2)

        # Step 5: Generate feedback exactly once, passing every argument
        # generate_feedback requires (a one-argument call raises TypeError)
        feedback = generate_feedback(
            reference_text,
            transcription,
            reference_phonemes,
            user_phonemes,
            similarity_score,
        )

        # Step 6: Return the results
        return {
            "reference_text": reference_text,
            "transcription": transcription,
            "reference_phonemes": reference_phonemes,
            "user_phonemes": user_phonemes,
            "similarity_score": similarity_score,
            "feedback": feedback
        }

    except Exception as e:
        # Kept as-is so existing clients still receive the {"error": ...} payload
        return {"error": str(e)}

# Index
@app.get("/")
def read_root():
    """Answer requests to the root path with a fixed JSON greeting."""
    greeting = {"message": "Hello from the Under World!"}
    return JSONResponse(content=greeting)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional 

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Save the FastAPI server code into a file named server.py

# Source text of the standalone server module. It is kept as a string so it can be
# written to server.py and served by uvicorn in a separate process.
# Compared with a plain copy of the in-notebook definitions, this version reads the
# xAI key from the XAI_API_KEY environment variable and does its temp-file handling
# inside a TemporaryDirectory so cleanup cannot fail half-way.
code = '''
from fastapi import FastAPI, File, UploadFile, Form
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from g2p_en import G2p
from rapidfuzz import fuzz
import torch
import librosa
import io
import nltk
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
import ffmpeg
import tempfile
import os
from openai import OpenAI


nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_credentials=True,
    allow_methods=['*'],
    allow_headers=['*'],
)

# Read the key from the environment (inherited from the launching process);
# falls back to an empty string when XAI_API_KEY is not set.
XAI_API_KEY = os.environ.get("XAI_API_KEY", "")
client = OpenAI(
    api_key=XAI_API_KEY,
    base_url="https://api.x.ai/v1",
)


def convert_webm_to_wav(webm_blob: bytes) -> bytes:
    """
    Converts a WebM audio blob to a WAV audio blob.

    Args:
        webm_blob (bytes): Audio data in WebM format.

    Returns:
        bytes: Audio data in WAV format.

    Raises:
        RuntimeError: If FFmpeg fails to convert the input.
    """
    # The scratch directory and both files in it are removed on every exit path
    with tempfile.TemporaryDirectory() as work_dir:
        input_path = os.path.join(work_dir, "input.webm")
        output_path = os.path.join(work_dir, "output.wav")

        with open(input_path, "wb") as input_file:
            input_file.write(webm_blob)

        try:
            # Use FFmpeg to convert WebM to WAV
            (
                ffmpeg
                .input(input_path)
                .output(output_path, format='wav')
                .run(overwrite_output=True, quiet=True)
            )
        except ffmpeg.Error as e:
            stderr = e.stderr
            if isinstance(stderr, bytes):
                stderr = stderr.decode("utf-8", errors="replace")
            print(f"FFmpeg error: {stderr}")
            raise RuntimeError("Failed to convert WebM to WAV") from e

        # Read the WAV blob before the scratch directory is deleted
        with open(output_path, 'rb') as f:
            return f.read()




# Load the pre-trained Wav2Vec2 model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# Initialize G2p for text-to-phoneme conversion
g2p = G2p()

# Helper function to transcribe audio
def transcribe_audio(audio_data):
    # Load audio (WAV format expected)
    speech, rate = librosa.load(io.BytesIO(audio_data), sr=16000)
    input_values = processor(speech, sampling_rate=rate, return_tensors="pt", padding=True).input_values

    # Get predictions
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    # Decode transcription
    transcription = processor.decode(predicted_ids[0])
    return transcription.lower()

# Helper function to convert text to phonemes
def text_to_phonemes(text):
    phonemes = g2p(text)
    return " ".join(phonemes)

# Helper function to calculate similarity
def phoneme_similarity(reference, user_output):
    return fuzz.ratio(reference, user_output) / 100.0  # Normalize to [0, 1]

# Generate feedback based on similarity score
def generate_feedback(reference_text, transcription, reference_phonemes, user_phonemes, similarity_score):
    # Prompt for the chat model
    prompt = f"""
    You are an AI language expert providing pronunciation feedback. Analyze the following data:

    - Reference Text: {reference_text}
    - Transcription: {transcription}
    - Reference Phonemes: {reference_phonemes}
    - User Phonemes: {user_phonemes}
    - Similarity Score: {similarity_score}

    Provide feedback in this format:
    1. Overall Feedback (based on similarity score)
    2. Phoneme Feedback (give improvement suggestions)
    3. Encouragement (to motivate the user)
    """

    # Call the OpenAI-compatible chat completions API
    completion = client.chat.completions.create(
        model="grok-2-1212",
        messages=[
            {"role": "system", "content": "You are an AI pronunciation coach."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=300,
        temperature=0.5
    )

    return completion.choices[0].message.content

# Endpoint to analyze pronunciation
@app.post("/analyze-pronunciation/")
async def analyze_pronunciation(file: UploadFile = File(...), reference_text: str = Form(...)):
    try:
        # Step 1: Read the uploaded audio file
        webm_blob = await file.read()

        # Step 2: Transcribe the audio
        audio_data = convert_webm_to_wav(webm_blob)
        transcription = transcribe_audio(audio_data)

        # Step 3: Convert reference and transcription to phonemes
        reference_phonemes = text_to_phonemes(reference_text)
        user_phonemes = text_to_phonemes(transcription)

        # Step 4: Compare phonemes and calculate similarity
        similarity = phoneme_similarity(reference_phonemes, user_phonemes)
        similarity_score = round(similarity * 100, 2)

        # Step 5: Generate feedback
        feedback = generate_feedback(reference_text, transcription, reference_phonemes, user_phonemes, similarity_score)

        # Step 6: Return the results
        return {
            "reference_text": reference_text,
            "transcription": transcription,
            "reference_phonemes": reference_phonemes,
            "user_phonemes": user_phonemes,
            "similarity_score": similarity_score,
            "feedback": feedback
        }

    except Exception as e:
        return {"error": str(e)}

# Index
@app.get("/")
def read_root():
    return JSONResponse(content={"message": "Hello from the Under World!"})
'''

# Write the server source to server.py in the working directory,
# where the later `uvicorn server:app` launch looks for it
with open("server.py", "w") as server_file:
    server_file.write(code)

In [None]:
from pyngrok import ngrok
import subprocess
import time

import socket

SERVER_PORT = 8000
# Upper bound on how long to wait for uvicorn to start listening.
# NOTE(review): server.py loads a large speech model at import time, so startup can
# take far longer than a few seconds — tune this value for your runtime.
STARTUP_TIMEOUT_S = 300

# Run FastAPI server in the background
server_process = subprocess.Popen(
    ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", str(SERVER_PORT)]
)

# Wait until the server really accepts connections instead of sleeping a fixed time,
# and fail loudly if the process dies or never comes up.
startup_deadline = time.monotonic() + STARTUP_TIMEOUT_S
while True:
    if server_process.poll() is not None:
        raise RuntimeError(
            f"uvicorn exited during startup with code {server_process.returncode}"
        )
    try:
        socket.create_connection(("127.0.0.1", SERVER_PORT), timeout=1).close()
        break
    except OSError:
        if time.monotonic() >= startup_deadline:
            server_process.terminate()
            raise TimeoutError(
                f"server did not start listening on port {SERVER_PORT} "
                f"within {STARTUP_TIMEOUT_S} seconds"
            ) from None
        time.sleep(1)

# Create a public URL using ngrok
public_url = ngrok.connect(SERVER_PORT).public_url
print(f"🚀 FastAPI is running at: {public_url}")

🚀 FastAPI is running at: https://5c3e-34-150-224-60.ngrok-free.app


In [None]:
# Terminate the server process and disconnect ngrok
# Ask the server to stop and wait for it to exit, so the process is reaped and
# port 8000 is free before the start cell is run again; force-kill if it hangs.
server_process.terminate()
try:
    server_process.wait(timeout=10)
except subprocess.TimeoutExpired:
    server_process.kill()
    server_process.wait()

# Close the public tunnel that pointed at the server
ngrok.disconnect(public_url)


