In [3]:
pip install TTS



In [7]:
!pip install flask torch torchaudio TTS pyngrok flask-cors


Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Collecting flask-cors
  Downloading flask_cors-5.0.1-py3-none-any.whl.metadata (961 bytes)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Downloading flask_cors-5.0.1-py3-none-any.whl (11 kB)
Installing collected packages: pyngrok, flask-cors
Successfully installed flask-cors-5.0.1 pyngrok-7.2.3


In [None]:
import os
import torch
import torchaudio
from flask import Flask, request, send_file, jsonify
from flask_cors import CORS
from TTS.api import TTS
from pyngrok import ngrok

# Working directories: uploaded voice samples land in UPLOAD_FOLDER,
# synthesized speech is written to OUTPUT_FOLDER.
UPLOAD_FOLDER = "./uploads"
OUTPUT_FOLDER = "./outputs"
for _folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
    os.makedirs(_folder, exist_ok=True)

# Flask application; both folders are exposed through app.config so the
# request handlers can look them up there.
app = Flask(__name__)
app.config.update(
    UPLOAD_FOLDER=UPLOAD_FOLDER,
    OUTPUT_FOLDER=OUTPUT_FOLDER,
)

# Enable CORS to allow frontend requests from different origins
CORS(app)

# Read the ngrok auth token from the environment instead of embedding it in the
# notebook, where it leaks to anyone who can read the file or its history.
# SECURITY: the token that was previously hardcoded on this line must be
# considered compromised and should be revoked in the ngrok dashboard.
_ngrok_token = os.environ.get("NGROK_AUTHTOKEN")
if not _ngrok_token:
    raise RuntimeError(
        "Set the NGROK_AUTHTOKEN environment variable before running this cell."
    )
ngrok.set_auth_token(_ngrok_token)

# Open a tunnel to the local port the Flask server listens on.
# `public_url` stays the tunnel object returned by ngrok.connect; only the
# banner below prints its `.public_url` string so the message is not nested.
public_url = ngrok.connect(5000)
print(f" * ngrok tunnel \"{public_url.public_url}\" -> \"http://127.0.0.1:5000\"")

# Pick the compute device: CUDA when torch reports a usable GPU, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Instantiate the XTTS v2 model once at startup and place it on that device;
# the request handler reuses this single `tts` instance.
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
tts = tts.to(device)
print("TTS model loaded on", device)

@app.route('/')
def home():
    """Liveness endpoint: reply with a fixed JSON message."""
    status = {"message": "Backend is running."}
    return jsonify(status)

@app.route('/process', methods=['POST'])
def process():
    """Synthesize the submitted text using the uploaded voice sample.

    Expects a multipart/form-data POST with:
      * ``voice_sample`` -- an audio file that ``torchaudio.load`` can read.
      * ``text`` -- the text to speak.

    Returns:
        The generated WAV file as an attachment (download name
        ``output_speech.wav``) on success; otherwise a JSON body
        ``{"error": ...}`` with status 400 (missing/invalid input, audio that
        cannot be loaded or resampled) or 500 (failure while writing the
        reference audio or generating speech).
    """
    import uuid  # local import: only this handler needs it

    # Validate file upload
    if 'voice_sample' not in request.files:
        return jsonify({"error": "No voice sample provided."}), 400
    file = request.files['voice_sample']
    if not file.filename:  # covers both '' and a missing (None) filename
        return jsonify({"error": "No file selected."}), 400

    # Validate text input
    text = request.form.get('text')
    if not text:
        return jsonify({"error": "Text input is required."}), 400

    # Never build filesystem paths from the client-supplied filename: it may
    # contain directory components. Each request gets its own random id so
    # overlapping requests cannot overwrite one another's files.
    request_id = uuid.uuid4().hex
    # Keep only a plain alphanumeric extension from the upload's name.
    extension = os.path.splitext(os.path.basename(file.filename))[1].lower()
    if not extension[1:].isalnum():
        extension = ""

    # Save the uploaded voice sample
    input_filepath = os.path.join(app.config['UPLOAD_FOLDER'], f"{request_id}{extension}")
    file.save(input_filepath)

    # Load the audio file and resample if needed
    try:
        waveform, sample_rate = torchaudio.load(input_filepath)
    except Exception as e:
        return jsonify({"error": f"Error loading audio file: {str(e)}"}), 400

    if sample_rate != 22050:
        try:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=22050)
            waveform = resampler(waveform)
        except Exception as e:
            return jsonify({"error": f"Error during resampling: {str(e)}"}), 400

    # Save the cleaned voice file for TTS processing. It is always written as
    # .wav, whatever the upload's extension was, and a write failure is
    # reported as JSON instead of escaping as an unhandled exception.
    cleaned_filepath = os.path.join(app.config['UPLOAD_FOLDER'], f"cleaned_{request_id}.wav")
    try:
        torchaudio.save(cleaned_filepath, waveform, 22050)
    except Exception as e:
        return jsonify({"error": f"Error saving processed audio: {str(e)}"}), 500

    # Define the output file path (unique per request, see request_id above)
    output_filepath = os.path.join(app.config['OUTPUT_FOLDER'], f"{request_id}.wav")

    try:
        # Generate speech using the TTS model with the provided voice sample
        tts.tts_to_file(text=text, speaker_wav=cleaned_filepath, language="en", file_path=output_filepath)
    except Exception as e:
        return jsonify({"error": f"Error generating speech: {str(e)}"}), 500

    # Return the generated file as an attachment; download_name keeps the
    # client-visible filename stable even though the on-disk name is random.
    return send_file(output_filepath, as_attachment=True, download_name="output_speech.wav")

# Start Flask's built-in server on local port 5000 when this cell/script is
# executed directly (the same port number passed to ngrok.connect above).
if __name__ == "__main__":
    app.run(port=5000)


 * ngrok tunnel "NgrokTunnel: "https://9717-35-197-49-174.ngrok-free.app" -> "http://localhost:5000"" -> "http://127.0.0.1:5000"
 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts




TTS model loaded on cuda
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [07/Mar/2025 04:39:05] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [07/Mar/2025 04:39:06] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [07/Mar/2025 04:40:26] "[31m[1mPOST / HTTP/1.1[0m" 405 -
INFO:werkzeug:127.0.0.1 - - [07/Mar/2025 04:41:01] "[31m[1mPOST / HTTP/1.1[0m" 405 -
INFO:werkzeug:127.0.0.1 - - [07/Mar/2025 04:42:59] "[31m[1mPOST / HTTP/1.1[0m" 405 -


 > Text splitted to sentences.
['hello there i am priyanshu']
 > Text splitted to sentences.
['hello there i am priyanshu']
 > Text splitted to sentences.
['hello there i am priyanshu']
 > Text splitted to sentences.
['hello there i am priyanshu']
 > Text splitted to sentences.
['hello there i am priyanshu']
 > Text splitted to sentences.
['hello there i am priyanshu']
 > Text splitted to sentences.
['hello there i am priyanshu']
 > Text splitted to sentences.
['hello there i am priyanshu']
 > Text splitted to sentences.
['hello there i am priyanshu']
 > Text splitted to sentences.
['hello there i am priyanshu']


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
INFO:werkzeug:127.0.0.1 - - [07/Mar/2025 04:50:47] "POST /process HTTP/1.1" 200 -


 > Processing time: 16.554147005081177
 > Real-time factor: 6.450009567819478


INFO:werkzeug:127.0.0.1 - - [07/Mar/2025 04:50:47] "POST /process HTTP/1.1" 200 -


 > Processing time: 16.92993712425232
 > Real-time factor: 6.450530715886156


INFO:werkzeug:127.0.0.1 - - [07/Mar/2025 04:50:49] "POST /process HTTP/1.1" 200 -


 > Processing time: 19.120213270187378
 > Real-time factor: 6.431545988034411


INFO:werkzeug:127.0.0.1 - - [07/Mar/2025 04:50:50] "POST /process HTTP/1.1" 200 -


 > Processing time: 19.08771562576294
 > Real-time factor: 6.20260742672826


INFO:werkzeug:127.0.0.1 - - [07/Mar/2025 04:50:50] "POST /process HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [07/Mar/2025 04:50:50] "POST /process HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [07/Mar/2025 04:50:50] "POST /process HTTP/1.1" 200 -


 > Processing time: 20.140294790267944
 > Real-time factor: 6.150198040735212
 > Processing time: 20.003557443618774
 > Real-time factor: 6.002047159152433
 > Processing time: 20.50180673599243
 > Real-time factor: 6.067007173725482


INFO:werkzeug:127.0.0.1 - - [07/Mar/2025 04:50:51] "POST /process HTTP/1.1" 200 -


 > Processing time: 20.40723466873169
 > Real-time factor: 5.9571532043731965


INFO:werkzeug:127.0.0.1 - - [07/Mar/2025 04:50:51] "POST /process HTTP/1.1" 200 -


 > Processing time: 21.20509958267212
 > Real-time factor: 5.760551520277945


INFO:werkzeug:127.0.0.1 - - [07/Mar/2025 04:50:51] "POST /process HTTP/1.1" 200 -


 > Processing time: 22.55424737930298
 > Real-time factor: 5.321447041534312
 > Text splitted to sentences.
['hello there i am priyanshu soni this is an ai generated voice for content creation.']
 > Text splitted to sentences.
['hello there i am priyanshu soni this is an ai generated voice for content creation.']


INFO:werkzeug:127.0.0.1 - - [07/Mar/2025 04:51:48] "POST /process HTTP/1.1" 200 -


 > Processing time: 7.608701467514038
 > Real-time factor: 0.9594420084104477


INFO:werkzeug:127.0.0.1 - - [07/Mar/2025 04:51:48] "POST /process HTTP/1.1" 200 -


 > Processing time: 7.766062021255493
 > Real-time factor: 0.9555043498832896
 > Text splitted to sentences.
['hello there i am priyanshu soni this is an ai generated voice for content creation.']


INFO:werkzeug:127.0.0.1 - - [07/Mar/2025 04:52:53] "POST /process HTTP/1.1" 200 -


 > Processing time: 8.885574340820312
 > Real-time factor: 0.4340966407332054
