In [None]:
# === Create music directory and upload background music ===
# Creates a `music/` folder in the current working directory (`-p` makes this a
# no-op when the folder already exists). The cell itself uploads nothing.
# NOTE(review): no code visible in this notebook reads from `music/`; background
# music is passed in through the Gradio upload widget instead - confirm whether
# this folder is still needed.
!mkdir -p music


In [None]:
# === Install Dependencies (Colab/Linux) ===
# System packages: ImageMagick, fontconfig plus font packages, PortAudio headers, ffmpeg.
!apt-get update
!apt-get install -y imagemagick fontconfig ttf-mscorefonts-installer fonts-dejavu portaudio19-dev ffmpeg
# Force a rebuild of the fontconfig cache (verbose) after installing the fonts.
!fc-cache -fv
# Rewrites every rights="none" entry in ImageMagick's policy.xml to rights="read|write".
# NOTE(review): presumably required so MoviePy's TextClip can render captions through
# ImageMagick - confirm. It relaxes the policy for every entry that was "none".
!sed -i 's/rights=\"none\"/rights=\"read|write\"/g' /etc/ImageMagick*/policy.xml
# Python packages are installed without version pins, so a fresh runtime may
# resolve different versions than the ones this notebook was last run with.
!pip install TTS moviepy diffusers transformers accelerate scipy torchaudio gradio pydub

# === Imports ===
import os, gc, re, torch, requests, datetime, random
from transformers import CLIPTokenizer
from diffusers import StableDiffusionXLPipeline
from moviepy.editor import *
from pydub import AudioSegment
from TTS.api import TTS
import gradio as gr

# The Groq API key is read from the environment so that it is never stored in the
# notebook. Set it before running this cell (for example `%env GROQ_API_KEY=...`
# or a Colab secret exported to the environment).
# SECURITY: the key that used to be hardcoded on this line was committed in
# plain text and must be treated as leaked - revoke it in the Groq console.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "").strip()
if not GROQ_API_KEY:
    print("[⚠️] GROQ_API_KEY is not set; Groq requests will fail and fallback text will be used.")



# === Load Tokenizer ===
# CLIP tokenizer used by the prompt helpers in this notebook to count and cut
# prompt tokens before the prompt is handed to the image pipeline.
# NOTE(review): this is the tokenizer of the ViT-B/32 checkpoint, not the one
# bundled with the SDXL pipeline - presumably the vocabularies match; confirm.
clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")


0% [Working]            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.81)] [Connecting to security.                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.81)] [Connecting to security.0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Connected                                                                               Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.la

In [None]:
# === Load Models ===
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is only a sensible choice on the GPU. On CPU fall back to
# float32, where float16 kernels may be missing or very slow.
model_dtype = torch.float16 if device == "cuda" else torch.float32
pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=model_dtype,
).to(device)
tts_model = TTS("tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to(device)

# === Memory Management ===
def free_cuda():
    """Run the garbage collector and, if a GPU is present, release cached CUDA memory.

    Safe to call on CPU-only machines: the CUDA calls are skipped when
    ``torch.cuda.is_available()`` is false.

    Returns:
        None.
    """
    # Collect first: objects kept alive only by reference cycles have to be
    # destroyed before the allocator can hand their blocks back in empty_cache().
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

# === Helpers ===
def clean_output(text):
    """Round-trip ``text`` through UTF-16, dropping whatever cannot be decoded back.

    Encoding uses the ``surrogatepass`` handler so surrogate code points do not
    raise; decoding uses ``ignore`` so undecodable units are silently removed.

    Args:
        text: The string to sanitise.

    Returns:
        The sanitised string.
    """
    utf16_bytes = text.encode("utf-16", errors="surrogatepass")
    return utf16_bytes.decode("utf-16", errors="ignore")

def fallback_story(topic):
    """Return a canned ``(title, story)`` pair for ``topic``.

    Used when no usable text could be obtained from the language model.

    Args:
        topic: Subject of the post. Surrounding whitespace is removed.

    Returns:
        Tuple ``(title, story)`` of plain strings.
    """
    subject = topic.strip()
    # Upper-case only the first character. str.capitalize() would also lower-case
    # everything after it and mangle acronyms and proper nouns
    # ("AI in Indian Education" -> "Ai in indian education").
    subject = subject[:1].upper() + subject[1:]
    title = f"Why {subject} Matters"
    story = f"{subject} is shaping the world. Let's explore how it's transforming our lives and future."
    return title, story

def clip_truncate(prompt, max_tokens=77):
    """Cut ``prompt`` so that it fits a CLIP context of ``max_tokens`` positions.

    Args:
        prompt: Prompt text to shorten.
        max_tokens: Full sequence length of the text encoder, including the
            start and end markers the tokenizer adds when encoding.

    Returns:
        The prompt rebuilt from at most ``max_tokens - 2`` tokens. The text is
        always round-tripped through the tokenizer, so it comes back in the
        tokenizer's normalised form.
    """
    # Two positions are consumed by the BOS/EOS markers added at encode time.
    # Keeping 77 content tokens produced 79 ids and the "79 > 77" truncation
    # warning seen when the pipeline ran.
    content_budget = max(max_tokens - 2, 0)
    tokens = clip_tokenizer.tokenize(prompt)
    return clip_tokenizer.convert_tokens_to_string(tokens[:content_budget])

def clean_text_for_tts(text):
    """Strip markdown decoration and URLs from ``text`` and flatten it to one line.

    Removal order: runs of two or more asterisks, markdown links
    (``[label](target)``, label included), any remaining ``[...]`` span, bare
    http(s) URLs, and finally every leftover asterisk. The remaining non-empty
    lines are stripped and joined with single spaces.

    Args:
        text: Raw story text, possibly containing markdown.

    Returns:
        A single-line string suitable for speech synthesis.
    """
    # Applied in this exact order: links must go before bare brackets, otherwise
    # the "(target)" part of a link would be left behind.
    removal_patterns = (
        r'\*\*+',
        r'\[.*?\]\(.*?\)',
        r'\[.*?\]',
        r'https?://\S+',
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    text = text.replace("*", "")

    # Short and long lines all end up separated by one space, so the stripped,
    # non-empty lines can simply be joined in their original order.
    pieces = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if stripped:
            pieces.append(stripped)
    return " ".join(pieces)

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

 > tts_models/multilingual/multi-dataset/your_tts is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-

In [None]:
def convert_to_wav_if_needed(file_path):
    """Return a path to a WAV version of ``file_path``, converting it if necessary.

    Args:
        file_path: Path to an audio file. Files whose extension is already
            ``.wav`` (case-insensitive) are returned untouched.

    Returns:
        The original path for WAV input, the path of the newly written ``.wav``
        file (same location and base name) for other input, or ``None`` when
        anything goes wrong. Failures are reported on stdout, never raised.
    """
    try:
        root, ext = os.path.splitext(file_path)
        if ext.lower() == ".wav":
            return file_path
        # Build the target from splitext's root. rsplit(".", 1) would cut at a
        # dot inside a directory name when the file itself has no extension.
        wav_path = root + ".wav"
        audio = AudioSegment.from_file(file_path)
        audio.export(wav_path, format="wav")
        print(f"[🎵] Converted to WAV: {wav_path}")
        return wav_path
    except Exception as e:
        # Best effort by design: callers treat None as "no usable audio".
        print(f"[⚠️] Failed to convert audio file to WAV: {e}")
        return None

def generate_tts_coqui(text, out_path_wav, speaker_wav=None):
    """Synthesise English narration for ``text`` with the loaded Coqui model.

    Only the first 2000 characters of ``text`` are narrated; they are passed
    through ``clean_text_for_tts`` first.

    Args:
        text: Story text to speak.
        out_path_wav: Destination path of the generated WAV file.
        speaker_wav: Optional path to a reference recording whose voice should
            be imitated. It is converted to WAV when needed; if that fails the
            narration falls back to a built-in voice.

    Raises:
        RuntimeError: if synthesis fails for any reason (the original error is
            chained as ``__cause__``).
    """
    print("[🔊] Generating TTS using Coqui...")
    try:
        cleaned = clean_text_for_tts(text[:2000])
        if speaker_wav:
            speaker_wav = convert_to_wav_if_needed(speaker_wav)
            if not speaker_wav or not os.path.isfile(speaker_wav):
                print("[⚠️] Skipping custom voice due to invalid or failed conversion.")
                speaker_wav = None

        # Coqui's multi-speaker models reject a request that carries neither a
        # reference clip nor a speaker name, so pick the model's first built-in
        # speaker when no usable voice sample is available.
        speaker = None
        if not speaker_wav:
            available_speakers = getattr(tts_model, "speakers", None)
            if available_speakers:
                speaker = available_speakers[0]

        tts_model.tts_to_file(
            text=cleaned,
            speaker=speaker,
            speaker_wav=speaker_wav or None,
            language="en",
            file_path=out_path_wav,
        )
        print("[✅] TTS audio saved at:", out_path_wav)
    except Exception as e:
        raise RuntimeError(f"[❌] Coqui TTS generation failed: {e}") from e


In [None]:
def generate_story_llama3(topic, word_count=900):
    """Ask the Groq chat-completions API for a titled story about ``topic``.

    On success the text is also saved to ``post/story.txt``.

    Args:
        topic: Subject of the post.
        word_count: Approximate length requested from the model.

    Returns:
        Tuple ``(title, story)``. When the request fails, times out, returns an
        HTTP error, or the reply lacks the ``Title:`` / ``Story:`` markers, the
        result of ``fallback_story(topic)`` is returned instead.
    """
    prompt = f"You are a skilled LinkedIn copywriter creating a social media post (~{word_count} words) on the topic: '{topic}'.\n\nRespond in the following format only:\nTitle: <insert title>\nStory: <insert story here>"
    try:
        res = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            json={"model": "llama3-70b-8192", "messages": [{"role": "user", "content": prompt}]},
            headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
            # Without a timeout a stalled connection would hang the UI forever.
            timeout=120,
        )
        # Surface HTTP errors (bad key, rate limit) instead of a cryptic KeyError.
        res.raise_for_status()
        output = res.json()['choices'][0]['message']['content']
    except Exception as e:
        print(f"[Groq Error] {e}")
        return fallback_story(topic)

    if "Title:" not in output or "Story:" not in output:
        return fallback_story(topic)

    # maxsplit=1: keep everything after the FIRST marker, so a story that itself
    # contains the text "Story:" is not cut short.
    title = output.split("Title:", 1)[1].split("\n", 1)[0].strip()
    story = output.split("Story:", 1)[1].strip()

    try:
        os.makedirs("post", exist_ok=True)
        with open("post/story.txt", "w", encoding="utf-8") as f:
            f.write(f"Title: {title}\n\nStory: {story}")
    except OSError as e:
        # Saving a copy is a convenience; do not throw away a good story over it.
        print(f"[⚠️] Could not save post/story.txt: {e}")
    return title, story

In [None]:
def generate_image_prompt_from_topic(topic):
    """Produce a Stable Diffusion XL prompt for ``topic`` that fits CLIP's context.

    The prompt is requested from the Groq chat-completions API. If that fails
    for any reason, a fixed photorealistic template built from ``topic`` is
    used. Either way the text is cut to 75 CLIP tokens.

    Args:
        topic: Subject the image should depict.

    Returns:
        The (possibly truncated) prompt string, in the tokenizer's normalised form.
    """
    # CLIP's context is 77 positions including the BOS/EOS markers added at
    # encode time, which leaves 75 for the prompt itself. Cutting at 77 gave
    # 79 ids and the "79 > 77" truncation warning when the pipeline ran.
    max_prompt_tokens = 75

    def _fit_to_clip(text):
        tokens = clip_tokenizer.tokenize(text)
        return clip_tokenizer.convert_tokens_to_string(tokens[:max_prompt_tokens])

    prompt = f"""
You are an expert AI prompt engineer. Given the topic: '{topic}', create a highly detailed, visually rich prompt suitable for Stable Diffusion XL.

Ensure the prompt is vivid, descriptive, and highly specific with elements like lighting, background setting, mood, color tone, camera type, focus depth, and physical characteristics.
Use phrases like: ultra-detailed, cinematic, masterpiece, photorealistic, 8k, epic composition, trending on ArtStation.
Avoid generic or vague words. Output only the prompt.
"""
    try:
        res = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            json={
                "model": "llama3-70b-8192",
                "messages": [{"role": "user", "content": prompt}]
            },
            headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
            timeout=60,
        )
        res.raise_for_status()
        generated = res.json()['choices'][0]['message']['content'].strip()
        if not generated:
            raise ValueError("empty prompt returned by the model")
        return _fit_to_clip(generated)
    except Exception as e:
        # `except Exception` rather than a bare except, so Ctrl-C still works.
        print(f"[Groq Error] {e}")
        return _fit_to_clip(f"photorealistic image of {topic}, ultra sharp, 8k, cinematic lighting")


def ken_burns_zoom_with_captions(image_path, duration, top_text="", zoom=1.1):
    """Compose one 1080x1920 slide: a slowly enlarging image under a text caption.

    Args:
        image_path: Path of the image to show.
        duration: Length of the slide in seconds.
        top_text: Caption drawn near the top of the frame.
        zoom: Scale factor reached at the end of the slide (1.0 at the start).

    Returns:
        A ``CompositeVideoClip`` layering a black canvas, the scaled image and
        the caption, in that order.
    """
    def scale_at(t):
        # Exponential ramp: 1.0 at t=0, `zoom` at t=duration.
        return zoom ** (t / duration)

    canvas = ColorClip(size=(1080, 1920), color=(0, 0, 0), duration=duration)

    # The picture is first brought to 1440 px height, then scaled over time.
    # NOTE(review): it is anchored at x=0, so anything wider than the 1080 px
    # canvas is cut off on the right - confirm this is intended.
    picture = ImageClip(image_path).resize(height=1440).set_duration(duration)
    picture = picture.resize(scale_at).set_position((0, 240))

    title_overlay = TextClip(
        top_text,
        fontsize=48,
        color="white",
        font="DejaVu-Serif-Bold",
        method='caption',
        align='center',
        size=(1000, 200),
    )
    title_overlay = title_overlay.set_position(("center", 40)).set_duration(duration)

    return CompositeVideoClip([canvas, picture, title_overlay])

In [None]:
def process_pipeline(topic, output_type, word_count, voice_sample, bgm_file, bgm_volume):
    """Generate the post text and, depending on ``output_type``, images and a video.

    All artefacts are written below ``post/``. Image files use fixed names and
    are overwritten on every run; videos get a timestamped name.

    Args:
        topic: Subject of the post, forwarded to the story and prompt generators.
        output_type: "Text", "Image", "Video" or "Text Image Video". "Image"
            produces images; both video options produce images and a narrated
            video; anything else produces text only.
        word_count: Target story length passed to ``generate_story_llama3``.
        voice_sample: Optional path to a reference voice recording.
        bgm_file: Optional path to background music, looped under the narration.
        bgm_volume: Gain multiplier applied to the background music.

    Returns:
        Tuple ``(summary_text, image_paths, video_path)``; ``video_path`` is
        ``None`` when no video was rendered.

    Raises:
        RuntimeError: if a video was requested but no image could be generated,
            or if narration synthesis fails.
    """
    os.makedirs("post", exist_ok=True)
    title, story = generate_story_llama3(topic, word_count)
    prompt = generate_image_prompt_from_topic(topic)
    image_paths = []

    # "Video" used to generate the images and then silently skip rendering;
    # both video options now go through the same rendering path.
    wants_video = output_type in ("Video", "Text Image Video")

    if output_type == "Image" or wants_video:
        for i in range(5):
            try:
                # A different seed per image gives five distinct pictures.
                img = pipe(prompt=prompt, guidance_scale=8.0, generator=torch.manual_seed(i)).images[0]
                img = img.resize((1080, 1080))
                path = f"post/image_{i}.png"
                img.save(path)
                image_paths.append(path)
                print(f"[🖼️] Saved image: {path}")
                free_cuda()
            except Exception as img_err:
                # One failed image must not abort the remaining ones.
                print(f"[⚠️] Image {i} generation failed: {img_err}")

    out_path = None
    if wants_video:
        if not image_paths:
            raise RuntimeError("❌ No images were generated successfully. Cannot proceed with video.")

        if voice_sample:
            print("[📂] Uploaded voice sample path:", voice_sample)

        generate_tts_coqui(story, out_path_wav="post/audio.wav", speaker_wav=voice_sample if voice_sample else None)

        audio_clip = AudioFileClip("post/audio.wav")
        bg_music_clip = None
        try:
            # Every image is shown for an equal share of the narration.
            duration_per_image = audio_clip.duration / len(image_paths)
            clips = [ken_burns_zoom_with_captions(img, duration=duration_per_image, top_text=title)
                     for img in image_paths]

            mixed_audio = audio_clip
            if bgm_file:
                bgm_path = convert_to_wav_if_needed(bgm_file)
                if bgm_path and os.path.isfile(bgm_path):
                    bg_music_clip = AudioFileClip(bgm_path)
                    if bg_music_clip.duration and bg_music_clip.duration > 0:
                        # Repeat the track often enough to cover the narration, then trim.
                        loops_required = int(audio_clip.duration // bg_music_clip.duration) + 1
                        looped_music = concatenate_audioclips([bg_music_clip] * loops_required).subclip(0, audio_clip.duration)
                        looped_music = looped_music.volumex(bgm_volume)
                        mixed_audio = CompositeAudioClip([audio_clip, looped_music])
                    else:
                        # A zero-length track would divide by zero in the loop count.
                        print("[⚠️] Background music has no audible duration; continuing without it.")

            final_clip = concatenate_videoclips(clips).set_audio(mixed_audio)
            out_path = f"post/final_video_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}.mp4"
            final_clip.write_videofile(out_path, fps=24, codec="libx264", audio_codec="aac")
            print(f"[✅] Final video saved: {out_path}")
        finally:
            # Close the file-backed audio clips on every run, success or failure,
            # so their readers do not pile up across repeated UI requests.
            audio_clip.close()
            if bg_music_clip is not None:
                bg_music_clip.close()

    result = f"""📌 **Topic**: {topic}\n🎬 **Output Type**: {output_type}\n\n📝 **Title**: {title}\n📖 **Story**: {story}"""
    return result, image_paths, out_path


In [None]:
# === Gradio Custom CSS and Background ===
# Stylesheet handed to gr.Blocks(css=...) in run_gradio. It colours the "#title"
# heading and the buttons red, gives the listed component classes a translucent
# dark background, and sets a full-page background image.
# The background image is fetched from an external host (i.ibb.co) by the browser.
# NOTE(review): the `.gr-*` class selectors presumably match the installed Gradio
# version's markup - confirm, since the package is installed unpinned.
custom_css = """
#title h2 {
  color: #ff4b4b !important;
  font-size: 28px !important;
  font-weight: 800;
}
.gr-button {
  background-color: #ff4b4b !important;
  color: white !important;
  font-weight: bold !important;
  border-radius: 8px;
}
.gr-button:hover {
  background-color: #e43e3e !important;
}
.gr-box, .gr-textbox, .gr-dropdown, .gr-image, .gr-accordion, .gr-file {
  background-color: rgba(0, 0, 0, 0.6) !important;
  border-radius: 12px;
  color: white !important;
}
html, body, .gradio-container {
  background-image: url('https://i.ibb.co/8WG4fZS/vibrant-colors-flow-abstract-wave-pattern-generated-by-ai.jpg');
  background-size: cover;
  background-position: center;
  background-attachment: fixed;
  font-family: 'Segoe UI', sans-serif;
  color: white;
}
"""


# === Gradio Interface ===
def run_gradio():
    """Build the Gradio UI for the post generator and launch it.

    The form collects the topic, output type, word count, an optional voice
    sample, optional background music and its volume, and wires the button to
    ``process_pipeline``. The summary text, image gallery and video file
    returned by the pipeline are shown in the right-hand column.

    The app is launched with ``debug=True`` (the call blocks and streams logs)
    and ``share=True`` (a public gradio.live URL is created).
    """
    accepted_audio = [".wav", ".mp3", ".m4a", ".ogg"]

    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown("## 🎬 AI Social Media Post Generator (Open Source Voice Clone)", elem_id="title")
        with gr.Row():
            with gr.Column():
                topic = gr.Textbox(label="📌 Topic", placeholder="e.g., AI in Indian Education")
                # Explicit default: with nothing selected the pipeline received None
                # and produced text only, so "Text" makes that behaviour visible.
                output_type = gr.Radio(choices=["Text", "Image", "Video", "Text Image Video"], value="Text", label="🎞️ Output Type")
                word_count = gr.Slider(minimum=500, maximum=1500, value=900, step=100, label="✍️ Word Count")
                # The label now lists every format the widget actually accepts.
                voice_sample = gr.File(label="🗣 Upload Voice Sample (.wav, .mp3, .m4a or .ogg)", type="filepath", file_types=accepted_audio)
                bgm_file = gr.File(label="🎵 Upload Background Music (Optional)", type="filepath", file_types=accepted_audio)
                bgm_volume = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.05, label="🔊 BGM Volume", interactive=True)
                generate_btn = gr.Button("🚀 Generate Post")
            with gr.Column():
                image_preview = gr.Gallery(label="🖼️ Preview", columns=3, rows=2, object_fit="contain", height=400)
                result = gr.Textbox(label="📜 Generated Summary", lines=14)
                download = gr.File(label="⬇️ Download Video")

        generate_btn.click(fn=process_pipeline,
                           inputs=[topic, output_type, word_count, voice_sample, bgm_file, bgm_volume],
                           outputs=[result, image_preview, download])

    demo.launch(debug=True, share=True)

# In the notebook kernel __name__ is "__main__", so executing this cell starts
# the UI; the guard keeps an import of this code from launching it.
if __name__ == "__main__":
    run_gradio()


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://83ed03720c257de462.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Token indices sequence length is longer than the specified maximum sequence length for this model (79 > 77). Running this sequence through the model will result in indexing errors
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['y skin']
Token indices sequence length is longer than the specified maximum sequence length for this model (79 > 77). Running this sequence through the model will result in indexing errors
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['y skin']


  0%|          | 0/50 [00:00<?, ?it/s]

[🖼️] Saved image: post/image_0.png


The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['y skin']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['y skin']


  0%|          | 0/50 [00:00<?, ?it/s]

[🖼️] Saved image: post/image_1.png


The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['y skin']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['y skin']


  0%|          | 0/50 [00:00<?, ?it/s]

[🖼️] Saved image: post/image_2.png


The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['y skin']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['y skin']


  0%|          | 0/50 [00:00<?, ?it/s]

[🖼️] Saved image: post/image_3.png


The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['y skin']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['y skin']


  0%|          | 0/50 [00:00<?, ?it/s]

[🖼️] Saved image: post/image_4.png
[📂] Uploaded voice sample path: /tmp/gradio/1bdf60ebf5932a8688326b4b19f27980080898f201d434c80f64ae463602c2ae/AUDIO_CONTENT_GENRATOR.wav
[🔊] Generating TTS using Coqui...
 > Text splitted to sentences.
['As I slithered through the dense forests of Western Ghats, the rustling of leaves beneath my feet was a gentle reminder of the ancient secrets hidden within.', 'The thrill of adventure and the whisper of mystery had led me to this enchanted land, where the revered and the reviled coexisted in harmony.', 'I was on a quest to unravel the mystique of Snakes in India – creatures that have fascinated and frightened humans for centuries.', 'India, a land of incredible biodiversity, is home to over 300 species of snakes, each with its unique characteristics, habits, and habitats.', "From the majestic Cobras and majestic Kraits to the venomous Saw-scaled Vipers and Russell's Vipers, the diversity is staggering.", 'Yet, despite their importance in the ecosystem



MoviePy - Done.
Moviepy - Writing video post/final_video_20250714084134.mp4





Moviepy - Done !
Moviepy - video ready post/final_video_20250714084134.mp4
[✅] Final video saved: post/final_video_20250714084134.mp4


The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['0 -']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['0 -']


  0%|          | 0/50 [00:00<?, ?it/s]

[🖼️] Saved image: post/image_0.png


The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['0 -']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['0 -']


  0%|          | 0/50 [00:00<?, ?it/s]

[🖼️] Saved image: post/image_1.png


The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['0 -']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['0 -']


  0%|          | 0/50 [00:00<?, ?it/s]

[🖼️] Saved image: post/image_2.png


The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['0 -']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['0 -']


  0%|          | 0/50 [00:00<?, ?it/s]

[🖼️] Saved image: post/image_3.png


The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['0 -']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['0 -']


  0%|          | 0/50 [00:00<?, ?it/s]

[🖼️] Saved image: post/image_4.png
