In [1]:
!pip install moviepy openai-whisper torch numpy

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting triton>=2.0.0 (from openai-whisper)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.5/209.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K 

Transcription generator


In [2]:
import moviepy.editor as mp
import whisper
import json
from pathlib import Path
import torch
import numpy as np

class VideoTranscriber:
    """Extract the audio track of a video and transcribe it with Whisper.

    The transcription is returned as a plain dictionary of segments, each
    carrying its text, start/end times and a list of per-word timestamps.
    """

    def __init__(self, model_size="base"):
        """
        Initialize the transcriber with specified Whisper model size.
        model_size options: "tiny", "base", "small", "medium", "large"
        """
        self.model = whisper.load_model(model_size)

    def extract_audio(self, video_path, output_path=None):
        """
        Extract audio from video file and save it as WAV.

        Args:
            video_path: Path to the input video.
            output_path: Where to write the WAV file. Defaults to the video
                path with its suffix replaced by ``.wav``.

        Returns:
            The path to the extracted audio file, as a string.

        Raises:
            ValueError: If the video has no audio track.
        """
        if output_path is None:
            output_path = Path(video_path).with_suffix('.wav')

        video = mp.VideoFileClip(str(video_path))
        try:
            # Fail with a clear message instead of an AttributeError on None.
            if video.audio is None:
                raise ValueError("Video has no audio track")
            video.audio.write_audiofile(str(output_path))
        finally:
            # Release the clip's file handles even when writing fails.
            video.close()

        return str(output_path)

    def transcribe_audio(self, audio_path, language="en"):
        """
        Transcribe audio file with word-level timestamps.

        Args:
            audio_path: Path to the audio file handed to the Whisper model.
            language: Language code passed to Whisper (defaults to "en").

        Returns:
            A dictionary ``{"segments": [...]}`` where each segment has
            "start", "end", "text" and "words"; each word has "word",
            "start", "end" and "confidence".
        """
        result = self.model.transcribe(
            audio_path,
            language=language,
            word_timestamps=True,
            verbose=False
        )

        transcription_data = {"segments": []}

        for segment in result["segments"]:
            words = []
            # Segments without word-level data simply get an empty list.
            for word in segment.get("words", []):
                # Whisper reports the per-word score under "probability";
                # "confidence" is kept as a fallback so pre-shaped input
                # still works, and 1.0 is used when neither key is present.
                confidence = word.get("probability", word.get("confidence", 1.0))
                words.append({
                    "word": word["word"].strip(),
                    "start": word["start"],
                    "end": word["end"],
                    "confidence": confidence
                })

            transcription_data["segments"].append({
                "start": segment["start"],
                "end": segment["end"],
                "text": segment["text"].strip(),
                "words": words
            })

        return transcription_data

    def process_video(self, video_path, output_json=None):
        """
        Process video file: extract audio and transcribe.
        Saves transcription to JSON file if output_json is specified.

        The intermediate WAV is written into a temporary directory so that
        an existing ``<video>.wav`` next to the input is never overwritten
        or deleted, and so the file is removed even if transcription fails.

        Returns:
            The transcription dictionary produced by ``transcribe_audio``.
        """
        import tempfile  # local import: only this method needs it

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_audio = str(Path(temp_dir) / (Path(video_path).stem + ".wav"))

            print("Extracting audio from video...")
            audio_path = self.extract_audio(video_path, temp_audio)

            print("Transcribing audio...")
            transcription = self.transcribe_audio(audio_path)

        # Save to JSON if output path specified
        if output_json:
            with open(output_json, 'w', encoding='utf-8') as f:
                json.dump(transcription, f, indent=2, ensure_ascii=False)
            print(f"Transcription saved to {output_json}")

        return transcription

def main():
    """Demo: transcribe one video, save the JSON, and preview the result."""
    max_segments = 2   # how many segments to preview
    max_words = 3      # how many words to preview per segment

    # Build a transcriber on the "base" model and run the full pipeline.
    transcription = VideoTranscriber(model_size="base").process_video(
        "/content/audio2.mp4",
        "transcription.json",
    )

    print("\nFirst few segments of transcription:")
    for segment in transcription["segments"][:max_segments]:
        print(f"\nTimestamp: {segment['start']:.2f}s - {segment['end']:.2f}s")
        print(f"Text: {segment['text']}")

        # Nothing more to show for segments without word-level data.
        if not segment['words']:
            continue

        print("Words with timestamps:")
        for word in segment['words'][:max_words]:
            print(f"  {word['word']}: {word['start']:.2f}s - {word['end']:.2f}s")


if __name__ == "__main__":
    main()

  if event.key is 'enter':

100%|███████████████████████████████████████| 139M/139M [00:02<00:00, 58.1MiB/s]
  checkpoint = torch.load(fp, map_location=device)



Extracting audio from video...
MoviePy - Writing audio in /content/audio2.wav




MoviePy - Done.
Transcribing audio...


100%|██████████| 12446/12446 [00:11<00:00, 1089.23frames/s]

Transcription saved to transcription.json

First few segments of transcription:

Timestamp: 0.00s - 4.26s
Text: It really means to fuck, but normally we use it as the exclamation
Words with timestamps:
  It: 0.00s - 0.14s
  really: 0.14s - 0.36s
  means: 0.36s - 0.72s

Timestamp: 4.26s - 6.00s
Text: FAC! Joder!
Words with timestamps:
  FAC!: 4.26s - 4.98s
  Joder!: 5.74s - 6.00s





In [2]:
!pip install better-profanity

Collecting better-profanity
  Downloading better_profanity-0.7.0-py3-none-any.whl.metadata (7.1 kB)
Downloading better_profanity-0.7.0-py3-none-any.whl (46 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.1/46.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: better-profanity
Successfully installed better-profanity-0.7.0


Profanity Detector


In [5]:
import json
from better_profanity import profanity
from pathlib import Path

class ProfanityAnalyzer:
    """Find profane words in a saved transcription and report their timings."""

    def __init__(self):
        """
        Load the better-profanity word list used by the checks below.
        """
        profanity.load_censor_words()

    def analyze_transcription(self, json_path):
        """
        Scan a transcription JSON file word by word for profanity.

        Segments without a "words" entry are skipped. Each hit keeps the
        word exactly as stored, its start/end times, and its confidence
        (1.0 when the word has no "confidence" key).

        Returns the hits as a list ordered by start time.
        """
        with open(json_path, 'r', encoding='utf-8') as handle:
            data = json.load(handle)

        hits = [
            {
                "word": entry["word"],  # original spelling and case
                "start_time": entry["start"],
                "end_time": entry["end"],
                "confidence": entry.get("confidence", 1.0),
            }
            for seg in data["segments"]
            if "words" in seg
            for entry in seg["words"]
            # The check itself runs on a trimmed, lower-cased copy.
            if profanity.contains_profanity(entry["word"].strip().lower())
        ]

        return sorted(hits, key=lambda hit: hit["start_time"])

    def save_report(self, profanity_instances, output_path):
        """
        Write the hit count and the hits themselves to a JSON file.
        """
        payload = json.dumps(
            {
                "total_instances": len(profanity_instances),
                "profane_words": profanity_instances,
            },
            indent=2,
            ensure_ascii=False,
        )

        with open(output_path, 'w', encoding='utf-8') as handle:
            handle.write(payload)

    def print_timeline(self, profanity_instances):
        """
        Print one line per hit showing the word and its time span.
        """
        print("\nProfanity Timeline:")
        for instance in profanity_instances:
            word_part = f"Word: {instance['word']} | "
            time_part = f"Time: {instance['start_time']:.2f}s - {instance['end_time']:.2f}s"
            print(word_part + time_part)

def main():
    """Demo: analyze a saved transcription, save the report, print a timeline."""
    output_path = "profanity_timestamps.json"
    analyzer = ProfanityAnalyzer()

    # Scan the transcription written by the transcriber cell.
    print("Analyzing transcription for profanity...")
    profanity_instances = analyzer.analyze_transcription("transcription.json")

    # Persist the detailed report.
    analyzer.save_report(profanity_instances, output_path)
    print(f"\nProfanity timestamps saved to {output_path}")

    # Show the timeline followed by the total count.
    analyzer.print_timeline(profanity_instances)
    total_found = len(profanity_instances)
    print(f"\nTotal profane words found: {total_found}")


if __name__ == "__main__":
    main()

Analyzing transcription for profanity...

Profanity timestamps saved to profanity_timestamps.json

Profanity Timeline:
Word: fuck, | Time: 1.28s - 1.62s
Word: Damn, | Time: 10.04s - 10.74s
Word: shit. | Time: 45.76s - 46.40s
Word: hell. | Time: 63.08s - 63.42s
Word: shit. | Time: 68.86s - 69.26s
Word: Damn, | Time: 81.42s - 81.58s
Word: pussy. | Time: 86.84s - 87.84s
Word: jerk, | Time: 114.24s - 114.40s
Word: asshole. | Time: 114.94s - 115.56s
Word: pissed | Time: 121.82s - 122.14s

Total profane words found: 10


Final Code


In [3]:
!pip install moviepy openai-whisper better-profanity torch numpy



In [4]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [6]:
import moviepy.editor as mp
import whisper
import json
from pathlib import Path
import torch
from better_profanity import profanity
from typing import List, Dict
from pydub import AudioSegment
import tempfile
import os

class VideoProfanityMuter:
    """Transcribe a video, find profane words, and write a copy with them muted.

    The pipeline is: extract audio -> Whisper transcription with word
    timestamps -> per-word profanity check -> replace the matching audio
    spans with silence -> re-attach the audio to the video.
    """

    # Extra silence (in milliseconds) applied before and after each detected
    # word, to cover small inaccuracies in the word timestamps.
    MUTE_PADDING_MS = 100

    def __init__(self, model_size="base"):
        """
        Initialize the system with Whisper model and profanity checker
        """
        self.model = whisper.load_model(model_size)
        profanity.load_censor_words()

    def process_video(self, input_video_path: str, output_video_path: str = None):
        """
        Main processing pipeline

        Args:
            input_video_path: Path of the video to clean.
            output_video_path: Destination path. Defaults to the input path
                with "_clean" appended to the file stem.

        Returns:
            The output path, or the unchanged input path when no profanity
            was detected (in which case no new video is written).
        """
        if output_video_path is None:
            source = Path(input_video_path)
            output_video_path = str(source.with_stem(source.stem + "_clean"))

        print("Step 1: Transcribing video...")
        transcription = self._transcribe_video(input_video_path)

        print("Step 2: Detecting profanity...")
        profanity_instances = self._detect_profanity(transcription)

        if not profanity_instances:
            print("No profanity detected in the video.")
            return input_video_path

        print("Step 3: Muting profane sections...")
        self._create_clean_video(input_video_path, output_video_path, profanity_instances)

        print(f"\nProcessing complete! Clean video saved to: {output_video_path}")
        return output_video_path

    def _transcribe_video(self, video_path: str) -> dict:
        """
        Extract audio and transcribe with word-level timestamps

        The raw Whisper result is also saved as JSON next to the video
        (same path with a ``.json`` suffix).

        Raises:
            ValueError: If the video has no audio track.
        """
        try:
            # Intermediate audio lives in a temporary directory.
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_audio_path = os.path.join(temp_dir, "temp_audio.wav")

                video = mp.VideoFileClip(video_path)
                try:
                    if video.audio is None:
                        raise ValueError("Video has no audio track")
                    video.audio.write_audiofile(temp_audio_path, verbose=False, logger=None)
                finally:
                    # Close the clip on every path, including the error ones.
                    video.close()

                result = self.model.transcribe(
                    temp_audio_path,
                    language="en",
                    word_timestamps=True,
                    verbose=False
                )

            # Save transcription to JSON
            transcription_path = str(Path(video_path).with_suffix('.json'))
            with open(transcription_path, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2)

            return result

        except Exception as e:
            print(f"Error during transcription: {str(e)}")
            raise

    def _detect_profanity(self, transcription: dict) -> List[Dict]:
        """
        Detect profanity in transcription with exact timestamps

        Args:
            transcription: A transcription dictionary with a "segments"
                list, or a path to a JSON file containing one.

        Returns:
            A list of ``{"word", "start_time", "end_time"}`` dictionaries,
            with times in integer milliseconds, sorted by start time.
        """
        if isinstance(transcription, str):
            with open(transcription, 'r', encoding='utf-8') as f:
                transcription = json.load(f)

        profanity_instances = []
        for segment in transcription["segments"]:
            # Segments without word-level data contribute nothing.
            for word in segment.get("words", []):
                word_text = word["word"].strip().lower()
                if profanity.contains_profanity(word_text):
                    profanity_instances.append({
                        "word": word["word"],
                        # Round rather than truncate so floating-point error
                        # cannot shave a millisecond off.
                        "start_time": int(round(word["start"] * 1000)),
                        "end_time": int(round(word["end"] * 1000))
                    })

        return sorted(profanity_instances, key=lambda x: x["start_time"])

    @staticmethod
    def _merge_mute_intervals(profanity_instances, audio_length_ms, padding_ms):
        """Pad, clamp and merge word spans into non-overlapping mute windows.

        Each instance is widened by ``padding_ms`` on both sides and clamped
        to ``[0, audio_length_ms]``. Windows that end up empty are dropped
        and windows that touch or overlap are merged.

        Returns:
            A list of ``(start_ms, end_ms)`` tuples in ascending order.
        """
        windows = []
        for instance in sorted(profanity_instances, key=lambda x: x["start_time"]):
            start = max(0, instance["start_time"] - padding_ms)
            end = min(audio_length_ms, instance["end_time"] + padding_ms)
            if end <= start:
                # Entirely outside the audio, or a zero-length span.
                continue
            if windows and start <= windows[-1][1]:
                windows[-1][1] = max(windows[-1][1], end)
            else:
                windows.append([start, end])
        return [(start, end) for start, end in windows]

    def _create_clean_video(self, input_path: str, output_path: str, profanity_instances: List[Dict]):
        """
        Create new video with muted profane sections using pydub

        Args:
            input_path: Path of the source video.
            output_path: Path the cleaned video is written to.
            profanity_instances: Items with "start_time" and "end_time" in
                milliseconds, as produced by ``_detect_profanity``.

        Raises:
            ValueError: If the video has no audio track.
        """
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                video = mp.VideoFileClip(input_path)
                processed_audio = None
                final_video = None
                try:
                    if video.audio is None:
                        raise ValueError("Video has no audio track")

                    # Extract audio using moviepy
                    temp_audio_path = os.path.join(temp_dir, "temp_audio.wav")
                    video.audio.write_audiofile(temp_audio_path, verbose=False, logger=None)

                    # Load audio with pydub
                    audio = AudioSegment.from_wav(temp_audio_path)

                    mute_windows = self._merge_mute_intervals(
                        profanity_instances, len(audio), self.MUTE_PADDING_MS
                    )
                    for start_time, end_time in mute_windows:
                        # Create the silence at the track's own frame rate
                        # instead of pydub's default.
                        mute_segment = AudioSegment.silent(
                            duration=end_time - start_time,
                            frame_rate=audio.frame_rate
                        )
                        # Replace the segment with silence
                        audio = audio[:start_time] + mute_segment + audio[end_time:]

                    # Export processed audio
                    temp_processed_audio = os.path.join(temp_dir, "processed_audio.wav")
                    audio.export(temp_processed_audio, format="wav")

                    # Load processed audio back into video
                    processed_audio = mp.AudioFileClip(temp_processed_audio)
                    final_video = video.set_audio(processed_audio)

                    # Write final video
                    final_video.write_videofile(
                        output_path,
                        codec='libx264',
                        audio_codec='aac',
                        temp_audiofile=os.path.join(temp_dir, 'temp-final-audio.m4a'),
                        remove_temp=True,
                        verbose=False,
                        logger=None
                    )
                finally:
                    # Close every clip that was opened, on success or failure,
                    # before the temporary directory is removed.
                    for clip in (video, final_video, processed_audio):
                        if clip is not None:
                            clip.close()

        except Exception as e:
            print(f"Error during video processing: {str(e)}")
            raise

def main():
    """Demo: mute the profanity in one video, reporting any failure."""
    input_video = "/content/video_1.mp4"
    output_video = "/content/video_1_clean.mp4"

    try:
        VideoProfanityMuter(model_size="base").process_video(input_video, output_video)
    except Exception as e:
        # Errors are printed rather than propagated out of the demo.
        print(f"Error in main: {str(e)}")


if __name__ == "__main__":
    main()

  checkpoint = torch.load(fp, map_location=device)




Step 1: Transcribing video...


100%|██████████| 1054/1054 [00:29<00:00, 35.71frames/s]

Step 2: Detecting profanity...
No profanity detected in the video.



