# In [18]:
import re
import os
from fuzzywuzzy import fuzz
from nltk.tokenize import sent_tokenize, word_tokenize


def read_file(file_path, encoding="utf-8"):
    """Return the full text content of the file at ``file_path``.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale.

    Returns:
        The file content as a single string.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        return f.read()

def write_file(file_path, content, encoding="utf-8"):
    """Write ``content`` to ``file_path``, replacing any existing file.

    Args:
        file_path: Path of the file to create or overwrite.
        content: Text to write.
        encoding: Text encoding used to encode the file. Defaults to UTF-8
            so the output does not depend on the platform's locale.
    """
    with open(file_path, 'w', encoding=encoding) as f:
        f.write(content)


def find_tiktok_segments(youtube_transcript, tiktok_transcripts, threshold=90, step=1):
    """Locate each TikTok transcript inside the YouTube transcript.

    For every TikTok transcript, a window of the same length is moved over
    the YouTube transcript and scored against the TikTok text with
    ``fuzz.ratio``. The first window that reaches the highest score is kept
    if that score is at least ``threshold``.

    Args:
        youtube_transcript: Full transcript text to search in.
        tiktok_transcripts: Iterable of transcript strings to search for.
        threshold: Minimum similarity score for a match to be accepted.
        step: Number of characters the window advances between comparisons.
            The default of 1 checks every offset; larger values trade
            accuracy for speed on long transcripts.

    Returns:
        A list of ``(start, end)`` character offsets into
        ``youtube_transcript``, one per matched TikTok transcript, in the
        order the transcripts were given (not sorted by position).

    Raises:
        ValueError: If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")

    tiktok_positions = []
    for tiktok in tiktok_transcripts:
        window = len(tiktok)
        if window == 0:
            # Nothing to search for; an empty window carries no content and
            # would only yield a degenerate (0, 0) position.
            print(f"Did not find TikTok segment: {tiktok[:30]}... in YouTube transcript.")
            continue

        max_similarity = 0
        best_match = (0, 0)

        # Fast path: a verbatim occurrence is the best possible match, so the
        # expensive fuzzy scan can be skipped entirely.
        exact_start = youtube_transcript.find(tiktok)
        if exact_start != -1:
            max_similarity = 100
            best_match = (exact_start, exact_start + window)
        else:
            for i in range(0, len(youtube_transcript) - window + 1, step):
                similarity = fuzz.ratio(youtube_transcript[i:i + window], tiktok)
                if similarity > max_similarity:
                    max_similarity = similarity
                    best_match = (i, i + window)
                    if max_similarity >= 100:
                        # Scores are capped at 100, so no later window can
                        # replace this one.
                        break

        if max_similarity >= threshold:
            tiktok_positions.append(best_match)
            print(f"Found TikTok segment at positions {best_match} with similarity {max_similarity}")
        else:
            print(f"Did not find TikTok segment: {tiktok[:30]}... in YouTube transcript.")
    return tiktok_positions


def custom_segment_transcript(youtube_transcript, tiktok_positions):
    """Split the transcript into consecutive pieces around the given spans.

    The returned pieces are in transcript order and, joined together,
    reproduce the part of the transcript they cover without repeating text.

    Args:
        youtube_transcript: Full transcript text.
        tiktok_positions: Iterable of ``(start, end)`` character offsets.
            They may be given in any order and may overlap.

    Returns:
        A list of strings: the text of each span, interleaved with the text
        lying between spans (and before the first / after the last one).
    """
    segments = []
    last_end = 0
    # Positions arrive in the order the TikTok files were processed, which
    # is not necessarily transcript order, so sort before walking them.
    for start, end in sorted(tiktok_positions):
        if start > last_end:
            segments.append(youtube_transcript[last_end:start])
        # Clamp a span that overlaps the previous one so no text is emitted
        # twice.
        start = max(start, last_end)
        if end <= start:
            # Empty span, or one fully covered by an earlier span.
            continue
        segments.append(youtube_transcript[start:end])
        last_end = end
    if last_end < len(youtube_transcript):
        segments.append(youtube_transcript[last_end:])
    return segments

def label_segments(segments, tiktok_positions):
    """Render the segments as one labeled text block.

    Each segment's character range is derived by accumulating segment
    lengths from offset 0. A segment whose range equals one of the
    ``(start, end)`` pairs in ``tiktok_positions`` is tagged
    "TikTok-worthy"; every other segment is tagged "Not TikTok-worthy".

    Args:
        segments: Strings in transcript order.
        tiktok_positions: ``(start, end)`` character offsets of matches.

    Returns:
        A single string with one "<label>:" line followed by the segment
        text for every segment.
    """
    pieces = []
    offset = 0
    for text in segments:
        stop = offset + len(text)
        is_tiktok = any(
            offset == start and stop == end
            for start, end in tiktok_positions
        )
        tag = "TikTok-worthy" if is_tiktok else "Not TikTok-worthy"
        pieces.append(f"{tag}:\n{text}\n")
        offset = stop
    return "".join(pieces)

# File paths
youtube_transcript_path = "/Users/rossjackson/Documents/VideoAnalysisProject/New_Project_Directory/TikTok:YT_Audios/#37/#37 The Mission _ Suicide letter, Angry Jamaican man and a Dagger up the A_se-CD6TlG1s4yk_55716-10-18_Powered by notta.ai.txt"
tiktok_paths = [
    "/Users/rossjackson/Documents/VideoAnalysisProject/New_Project_Directory/TikTok:YT_Audios/#37/yt_37_1_55725-06-13_Powered by notta.ai.txt",
    "/Users/rossjackson/Documents/VideoAnalysisProject/New_Project_Directory/TikTok:YT_Audios/#37/yt_37_2_55725-06-13_Powered by notta.ai.txt",
    "/Users/rossjackson/Documents/VideoAnalysisProject/New_Project_Directory/TikTok:YT_Audios/#37/yt_37_3_55725-06-13_Powered by notta.ai.txt",
    "/Users/rossjackson/Documents/VideoAnalysisProject/New_Project_Directory/TikTok:YT_Audios/#37/yt_37_4_55725-06-13_Powered by notta.ai.txt"
]

# Read files
youtube_transcript = read_file(youtube_transcript_path)
tiktok_transcripts = [read_file(path) for path in tiktok_paths]

# Find, segment, and label
tiktok_positions = find_tiktok_segments(youtube_transcript, tiktok_transcripts)
segments = custom_segment_transcript(youtube_transcript, tiktok_positions)
labeled_segments = label_segments(segments, tiktok_positions)

# Write to file: the output sits next to the YouTube transcript, with
# "_labeled" inserted before the extension.
output_path = os.path.splitext(youtube_transcript_path)[0] + "_labeled.txt"
# Reuse the labeling computed above rather than running label_segments a
# second time on the same inputs.
labeled_content = labeled_segments
write_file(output_path, labeled_content)

print(f"Labeled segments have been written to {output_path}")


# KeyboardInterrupt: the run of the cell above was interrupted before it finished.