In [41]:
import re
import os
from fuzzywuzzy import fuzz
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def read_file(file_path):
    with open(file_path, 'r') as f:
        return f.read()

def write_file(file_path, content):
    with open(file_path, 'w') as f:
        f.write(content)

def find_tiktok_segments(youtube_transcript, tiktok_transcripts, threshold=0.65):
    tiktok_positions = []
    youtube_sentences = sent_tokenize(youtube_transcript)
    
    vectorizer = TfidfVectorizer()
    
    for tiktok in tiktok_transcripts:
        tiktok_sentences = sent_tokenize(tiktok)
        
        for i in range(len(youtube_sentences) - len(tiktok_sentences) + 1):
            segment = youtube_sentences[i:i + len(tiktok_sentences)]
            segment_str = ' '.join(segment)
            
            tfidf_matrix = vectorizer.fit_transform([segment_str, tiktok])
            similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
            
            if similarity >= threshold:
                start_pos = sum(len(s) for s in youtube_sentences[:i]) + i
                end_pos = start_pos + len(tiktok)
                tiktok_positions.append((start_pos, end_pos))
                break
                
    return tiktok_positions

def segment_and_label_transcript(youtube_transcript, tiktok_positions):
    tiktok_positions.sort()
    segments = []
    labels = []
    last_end = 0
    
    for start, end in tiktok_positions:
        if start > last_end:
            segments.append(youtube_transcript[last_end:start])
            labels.append("Non Tiktok - 0")
        
        segments.append(youtube_transcript[start:end])
        labels.append("Tiktok - 1")
        
        last_end = end
    
    if last_end < len(youtube_transcript):
        segments.append(youtube_transcript[last_end:])
        labels.append("Non Tiktok - 0")
    
    labeled_segments = [f"{label}:\n{segment}\n" for label, segment in zip(labels, segments)]
    
    return labeled_segments

# File paths
youtube_transcript_path = "/Users/rossjackson/Documents/VideoAnalysisProject/New_Project_Directory/TikTok:YT_Audios/#51/#51 Would you lie for fame_55716-10-08_Powered by notta.ai.txt"
tiktok_paths = [
    "/Users/rossjackson/Documents/VideoAnalysisProject/New_Project_Directory/TikTok:YT_Audios/#51/yt_51_1_55725-06-17_Powered by notta.ai.txt",
    "/Users/rossjackson/Documents/VideoAnalysisProject/New_Project_Directory/TikTok:YT_Audios/#51/yt_51_2_55725-06-17_Powered by notta.ai.txt",
    "/Users/rossjackson/Documents/VideoAnalysisProject/New_Project_Directory/TikTok:YT_Audios/#51/yt_51_3_55725-06-17_Powered by notta.ai.txt",
    "/Users/rossjackson/Documents/VideoAnalysisProject/New_Project_Directory/TikTok:YT_Audios/#51/yt_51_4_55725-06-17_Powered by notta.ai.txt"
]

# Read files
youtube_transcript = read_file(youtube_transcript_path)
tiktok_transcripts = read_file(tiktok_transcripts_path).split('\n')

# Find TikTok segments
tiktok_positions = find_tiktok_segments(youtube_transcript, tiktok_transcripts)

# Segment and label the YouTube transcript
labeled_segments = segment_and_label_transcript(youtube_transcript, tiktok_positions)

# Write to file
output_path = os.path.splitext(youtube_transcript_path)[0] + "_labeled.txt"
write_file(output_path, '\n'.join(labeled_segments))

print(f"Labeled segments have been written to {output_path}")


FileNotFoundError: [Errno 2] No such file or directory: '/path/to/your/tiktok_transcripts.txt'