In [5]:
import re
import math
from nltk.tokenize import sent_tokenize, word_tokenize
from fuzzywuzzy import fuzz
import spacy

# Load spaCy's "en_core_web_sm" pipeline once, at module import time.
# NOTE(review): `nlp` is not referenced by any code visible in this cell —
# confirm it is used elsewhere before paying the load cost here.
nlp = spacy.load("en_core_web_sm")

# Function to read transcript from a file
def read_transcript(file_path):
    """Return the full text content of the transcript at *file_path*.

    The file is decoded as UTF-8 explicitly. Without an explicit encoding,
    ``open`` falls back to the platform's locale encoding, which can raise
    ``UnicodeDecodeError`` or silently garble non-ASCII characters on systems
    whose default is not UTF-8.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The entire file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

# Function to find the first clip sentence that fuzzily matches a segment sentence
def check_match(segment, clip, min_words=8, fuzz_ratio=90):
    """Look for a clip sentence that closely matches any sentence of a segment.

    Clip sentences are examined in order. A clip sentence is only considered
    when ``word_tokenize`` yields at least ``min_words`` tokens for it; it is
    then compared with each segment sentence via ``fuzz.ratio``. The search
    stops at the first comparison scoring ``fuzz_ratio`` or higher.

    NOTE(review): the threshold counts tokenizer tokens, not whitespace words;
    presumably punctuation tokens count too — confirm ``min_words`` is tuned
    with that in mind.

    Args:
        segment: Iterable of sentences from the long transcript.
        clip: Iterable of sentences from a single clip transcript.
        min_words: Minimum token count a clip sentence needs to be compared.
        fuzz_ratio: Minimum ``fuzz.ratio`` score that counts as a match.

    Returns:
        ``(True, [clip_sentence])`` for the first match found, otherwise
        ``(False, [])``.
    """
    for candidate in clip:
        # Skip clip sentences that are too short to be a meaningful match.
        if len(word_tokenize(candidate)) < min_words:
            continue
        if any(fuzz.ratio(candidate, reference) >= fuzz_ratio for reference in segment):
            return True, [candidate]
    return False, []

# Function to calculate segment_length based on tokens
def calculate_segment_length(sentences, min_tokens=300, max_tokens=500, tokenize=None):
    """Return how many leading sentences make up one token-sized segment.

    Token counts are accumulated sentence by sentence. As soon as the running
    total reaches ``min_tokens`` the search stops:

    * if the total is still within ``max_tokens``, the sentence that reached
      the lower bound is included;
    * if that sentence overshot ``max_tokens``, it is left out so the segment
      stays under the cap — unless it is the very first sentence, in which
      case a one-sentence segment is returned.

    Previously an overshoot meant the window test could never succeed again
    (the total only grows), so the whole transcript length was returned.

    Args:
        sentences: Sequence of sentence strings.
        min_tokens: Lower bound of the target token window.
        max_tokens: Upper bound of the target token window.
        tokenize: Callable mapping a sentence to a sequence of tokens.
            Defaults to the module's ``word_tokenize``.

    Returns:
        The number of sentences per segment, or ``len(sentences)`` when the
        whole input holds fewer than ``min_tokens`` tokens (0 for empty input).
    """
    if tokenize is None:
        tokenize = word_tokenize

    token_count = 0
    for i, sentence in enumerate(sentences):
        token_count += len(tokenize(sentence))
        if token_count < min_tokens:
            continue
        if token_count <= max_tokens or i == 0:
            return i + 1
        # This sentence jumped past the cap: end the segment just before it.
        return i
    return len(sentences)

# Function to label segments with dynamic length and overlap
def label_segments(youtube_file, tiktok_files, overlap=5):
    """Split a long transcript into overlapping segments and label each one.

    The transcript in ``youtube_file`` is split into sentences and cut into
    windows of ``calculate_segment_length(...)`` sentences, with consecutive
    windows sharing ``overlap`` sentences. A segment is labelled 1 when
    ``check_match`` finds a match against any of the clip transcripts in
    ``tiktok_files`` (the search stops at the first matching clip), else 0.
    For every segment labelled 1 the matching sentence list is printed.

    Args:
        youtube_file: Path of the long transcript file.
        tiktok_files: Iterable of paths of the clip transcript files.
        overlap: Number of sentences shared by consecutive segments.

    Returns:
        A list of ``(segment, label)`` pairs, where ``segment`` is a list of
        sentences and ``label`` is 0 or 1. Empty for an empty transcript.
    """
    # Read YouTube transcript
    youtube_sentences = sent_tokenize(read_transcript(youtube_file))

    # Calculate segment_length based on tokens
    segment_length = calculate_segment_length(youtube_sentences)

    # Segment YouTube transcript with overlap. The stride is clamped to at
    # least 1: for short transcripts segment_length can be <= overlap, and a
    # zero step makes range() raise ValueError while a negative step yields
    # no segments at all.
    step = max(1, segment_length - overlap)
    segments = [
        youtube_sentences[start:start + segment_length]
        for start in range(0, len(youtube_sentences), step)
    ]

    # Read TikTok transcripts, one sentence list per clip
    tiktok_clips = [sent_tokenize(read_transcript(path)) for path in tiktok_files]

    # Label segments
    labels = []
    for segment in segments:
        label = 0
        matching_sentences = []
        for clip in tiktok_clips:
            match, match_sentences = check_match(segment, clip)
            if match:
                label = 1
                matching_sentences.extend(match_sentences)
                break  # one matching clip is enough to label the segment
        labels.append(label)

        # Print the matching sentences if label is 1
        if label == 1:
            print(f"Matching sentences: {matching_sentences}")

    # Output labeled segments
    return list(zip(segments, labels))


# Input locations: every transcript for this episode lives in one directory
episode_dir = "/Users/rossjackson/Documents/VideoAnalysisProject/New_Project_Directory/TikTok:YT_Audios/#24"
youtube_file = f"{episode_dir}/#24 Trust ｜ Cheating Partners, Death, Knife crime and being let down.txt"
# Clip transcripts are numbered yt_24_1.txt through yt_24_5.txt
tiktok_files = [f"{episode_dir}/yt_24_{clip_no}.txt" for clip_no in range(1, 6)]

# Run the labelling; matching sentences are printed as they are found
labeled_segments = label_segments(youtube_file, tiktok_files)

# Optional verification loop; it requires label_segments to return
# (segment, label) pairs.
#for i, (segment, label) in enumerate(labeled_segments):
    #print(f"Segment {i+1} (Label: {label}):\n{' '.join(segment)}\n")


Matching sentences: ["That's what we kind of put death to the back of our Minds."]
Matching sentences: ["No,  You didn't say it like that."]
Matching sentences: ['Is  She always making your sandwich with not enough ketchup in it?']
Matching sentences: ["Yeah, I'd love a sandwich."]
Matching sentences: ['You you want a barbecue sauce on it?']
Matching sentences: ["No, I wouldn't put anything past them."]
Matching sentences: ["No, I wouldn't put anything past them."]
Matching sentences: ["I'd be like, what the hell's going on?"]
Matching sentences: ["Are  You telling me I'm lying?"]
Matching sentences: ["He's like, yeah, sorry."]
Matching sentences: ["I'd be like, did you not see the size of it?"]
Matching sentences: ["That's what we kind of put death to the back of our Minds."]
Matching sentences: ['How long do I get spent on TikTok?']
Matching sentences: ["There's no need to post it on TikTok so it randomly pops up on my phone."]
