In [41]:
import re
import os
from fuzzywuzzy import fuzz
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def read_file(file_path):
    with open(file_path, 'r') as f:
        return f.read()

def write_file(file_path, content):
    with open(file_path, 'w') as f:
        f.write(content)

def find_tiktok_segments(youtube_transcript, tiktok_transcripts, threshold=0.65):
    tiktok_positions = []
    youtube_sentences = sent_tokenize(youtube_transcript)
    
    vectorizer = TfidfVectorizer()
    
    for tiktok in tiktok_transcripts:
        tiktok_sentences = sent_tokenize(tiktok)
        
        for i in range(len(youtube_sentences) - len(tiktok_sentences) + 1):
            segment = youtube_sentences[i:i + len(tiktok_sentences)]
            segment_str = ' '.join(segment)
            
            tfidf_matrix = vectorizer.fit_transform([segment_str, tiktok])
            similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
            
            if similarity >= threshold:
                start_pos = sum(len(s) for s in youtube_sentences[:i]) + i
                end_pos = start_pos + len(tiktok)
                tiktok_positions.append((start_pos, end_pos))
                break
                
    return tiktok_positions

def segment_and_label_transcript(youtube_transcript, tiktok_positions):
    tiktok_positions.sort()
    segments = []
    labels = []
    last_end = 0
    
    for start, end in tiktok_positions:
        if start > last_end:
            segments.append(youtube_transcript[last_end:start])
            labels.append("Non Tiktok - 0")
        
        segments.append(youtube_transcript[start:end])
        labels.append("Tiktok - 1")
        
        last_end = end
    
    if last_end < len(youtube_transcript):
        segments.append(youtube_transcript[last_end:])
        labels.append("Non Tiktok - 0")
    
    labeled_segments = [f"{label}:\n{segment}\n" for label, segment in zip(labels, segments)]
    
    return labeled_segments

# File paths
youtube_transcript_path = "/Users/rossjackson/Documents/VideoAnalysisProject/New_Project_Directory/TikTok:YT_Audios/#51/#51 Would you lie for fame_55716-10-08_Powered by notta.ai.txt"
tiktok_paths = [
    "/Users/rossjackson/Documents/VideoAnalysisProject/New_Project_Directory/TikTok:YT_Audios/#51/yt_51_1_55725-06-17_Powered by notta.ai.txt",
    "/Users/rossjackson/Documents/VideoAnalysisProject/New_Project_Directory/TikTok:YT_Audios/#51/yt_51_2_55725-06-17_Powered by notta.ai.txt",
    "/Users/rossjackson/Documents/VideoAnalysisProject/New_Project_Directory/TikTok:YT_Audios/#51/yt_51_3_55725-06-17_Powered by notta.ai.txt",
    "/Users/rossjackson/Documents/VideoAnalysisProject/New_Project_Directory/TikTok:YT_Audios/#51/yt_51_4_55725-06-17_Powered by notta.ai.txt"
]

# Read files
youtube_transcript = read_file(youtube_transcript_path)
tiktok_transcripts = read_file(tiktok_transcripts_path).split('\n')

# Find TikTok segments
tiktok_positions = find_tiktok_segments(youtube_transcript, tiktok_transcripts)

# Segment and label the YouTube transcript
labeled_segments = segment_and_label_transcript(youtube_transcript, tiktok_positions)

# Write to file
output_path = os.path.splitext(youtube_transcript_path)[0] + "_labeled.txt"
write_file(output_path, '\n'.join(labeled_segments))

print(f"Labeled segments have been written to {output_path}")


FileNotFoundError: [Errno 2] No such file or directory: '/path/to/your/tiktok_transcripts.txt'

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to read file
def read_file(file_path):
    with open(file_path, 'r') as f:
        return f.read()

# Paths to your TikTok and YouTube transcripts
tiktok_path = "/Users/rossjackson/Documents/VideoAnalysisProject/New_Project_Directory/TikTok:YT_Audios/#24/yt_24_2_55725-05-31_Powered by notta.ai.txt"
youtube_path = "/Users/rossjackson/Documents/VideoAnalysisProject/New_Project_Directory/TikTok:YT_Audios/#24/#24 Trust ｜ Cheating Partners, Death, Knife crime and being let down [ZNuYTm4G6RM]_55716-10-19_Powered by notta.ai.txt"

# Read the transcripts
tiktok_transcript = read_file(tiktok_path)
youtube_transcript = read_file(youtube_path)

# Get the length of the TikTok transcript in terms of words
tiktok_length = len(tiktok_transcript.split())

# Split the YouTube transcript into individual words
youtube_words = youtube_transcript.split()

from nltk.tokenize import sent_tokenize

# ... (previous code remains the same)

# Get the last sentence of the TikTok transcript
last_sentence_tiktok = sent_tokenize(tiktok_transcript)[-1]

# Initialize variables to keep track of the most similar ending sentence
max_sentence_similarity = 0
most_similar_sentence = ""

# Find the starting index of the most similar segment in the YouTube transcript
if most_similar_segment:
    start_index = youtube_transcript.find(most_similar_segment)

    # Take a larger portion of the YouTube transcript starting from this index
    extended_segment = youtube_transcript[start_index:start_index + len(most_similar_segment) * 2]

    # Tokenize the extended segment into sentences
    segment_sentences = sent_tokenize(extended_segment)

    # Vectorize the last sentence of the TikTok transcript and the sentences in the extended segment
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([last_sentence_tiktok] + segment_sentences)

    # Calculate Cosine Similarity for sentences
    sentence_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    # Find the most similar ending sentence
    most_similar_sentence_index = sentence_similarities.argmax()
    most_similar_sentence = segment_sentences[most_similar_sentence_index]

    # Create the final segment starting from the first word and ending at the most similar sentence
    final_segment_start = most_similar_segment.find(first_word_tiktok)
    final_segment_end = extended_segment.find(most_similar_sentence) + len(most_similar_sentence)
    final_segment = extended_segment[final_segment_start:final_segment_end]

    print("Final matched segment from YouTube transcript:", final_segment)
else:
    print("No segment found with similarity above the threshold.")


# ... (previous code remains the same)

if most_similar_segment:
    print(f"Most similar segment: {most_similar_segment}")  # Debugging line
    start_index = youtube_transcript.find(most_similar_segment)
    print(f"Start index: {start_index}")  # Debugging line
    
    if start_index != -1:
        extended_segment = youtube_transcript[start_index:start_index + len(most_similar_segment) * 2]
        print(f"Extended segment: {extended_segment}")  # Debugging line

        # ... (rest of the code)



ValueError: Found array with 0 sample(s) (shape=(0, 2)) while a minimum of 1 is required by check_pairwise_arrays.

In [26]:
if most_similar_segment:
    print(f"Most similar segment: {most_similar_segment}")  # Debugging line
    start_index = youtube_transcript.find(most_similar_segment)
    print(f"Start index: {start_index}")  # Debugging line
    
    if start_index != -1:
        extended_segment = youtube_transcript[start_index:start_index + len(most_similar_segment) * 2]
        print(f"Extended segment: {extended_segment}")  # Debugging line

        # ... (rest of the code)

Most similar segment: Could you stay with someone if they cheated on you? The pure convenience. I just can't be asked what's breaking up. But then I'd be like, then you would, then it would be like spring me a sandwich with the one cent of me thing of ketchup. And I'd be like, hey! Yeah, they're always in your debt now. They're always in your debt. Everything, everything. What's the, what's the good thing? You be nitpicking. They'll bring you right about. I want it brown sauce! I want it brown sauce! You did it, you did it. Are you telling me I'm lying? Are you telling me I'm lying? Yeah. Yeah, remember your antics from a year ago. Remember! Well, 15 years ago, that time you sent me that man! Now I'm getting ketchup on my brown sauce! You're a filthy whore! You're a filthy whore! Ah! Ah! Oh! Oh God, I can't say that. No, I fucking, no. If you cheated on me, that's what, that's, that'd be a name! Yeah, yeah. I'd be happy to have been like, I'd be coming. You never forget it. No, of cour

In [27]:
# Trim extra whitespaces
most_similar_segment = " ".join(most_similar_segment.split())
youtube_transcript = " ".join(youtube_transcript.split())

# Then try to find the index again
start_index = youtube_transcript.find(most_similar_segment)
print(f"Start index after trimming: {start_index}")


Start index after trimming: 34297


In [28]:
# ... (previous code remains the same)

if start_index != -1:
    extended_segment = youtube_transcript[start_index:start_index + len(most_similar_segment) * 2]
    print(f"Extended segment: {extended_segment[:100]}...")  # Debugging line, showing first 100 characters

    # Tokenize the extended segment into sentences
    segment_sentences = sent_tokenize(extended_segment)

    # Vectorize the last sentence of the TikTok transcript and the sentences in the extended segment
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([last_sentence_tiktok] + segment_sentences)

    # Calculate Cosine Similarity for sentences
    sentence_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    # Find the most similar ending sentence
    most_similar_sentence_index = sentence_similarities.argmax()
    most_similar_sentence = segment_sentences[most_similar_sentence_index]

    # Create the final segment starting from the first word and ending at the most similar sentence
    final_segment_start = most_similar_segment.find(first_word_tiktok)
    final_segment_end = extended_segment.find(most_similar_sentence) + len(most_similar_sentence)
    final_segment = extended_segment[final_segment_start:final_segment_end]

    print("Final matched segment from YouTube transcript:", final_segment)
else:
    print("No segment found with similarity above the threshold.")


Extended segment: Could you stay with someone if they cheated on you? The pure convenience. I just can't be asked what...
Final matched segment from YouTube transcript: Could you stay with someone if they cheated on you? The pure convenience. I just can't be asked what's breaking up. But then I'd be like, then you would, then it would be like spring me a sandwich with the one cent of me thing of ketchup. And I'd be like, hey! Yeah, they're always in your debt now. They're always in your debt. Everything, everything. What's the, what's the good thing? You be nitpicking. They'll bring you right about. I want it brown sauce! I want it brown sauce! You did it, you did it. Are you telling me I'm lying? Are you telling me I'm lying? Yeah. Yeah, remember your antics from a year ago. Remember! Well, 15 years ago, that time you sent me that man! Now I'm getting ketchup on my brown sauce! You're a filthy whore! You're a filthy whore! Ah! Ah! Oh! Oh God, I can't say that. No, I fucking, no. If yo