## Data-preprocessing

### 1. Downloading the Youtube transcripts

In [2]:
import os
import pandas as pd
import yt_dlp
import json
from webvtt import WebVTT

In [None]:
#downloads youtube video subtitles only
def download_subtitles_only(playlist_url, max_videos=60, output_dir='transcripts'):
    os.makedirs(output_dir, exist_ok=True)

    ydl_opts = {
        'skip_download': True,  # Do not download video or audio
        'writesubtitles': True,  # Download uploaded subtitles
        'writeautomaticsub': True,  # Fallback to auto-generated if no uploaded subs
        'subtitleslangs': ['en'],  # English only
        'playlistend': max_videos,
        'outtmpl': f'{output_dir}/%(playlist_index)s - %(title)s.%(ext)s',
        'quiet': False,
        'ignoreerrors': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([playlist_url])

    # After download, convert all .vtt files in output_dir to JSON and TXT
    for filename in os.listdir(output_dir):
        if filename.endswith(".vtt"):
            vtt_path = os.path.join(output_dir, filename)
            base_name = filename.rsplit('.', 1)[0]

            # Load .vtt captions
            captions = []
            for caption in WebVTT().read(vtt_path):
                captions.append({
                    "start": caption.start,  # string like "00:00:01.000"
                    "end": caption.end,
                    "text": caption.text.strip()
                })

            # Save JSON
            json_path = os.path.join(output_dir, base_name + ".json")
            with open(json_path, "w", encoding="utf-8") as jf:
                json.dump(captions, jf, indent=2, ensure_ascii=False)

            # Save plain text (concatenate all captions)
            txt_path = os.path.join(output_dir, base_name + ".txt")
            with open(txt_path, "w", encoding="utf-8") as tf:
                for caption in captions:
                    tf.write(caption["text"] + "\n")

            print(f"Converted {filename} to {json_path} and {txt_path}")

# Example usage
playlist_url = "https://www.youtube.com/playlist?list=PLnaXrumrax3X8_6L1yL3cejSMH9oTpxiI"
download_subtitles_only(playlist_url)


[youtube:tab] Extracting URL: https://www.youtube.com/playlist?list=PLnaXrumrax3X8_6L1yL3cejSMH9oTpxiI
[youtube:tab] PLnaXrumrax3X8_6L1yL3cejSMH9oTpxiI: Downloading webpage
[youtube:tab] PLnaXrumrax3X8_6L1yL3cejSMH9oTpxiI: Redownloading playlist API JSON with unavailable videos
[download] Downloading playlist: StarTalk Podcast
[youtube:tab] Playlist StarTalk Podcast: Downloading 60 items of 414
[download] Downloading item 1 of 60
[youtube] Extracting URL: https://www.youtube.com/watch?v=fplTT3IrxDw
[youtube] fplTT3IrxDw: Downloading webpage
[youtube] fplTT3IrxDw: Downloading tv client config
[youtube] fplTT3IrxDw: Downloading tv player API JSON
[youtube] fplTT3IrxDw: Downloading ios player API JSON
[youtube] fplTT3IrxDw: Downloading m3u8 information
[info] fplTT3IrxDw: Downloading subtitles: en
[info] fplTT3IrxDw: Downloading 1 format(s): 401+251
[info] Writing video subtitles to: transcripts/01 - Is The Universe Made of Tiny Vibrating Strings？ With Lara Anderson.en.vtt
[download] Dest

ERROR: [youtube] eEWLGBwukzQ: Video unavailable. This video has been removed by the uploader


[download] Downloading item 8 of 60
[youtube] Extracting URL: https://www.youtube.com/watch?v=Mo3A4MgFDuE
[youtube] Mo3A4MgFDuE: Downloading webpage
[youtube] Mo3A4MgFDuE: Downloading tv client config
[youtube] Mo3A4MgFDuE: Downloading tv player API JSON
[youtube] Mo3A4MgFDuE: Downloading ios player API JSON
[youtube] Mo3A4MgFDuE: Downloading m3u8 information
[info] Mo3A4MgFDuE: Downloading subtitles: en
[info] Mo3A4MgFDuE: Downloading 1 format(s): 401+251
[info] Writing video subtitles to: transcripts/08 - Fact-Checking Medical Claims on the Internet with Dr. Noc.en.vtt
[download] Destination: transcripts/08 - Fact-Checking Medical Claims on the Internet with Dr. Noc.en.vtt
[download] 100% of  676.54KiB in 00:00:00 at 2.25MiB/s
[download] Downloading item 9 of 60
[youtube] Extracting URL: https://www.youtube.com/watch?v=s7EMTpogKCQ
[youtube] s7EMTpogKCQ: Downloading webpage
[youtube] s7EMTpogKCQ: Downloading tv client config
[youtube] s7EMTpogKCQ: Downloading tv player API JSON
[youtu

### 2. Convert text to a Pandas DataFrame

In [None]:
def load_txt_transcripts_to_df(transcript_dir):
    all_rows = []

    for filename in os.listdir(transcript_dir):
        if filename.endswith('.txt'):
            filepath = os.path.join(transcript_dir, filename)
            with open(filepath, 'r', encoding='utf-8') as f:
                lines = f.readlines()
            # Clean lines: strip newline & spaces
            lines = [line.strip() for line in lines if line.strip() != '']

            # Append rows with source filename
            for line in lines:
                all_rows.append({
                    'text': line,
                    'source_file': filename
                })

    df = pd.DataFrame(all_rows)
    return df
#loads transcripts to directory transcripts
load_txt_transcripts_to_df('/Users/test/Desktop/ironhack_labs/YouTube_ChatBot_Final/datasets/transcripts')

Unnamed: 0,text,source_file
0,so planets become more interesting moons,40 - Neil deGrasse Tyson and Bill Nye Catch Up...
1,so planets become more interesting moons,40 - Neil deGrasse Tyson and Bill Nye Catch Up...
2,become places to go and revisit but,40 - Neil deGrasse Tyson and Bill Nye Catch Up...
3,become places to go and revisit but,40 - Neil deGrasse Tyson and Bill Nye Catch Up...
4,become places to go and revisit but,40 - Neil deGrasse Tyson and Bill Nye Catch Up...
...,...,...
230158,edition. Neil deGrasse Tyson here. As,"12 - Unpacking Einstein’s Greatest Papers, wit..."
230159,"always, I bid you to keep looking up.","12 - Unpacking Einstein’s Greatest Papers, wit..."
230160,"always, I bid you to keep looking up.","12 - Unpacking Einstein’s Greatest Papers, wit..."
230161,"always, I bid you to keep looking up.","12 - Unpacking Einstein’s Greatest Papers, wit..."


### 3. Remove immediate repetitions in the txt files

In [None]:
# #removes repetitions in the txt file, but only if the repetition immediately follows the original sentiment. 
# his way we do not lose this rephrasing later in a different context, and the context is still balanced

def clean_repeated_lines_df_and_save(df, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    
    # Process per source file
    for filename, group in df.groupby('source_file'):
        lines = group['text'].tolist()
        
        cleaned_lines = []
        prev_line = None
        
        for line in lines:
            line_stripped = line.strip()
            if line_stripped != prev_line:
                cleaned_lines.append(line_stripped)
                prev_line = line_stripped
            else:
                # Skip immediate repeated line
                continue
        
        # Save cleaned lines to new file in output_dir with same filename
        output_path = os.path.join(output_dir, filename)
        with open(output_path, 'w', encoding='utf-8') as f:
            for line in cleaned_lines:
                f.write(line + '\n')
        
        print(f"Cleaned text saved to: {output_path}")


# Paths
transcript_dir = '/Users/test/Desktop/ironhack_labs/YouTube_ChatBot_Final/datasets/transcripts'
output_clean_dir = '/Users/test/Desktop/ironhack_labs/YouTube_ChatBot_Final/datasets/cleaned_transcripts'

### 4. Preprocess the txts

In [None]:
def load_txt_files_to_dataframe(directory_path):
    all_rows = []

    for filename in os.listdir(directory_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(directory_path, filename)
            with open(file_path, 'r', encoding='utf-8') as f:
                lines = f.readlines()
            
            # Clean and store lines with metadata
            for line in lines:
                line = line.strip()
                if line:  # skip empty lines
                    all_rows.append({
                        "text": line,
                        "source_file": filename
                    })

    df = pd.DataFrame(all_rows)
    return df

# Usage
directory = "/Users/test/Desktop/ironhack_labs/YouTube_ChatBot_Final/datasets/cleaned_transcripts"
dataframe = load_txt_files_to_dataframe(directory)

# Preview the result
print(dataframe.head())


                                       text  \
0  so planets become more interesting moons   
1       become places to go and revisit but   
2     there was a whole other goal and that   
3       was the search for intelligent life   
4     still is in the universe oh man it is   

                                         source_file  
0  40 - Neil deGrasse Tyson and Bill Nye Catch Up...  
1  40 - Neil deGrasse Tyson and Bill Nye Catch Up...  
2  40 - Neil deGrasse Tyson and Bill Nye Catch Up...  
3  40 - Neil deGrasse Tyson and Bill Nye Catch Up...  
4  40 - Neil deGrasse Tyson and Bill Nye Catch Up...  


### Pickle the dataframe for further use

In [None]:
# Assuming `docs` is a list of LangChain Document objects
dataframe.to_pickle("/Users/test/Desktop/ironhack_labs/YouTube_ChatBot_Final/datasets/dataframe.pkl")