In [21]:
import os
import pandas as pd
import google
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from bs4 import BeautifulSoup
from datetime import datetime
from yt_dlp import YoutubeDL
import re

In [22]:
YoutubeDL?

[0;31mInit signature:[0m [0mYoutubeDL[0m[0;34m([0m[0mparams[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mauto_init[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
YoutubeDL class.

YoutubeDL objects are the ones responsible of downloading the
actual video file and writing it to disk if the user has requested
it, among some other tasks. In most cases there should be one per
program. As, given a video URL, the downloader doesn't know how to
extract all the needed information, task that InfoExtractors do, it
has to pass the URL to one of them.

For this, YoutubeDL objects have a method that allows
InfoExtractors to be registered in a given order. When it is passed
a URL, the YoutubeDL object handles it to the first InfoExtractor it
finds that reports being able to handle it. The InfoExtractor extracts
all the information about the video or videos the URL refers to, and
YoutubeDL process the extracted information, possibly using a Fi

In [6]:
def parse_watch_history(html_file):
    if not os.path.exists(html_file):
        raise FileNotFoundError(f"Cannot find {html_file}")

    with open(html_file, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'lxml')

    # Find all watch history entries
    entries = soup.find_all('div', class_='content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1')

    data = []
    for entry in entries:
        # Extract the link and title
        link = div.find('a', href=re.compile(r'https://www.youtube.com/watch\?v=.+'))
        if link:
            meta = get_metadata(link['href'])


        # Extract the timestamp
        timestamp_tag = entry.find('span')
        if timestamp_tag:
            timestamp_str = timestamp_tag.get_text(strip=True)
            # Parse the timestamp string to a datetime object
            try:
                # Example format: 'Mon, 01 Jan 2023 12:34:56 GMT'
                timestamp = datetime.strptime(timestamp_str, '%a, %d %b %Y %H:%M:%S GMT')
            except ValueError:
                timestamp = None
        else:
            timestamp = None

        # Extract video ID from URL
        # video_id = extract_video_id(url) if url else None

        data.append({
            'title': title,
            'url': url,
            # 'video_id': video_id,
            'time': timestamp
        })

    # Convert to DataFrame
    df = pd.DataFrame(data)
    # Drop entries without video_id or timestamp
    # df = df.dropna(subset=['video_id', 'time']).reset_index(drop=True)
    return df

In [7]:
# Path to your watch-history.html file
WATCH_HISTORY_FILE = '/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Takeout/YouTube and YouTube Music/history/watch-history.html'
temp=parse_watch_history(WATCH_HISTORY_FILE)

In [8]:
temp

Unnamed: 0,title,url,time
0,https://music.youtube.com/watch?v=JKlYOUfviXM,https://music.youtube.com/watch?v=JKlYOUfviXM,
1,https://music.youtube.com/watch?v=K1FlAphL2p8,https://music.youtube.com/watch?v=K1FlAphL2p8,
2,https://music.youtube.com/watch?v=u3iR6FP2RpU,https://music.youtube.com/watch?v=u3iR6FP2RpU,
3,https://music.youtube.com/watch?v=W5LIWrArBuA,https://music.youtube.com/watch?v=W5LIWrArBuA,
4,https://music.youtube.com/watch?v=H3Kzh6RrnMc,https://music.youtube.com/watch?v=H3Kzh6RrnMc,
...,...,...,...
50695,STRANGE LIGHTS Caught on CAMERA 😱 | Wholesome ...,https://www.youtube.com/watch?v=F5o2H3hNERA,
50696,Her dog put his paw prints in the concrete at ...,https://www.youtube.com/watch?v=N4SM5tLDv3Q,
50697,✅ Best and ❌ Worst way to take a Nap 😴 #nap #s...,https://www.youtube.com/watch?v=ydVy6-zNVkY,
50698,What are the two stages of memory formation?,https://www.youtube.com/watch?v=f-podo-f8Ak,


In [10]:
html_file = WATCH_HISTORY_FILE
if not os.path.exists(html_file):
    raise FileNotFoundError(f"Cannot find {html_file}")

with open(html_file, 'r', encoding='utf-8') as file:
    soup = BeautifulSoup(file, 'lxml')

# Find all watch history entries
entries = soup.find_all('div', class_='content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1')

In [14]:
entries = soup.find_all('class')

In [16]:
j =0
for i in entries:
    print(i)
    if j>=20:
        break
    j+=1

In [23]:
import sys
import csv
import os
import re
from bs4 import BeautifulSoup
from yt_dlp import YoutubeDL
from datetime import datetime
from colorama import Fore, Style

def get_metadata(url):
    ydl_opts = {
        'quiet': True,
        'no_warnings': True,
        'skip_download': True,
        'forceurl': True,
        'forcetitle': True,
        'forcedescription': True,
        'writeinfojson': True,
        'simulate': True,
        'youtube_include_dash_manifest': False
    }

    with YoutubeDL(ydl_opts) as ydl:
        try:
            meta = ydl.extract_info(url, download=False)
            return meta
        except Exception:
            print(f"{Fore.RED}Failed to get metadata for {url}{Style.RESET_ALL}")
            return None

def parse_html(input_file, output_file, resume=False):
    with open(input_file, "r", encoding = "utf-8") as f:
        contents = f.read()

    soup = BeautifulSoup(contents, 'lxml')
    divs = soup.find_all('div', {'class': 'content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1'})

    if resume:
        last_processed_index = get_last_processed_index(output_file)
        if last_processed_index is None:
            print(f"{Fore.YELLOW}Unable to determine the last processed index. Resuming from the beginning.{Style.RESET_ALL}")
            last_processed_index = 0
        else:
            last_processed_index += 1
        divs = divs[last_processed_index:]

    # Open CSV writer in append mode
    with open(output_file, 'a', newline='') as file:
        writer = csv.writer(file)

        try:
            for index, div in enumerate(divs, last_processed_index):
                link = div.find('a', href=re.compile(r'https://www.youtube.com/watch\?v=.+'))
                if link:
                    print(f"{Fore.GREEN}Parsing: {link['href']}{Style.RESET_ALL}")
                    meta = get_metadata(link['href'])

                    text_list = div.text.split('\n')
                    timestamp_text = None
                    for text in text_list:
                        if "WIB" in text:
                            timestamp_text = text.split("WIB")[0].strip().replace(',', '').replace('\xa0', ' ')
                            break

                    if timestamp_text:
                        try:
                            dt_obj = datetime.strptime(timestamp_text, '%b %d %Y %I:%M:%S %p')
                            timestamp = dt_obj.strftime('%Y-%m-%d %H:%M:%S')
                        except Exception as e:
                            timestamp = ''
                            print(link,e)
                            continue

                        # Get the album, artist, and track details
                        artist = ""
                        track = ""
                        album = ""
                        duration = ""
                        if meta:
                            if 'artist' in meta:
                                artist = meta['artist']
                            if 'track' in meta:
                                track = meta['track']
                            if 'album' in meta:
                                album = meta['album']
                            if 'duration' in meta:
                                duration = str(meta['duration'])

                        # Skip the div if artist or track is empty
                        if not artist or not track:
                            print(f"{Fore.YELLOW}Skipping empty artist or track{Style.RESET_ALL}")
                            continue

                        row = [artist, track, album, timestamp, artist, duration]
                        writer.writerow(row)
                        print(row)

                # Save the index of the last processed div
                save_last_processed_index(output_file, index)

        except KeyboardInterrupt:
            print(f"{Fore.RED}Parsing interrupted by user.{Style.RESET_ALL}")

    print(f"{Fore.GREEN}Parsing complete.{Style.RESET_ALL}")

def get_last_processed_index(output_file):
    progress_file = f"{output_file}.progress"
    if os.path.exists(progress_file):
        with open(progress_file, 'r') as file:
            last_index = file.read()
            if last_index.isdigit():
                return int(last_index)
    return None

def save_last_processed_index(output_file, index):
    progress_file = f"{output_file}.progress"
    with open(progress_file, 'w') as file:
        file.write(str(index))

def main(input_file='/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Takeout/YouTube and YouTube Music/history/tidy-watch-history.html', output_file='/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Takeout/YouTube and YouTube Music/history/temp.csv'):
    parse_html(input_file, output_file, resume=True)


if __name__ == "__main__":
    # main(sys.argv[1], sys.argv[2])
    main()

[33mUnable to determine the last processed index. Resuming from the beginning.[0m
[32mParsing: https://www.youtube.com/watch?v=Rq7QIvZCLpg[0m
[32mParsing: https://www.youtube.com/watch?v=vUYp74USacI[0m
[32mParsing: https://www.youtube.com/watch?v=KJ-bfcCLNZU[0m
[32mParsing: https://www.youtube.com/watch?v=T_ihqlYWEhk[0m
[32mParsing: https://www.youtube.com/watch?v=wgVNUAm5WOc[0m
[32mParsing: https://www.youtube.com/watch?v=lA1JrYe7JY8[0m
[32mParsing: https://www.youtube.com/watch?v=boiOYZADJIk[0m
[32mParsing: https://www.youtube.com/watch?v=4rAkT5meaFY[0m
[32mParsing: https://www.youtube.com/watch?v=Xiyx3e24csA[0m
[32mParsing: https://www.youtube.com/watch?v=frNPBOfJOqI[0m
[32mParsing: https://www.youtube.com/watch?v=lPrjP4A_X4s[0m
[32mParsing: https://www.youtube.com/watch?v=Tl8sKU7w_g8[0m
[32mParsing: https://www.youtube.com/watch?v=gbSRMwqPBuI[0m
[32mParsing: https://www.youtube.com/watch?v=2X7wem572ws[0m
[32mParsing: https://www.youtube.com/watch?v=Zm

ERROR: [youtube] FMnVvWnIFGs: Private video. Sign in if you've been granted access to this video


[31mFailed to get metadata for https://www.youtube.com/watch?v=FMnVvWnIFGs[0m
[32mParsing: https://www.youtube.com/watch?v=L-TjcdTdpZE[0m


ERROR: [youtube] L-TjcdTdpZE: Video unavailable. This video is no longer available due to a copyright claim by Dreamworks Animation


[31mFailed to get metadata for https://www.youtube.com/watch?v=L-TjcdTdpZE[0m
[32mParsing: https://www.youtube.com/watch?v=wVGig5cwt2g[0m
[32mParsing: https://www.youtube.com/watch?v=xAzidSLeOws[0m
[32mParsing: https://www.youtube.com/watch?v=1jmcUhrKfCI[0m
[32mParsing: https://www.youtube.com/watch?v=4oRrE4c3DNU[0m
[32mParsing: https://www.youtube.com/watch?v=oawM51GHZFQ[0m
[32mParsing: https://www.youtube.com/watch?v=RhoZYOI_OuE[0m
[32mParsing: https://www.youtube.com/watch?v=C4Vk-xEAi5s[0m
[32mParsing: https://www.youtube.com/watch?v=TWSI9uWPTcM[0m
[32mParsing: https://www.youtube.com/watch?v=jRw6I-bpoK0[0m
[32mParsing: https://www.youtube.com/watch?v=7cQ_db86UNA[0m
[32mParsing: https://www.youtube.com/watch?v=g66eNnZyj6k[0m
[32mParsing: https://www.youtube.com/watch?v=4mq9fN1Lg0A[0m
[32mParsing: https://www.youtube.com/watch?v=oTEROXKwGGI[0m
[32mParsing: https://www.youtube.com/watch?v=UD1QJzDp3kQ[0m
[32mParsing: https://www.youtube.com/watch?v=bX_E03

ERROR: [youtube] hhy2c_3W-KI: Video unavailable. This video is no longer available due to a copyright claim by Storyful Managed


[31mFailed to get metadata for https://www.youtube.com/watch?v=hhy2c_3W-KI[0m
[32mParsing: https://www.youtube.com/watch?v=6_Y74xuaplY[0m
[32mParsing: https://www.youtube.com/watch?v=3VJvK2TSkwc[0m
[32mParsing: https://www.youtube.com/watch?v=2UuElFsfnvk[0m
[32mParsing: https://www.youtube.com/watch?v=s3Kjpvp_APo[0m
[32mParsing: https://www.youtube.com/watch?v=sSdsZgS8kKE[0m
[32mParsing: https://www.youtube.com/watch?v=wV0XQEqilts[0m
[32mParsing: https://www.youtube.com/watch?v=FeSatb2Gong[0m
[32mParsing: https://www.youtube.com/watch?v=ll5leGSzGw0[0m
[32mParsing: https://www.youtube.com/watch?v=mXJlnwoJg3o[0m
[32mParsing: https://www.youtube.com/watch?v=u7Za1haeVJ0[0m
[32mParsing: https://www.youtube.com/watch?v=WSu0PuHpr-I[0m
[32mParsing: https://www.youtube.com/watch?v=-BiX9RcdiqU[0m
[32mParsing: https://www.youtube.com/watch?v=oHBv6iYUx-s[0m
[32mParsing: https://www.youtube.com/watch?v=lbfqmyn6TFM[0m
[32mParsing: https://www.youtube.com/watch?v=XtQYZn

ERROR: [youtube] 1AdTXYSjTto: Video unavailable. This video is no longer available due to a copyright claim by Indian Premier League


[31mFailed to get metadata for https://www.youtube.com/watch?v=1AdTXYSjTto[0m
[32mParsing: https://www.youtube.com/watch?v=V-m7zEkRRPY[0m
[32mParsing: https://www.youtube.com/watch?v=OlnP2UEAmn4[0m
[32mParsing: https://www.youtube.com/watch?v=xN_MTnUOFG0[0m
[32mParsing: https://www.youtube.com/watch?v=YUHoUXmY2E8[0m
[32mParsing: https://www.youtube.com/watch?v=7rUgKzBOZ8k[0m
[32mParsing: https://www.youtube.com/watch?v=louS1YzckqA[0m
[32mParsing: https://www.youtube.com/watch?v=W7Awv5r3xeg[0m
[32mParsing: https://www.youtube.com/watch?v=8c6ZYiwiPl0[0m
[32mParsing: https://www.youtube.com/watch?v=gA3TjGIVSZ4[0m
[32mParsing: https://www.youtube.com/watch?v=D2fC_Nrsvu8[0m
[32mParsing: https://www.youtube.com/watch?v=PR6cmnfuEnU[0m
[32mParsing: https://www.youtube.com/watch?v=JGl5DbVwSfY[0m
[32mParsing: https://www.youtube.com/watch?v=9I5hAefAVSA[0m
[32mParsing: https://www.youtube.com/watch?v=C9Us_MMt1_w[0m
[32mParsing: https://www.youtube.com/watch?v=_Q3cmx

ERROR: [youtube] z-zqn3qXqc8: Video unavailable


[31mFailed to get metadata for https://www.youtube.com/watch?v=z-zqn3qXqc8[0m
[32mParsing: https://www.youtube.com/watch?v=7y81KUQBwr4[0m
[32mParsing: https://www.youtube.com/watch?v=T0kWcDKggqw[0m
[32mParsing: https://www.youtube.com/watch?v=Khd6xbYp0KQ[0m
[32mParsing: https://www.youtube.com/watch?v=Z2eWbMgVlgQ[0m
[32mParsing: https://www.youtube.com/watch?v=-u03ogbdzfw[0m
[32mParsing: https://www.youtube.com/watch?v=7j5jSmNJ73U[0m
[32mParsing: https://www.youtube.com/watch?v=Xo_Rjv_wQLU[0m
[32mParsing: https://www.youtube.com/watch?v=8-a70-w23FY[0m
[32mParsing: https://www.youtube.com/watch?v=ErOtgWIBGEw[0m


ValueError: time data 'Watched https://www.youtube.com/watch?v=ErOtg' does not match format '%b %d %Y %I:%M:%S %p'