In [2]:
from dotenv import load_dotenv
from google.oauth2 import service_account
from googleapiclient.discovery import build
import json
import os
import pandas as pd
import re
from pathlib import Path
import time

## JSON History Files

In [7]:
def get_videometa(video_id, youtube=None):
    """Fetch metadata for a YouTube video and return it as a DataFrame.

    Parameters
    ----------
    video_id : str
        Value passed as the ``id`` argument of ``videos().list``.
    youtube : optional
        An already-built YouTube Data API client. When omitted, a client is
        built from the service-account file configured below. Passing one in
        lets a caller that loops over many ids reuse a single client instead
        of rebuilding it on every call.

    Returns
    -------
    pandas.DataFrame
        One row per item in the API response with the columns ``id``,
        ``title``, ``description``, ``publishedAt``, ``channelId``,
        ``channelTitle``, ``viewCount``, ``likeCount``, ``duration``,
        ``privacyStatus`` and ``tags``. The frame has zero rows (but the same
        columns) when the response contains no items.
    """
    if youtube is None:
        api_service_name = "youtube"
        api_version = "v3"
        service_account_file = "/Users/saisandeep/GitRepo/serivce_files/youtube-data-analysis-service.json"  # Replace with the path to your service account file

        credentials = service_account.Credentials.from_service_account_file(
            service_account_file,
            scopes=["https://www.googleapis.com/auth/youtube.readonly"]
        )

        # Only `build` is imported (`from googleapiclient.discovery import build`),
        # so the bare package name `googleapiclient` is not bound in this notebook.
        youtube = build(api_service_name, api_version, credentials=credentials)

    # Make the API request
    request = youtube.videos().list(
        part="snippet,contentDetails,statistics,status,player,topicDetails",
        id=video_id
    )
    response = request.execute()

    columns = [
        'id', 'title', 'description', 'publishedAt', 'channelId',
        'channelTitle', 'viewCount', 'likeCount', 'duration',
        'privacyStatus', 'tags',
    ]
    data = []
    for item in response.get('items', []):
        snippet = item['snippet']
        # Individual counters can be missing from `statistics` (presumably
        # when the uploader hides them), so read them leniently.
        statistics = item.get('statistics', {})
        data.append({
            'id': item['id'],
            'title': snippet['title'],
            'description': snippet['description'],
            'publishedAt': snippet['publishedAt'],
            'channelId': snippet['channelId'],
            'channelTitle': snippet['channelTitle'],
            'viewCount': statistics.get('viewCount'),
            'likeCount': statistics.get('likeCount'),
            'duration': item['contentDetails']['duration'],
            'privacyStatus': item['status']['privacyStatus'],
            'tags': snippet.get('tags', []),
        })

    # Explicit columns keep the schema stable even for an empty response.
    return pd.DataFrame(data, columns=columns)


In [30]:
def convert_duration_to_minutes(duration):
    """Convert an ISO 8601 duration string (e.g. ``'PT1H14M52S'``) to minutes.

    Supports the day, hour, minute and second designators
    (``P#DT#H#M#S``); every component is optional.

    Parameters
    ----------
    duration : str
        ISO 8601 duration. Non-string values (``None``, ``NaN`` from a
        DataFrame column with missing metadata) yield ``NaN`` so the function
        can be applied to a whole column.

    Returns
    -------
    float
        Total length in minutes.

    Raises
    ------
    ValueError
        If ``duration`` is a string that is not a recognised duration.
    """
    if not isinstance(duration, str):
        return float('nan')

    match = re.fullmatch(
        r'P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?',
        duration.strip(),
    )
    if match is None:
        raise ValueError(f"Unrecognised ISO 8601 duration: {duration!r}")

    # Missing components come back as None and count as zero.
    days, hours, minutes, seconds = (int(part or 0) for part in match.groups())
    return days * 24 * 60 + hours * 60 + minutes + seconds / 60

In [8]:
# Load environment variables from the project's .env file.
env_path = Path('/Users/saisandeep/GitRepo/musical-eureka/.env')
load_dotenv(env_path)
# Bind the key instead of evaluating and discarding it, so it is available to
# later cells and never echoed as cell output.
YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY')


def extract_video_id(url):
    """Return the value of the ``v=`` query parameter of a watch URL.

    Returns None when `url` is not a string (entries without a `titleUrl`)
    or carries no ``v=`` parameter.
    """
    if not isinstance(url, str):
        return None
    match = re.search(r'.*v=([^&]+)', url)
    return match.group(1) if match else None


# Load the JSON data from the file
with open('/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Data/Takeout/YouTube and YouTube Music/history/watch-history.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Flatten each history entry into one record; list-valued fields are joined
# into comma-separated strings.
extracted_data = [
    {
        'header': entry.get('header'),
        'title': entry.get('title'),
        'titleUrl': entry.get('titleUrl'),
        'subtitles': ', '.join(subtitle['name'] for subtitle in entry.get('subtitles', [])),
        'time': entry.get('time'),
        'products': ', '.join(entry.get('products', [])),
        'activityControls': ', '.join(entry.get('activityControls', [])),
    }
    for entry in data
]

# Create a DataFrame from the extracted data
df = pd.DataFrame(extracted_data)
df['video_id'] = df['titleUrl'].apply(extract_video_id)
# Convert the 'time' column to datetime with ISO8601 format
df['time'] = pd.to_datetime(df['time'], format='ISO8601')

# Extract the date from the 'time' column
df['watch_date'] = df['time'].dt.date

df = df.sort_values(by='watch_date', ascending=False)
df.to_csv('/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Data/watch_history.csv', index=False)

# Unique video ids, most recently watched first. Rows without an id are
# dropped so the downstream metadata fetch never receives a blank id.
df_video_ids = df['video_id'].dropna().drop_duplicates()
df_video_ids.to_csv('/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Data/video_ids.csv', index=False)


## Version 2

In [None]:
# Steps
# 1. Read the video_ids from the csv file
# 2. Create a set of video_ids from video_ids.csv and a set of the ids already present in video_metadata_new.csv
# 3. Compare the two sets and get the difference
# 4. From the difference get the video_ids
# 5. Try to get the data for each video_id in the difference set
# 6. If there is an error and it is due to a limit/quota breach, store the video_id and its index in a .txt file and stop
# 7. In case of no error, append the returned rows to the csv file

video_ids_path = '/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Data/video_ids.csv'
video_metadata_path = '/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Data/video_metadata_new.csv'
limit_breach_path = '/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Data/limit_breach.txt'
DAILY_REQUEST_LIMIT = 10000  # maximum number of ids attempted per run

# Step 1: Read the video_ids from the csv file (blank rows load as NaN and are skipped)
df_video_ids = pd.read_csv(video_ids_path, header=0)
df_video_ids_set = set(df_video_ids['video_id'].dropna())

# Step 2: Read the video_metadata from the csv file
df_video_metadata_new = pd.read_csv(video_metadata_path, header=0)
df_video_metadata_new_set = set(df_video_metadata_new['id'])

# Step 3: Compare the two sets and get the difference
df_video_ids_diff = df_video_ids_set - df_video_metadata_new_set

# Loop through each video_id in the difference set
for index, video_id in enumerate(df_video_ids_diff):
    if index >= DAILY_REQUEST_LIMIT:  # Daily limit
        break
    print(video_id, index)
    try:
        # Step 5: Try to get the data for each video_id
        response = get_videometa(video_id)
    except Exception as e:
        # Step 6: On a limit breach, record where we stopped and end the run;
        # every further request would fail the same way.
        # NOTE(review): the match on "quota" assumes the API's quota error
        # text contains that word — confirm against a real error message.
        message = str(e).lower()
        if 'limit' in message or 'quota' in message:
            with open(limit_breach_path, 'a') as file:
                file.write(f'{video_id}: {index}\n')
            break
        # Any other error: report it and continue with the next video_id
        print(e)
        continue
    # Step 7: Append the fetched rows (none when the API returned no items)
    response.to_csv(video_metadata_path, mode='a', index=False, header=False)


## Version 1

In [None]:
# # Steps
# # 1. Read the video_ids from the csv file
# # 2. Read the index from the .txt file; if it is missing or empty start from 0, otherwise resume from the last stored index
# # 3. Try to get the data for each video_id
# # 4. If there is an error and if it is due to limit breach store the video_id and its index in a .txt file
# # 5. In case of no error, store the data in df and append it to the csv file

# # Step 1: Read the video_ids from the csv file
# df_video_ids = pd.read_csv('/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Data/video_ids.csv',header=0)

# # Step 2: Read the index from the .txt file; if it is missing or empty start from 0, otherwise resume from the last stored index
# index_file = '/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Data/index.txt'
# if os.path.exists(index_file):
#     with open(index_file, 'r') as file:
#         last_index = int(file.read())
# else:
#     last_index = 1

# # Initialize an empty DataFrame to store the data
# df_data = pd.DataFrame()

# # Loop through each video_id starting from the last index, with a limit of 100 every 5 minutes and 10000 every day
# for index, video_id in df_video_ids[last_index:].iterrows():
#     if index % 1000 == 0 and index != 0:  # Daily limit
#         break
#     if index % 100 == 0 and index != 0:  # 5-minute limit
#         time.sleep(300)  # Sleep for 5 minutes
#     if index > 100:
#         print("Testing the limit")
#         break
#     try:
#         print(video_id.values[0],index)
#         # Step 3: Try to get the data for each video_id
#         response = get_videometa(video_id.values[0])
#         # Step 5: Incase of no error store the data in df and append it to the csv file
#         df_data = df_data._append(response, ignore_index=True)
#     except Exception as e:
#         # Step 4: If there is an error and if it is due to limit breach store the video_id and its index in a .txt file
#         if 'limit' in str(e):
#             with open('/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Data/limit_breach.txt', 'a') as file:
#                 file.write(f'{video_id.values[0]}: {index + last_index}\n')
#         else:
#             print(e)
#             # If there is any other error apart from the limit breach, continue with the next video_id
#             continue

# df_data.to_csv('/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Data/video_metadata.csv', mode='a', index=False, header=False)
# # Update the last index in the index.txt file
# with open(index_file, 'w') as file:
#     file.write(str(index + last_index))



In [26]:
# Load both CSVs first: the ids seen in the watch history and the metadata
# rows that have already been fetched.
df_video_ids = pd.read_csv(
    '/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Data/video_ids.csv',
    header=0,
)
df_video_metadata_new = pd.read_csv(
    '/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Data/video_metadata_new.csv',
    header=0,
)

df_video_ids_set = set(df_video_ids['video_id'])
df_video_metadata_new_set = set(df_video_metadata_new['id'])

# Ids present in the watch history that have no metadata row yet
df_video_ids_diff = df_video_ids_set - df_video_metadata_new_set

# Print every outstanding id together with its enumeration position
for index, video_id in enumerate(df_video_ids_diff):
    print(index, video_id)

0 vAul4RIbjFA
1 pY4rsat24B4
2 Kvo-V3RwVIM
3 R127a7HxlC4
4 Jg5b7PI3lLA
5 U0b9WsC9pSA
6 ESWHVtbBMlM
7 RxpEcssphQQ
8 JtNLFZH_vQY
9 0fXGpb3dQIo
10 mja48c3_MIw
11 N037sJfqAHo
12 2pIECaL1-aw
13 qVom0hewlTY
14 bKn81rRIKDU
15 lk4lraE0wxU
16 8hlrh7eOF8Y
17 o6VlZMDtvpE
18 zWQvYGK4fJE
19 BacqZxrxU5I
20 mINjbXfZXdM
21 jzi7Mhfhceg
22 oAQKBPsQPME
23 6NqrzK4dmP0
24 nkK2pQ7HoJg
25 Zl0WFjh77C4
26 FoWS2Z8hr1U
27 UOf6uHwB3Tk
28 8ZH8iKqT2qE
29 6GEoD4BKs6k
30 ZylW-6aOX-0
31 Fg7ks75O6Xs
32 jUrHXKeEOYw
33 EO8z7W2co6w
34 tZlXTR1DmyA
35 tgG8x00TI1k
36 99WiOcSt9bA
37 tM9qCOGSX6k
38 2gHO1-cy058
39 8VB2Zyv1FOQ
40 wF-IWQ0Ba4w
41 uR_Lf4r5Dfc
42 0Z5KlMmjeyk
43 fq5DCOdX4Bc
44 XTqZEb-0tU0
45 Pp0TwEKsn-s
46 gDMdtHWgP5w
47 MMMXDxaa1C0
48 deZX3c3ygCc
49 JRkOT_w--zA
50 ygCZ8NndNxs
51 TKlsidzM6P4
52 vOQfjBiWUsg
53 2la4zF5J5Z0
54 tE1Yx8Am-O8
55 Tgm9Y7aj0pU
56 dgibtDIOWds
57 czkGj5vJEFQ
58 Eot1dzEf3WE
59 riGfYBI7YKQ
60 TWgkaxEhdpA
61 y9exjxhsmus
62 WtLIJwObk2M
63 bQXOwROVilU
64 oacQGjbycK8
65 mk4lmz-ydnk
66 uzMgQrY7QOU
67 Jv

In [18]:
# Spot check: print the index label and id of the row at position `last_index`.
last_index = 10000
df_video_ids = pd.read_csv(
    '/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Data/video_ids.csv',
    header=0,
)
# A one-row positional slice replaces "iterate from last_index and break".
for index, video_id in df_video_ids.iloc[last_index:last_index + 1].iterrows():
    print(index, video_id.values[0])

10000 wIo7IJxsg3A


In [None]:
response={'kind': 'youtube#videoListResponse', 'etag': 'ummYaeLJcKqcGF5M7jE8YodCz8Q', 'items': [{'kind': 'youtube#video', 'etag': 'BtsscKPdEXY5P3erTmYoNM3Jwcg', 'id': 'JKlYOUfviXM', 'snippet': {'publishedAt': '2018-07-26T02:50:43Z', 'channelId': 'UCA3Zs9A2IeetfGT3Omi9ivw', 'title': 'Take Me To Church', 'description': 'Provided to YouTube by Universal Music Group\n\nTake Me To Church · Hozier\n\nHozier\n\n℗ 2014 Rubyworks Limited, under assignment to Island Records, a division of Universal Music Operations Limited\n\nReleased on: 2014-09-19\n\nProducer, Co- Producer, Associated  Performer, Vocals: Hozier\nProducer, Co- Producer: Rob Kirwan\nAssociated  Performer, Drums: FIACHRA KINDER\nStudio  Personnel, Mixer: Rob Kirwan\nComposer  Lyricist: Andrew Hozier-Byrne\n\nAuto-generated by YouTube.', 'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/JKlYOUfviXM/default.jpg', 'width': 120, 'height': 90}, 'medium': {'url': 'https://i.ytimg.com/vi/JKlYOUfviXM/mqdefault.jpg', 'width': 320, 'height': 180}, 'high': {'url': 'https://i.ytimg.com/vi/JKlYOUfviXM/hqdefault.jpg', 'width': 480, 'height': 360}, 'standard': {'url': 'https://i.ytimg.com/vi/JKlYOUfviXM/sddefault.jpg', 'width': 640, 'height': 480}, 'maxres': {'url': 'https://i.ytimg.com/vi/JKlYOUfviXM/maxresdefault.jpg', 'width': 1280, 'height': 720}}, 'channelTitle': 'Hozier - Topic', 'tags': ['Hozier', 'ホージア', 'Take Me To Church'], 'categoryId': '10', 'liveBroadcastContent': 'none', 'localized': {'title': 'Take Me To Church', 'description': 'Provided to YouTube by Universal Music Group\n\nTake Me To Church · Hozier\n\nHozier\n\n℗ 2014 Rubyworks Limited, under assignment to Island Records, a division of Universal Music Operations Limited\n\nReleased on: 2014-09-19\n\nProducer, Co- Producer, Associated  Performer, Vocals: Hozier\nProducer, Co- Producer: Rob Kirwan\nAssociated  Performer, Drums: FIACHRA KINDER\nStudio  Personnel, Mixer: Rob Kirwan\nComposer  Lyricist: Andrew Hozier-Byrne\n\nAuto-generated by 
YouTube.'}}, 'contentDetails': {'duration': 'PT4M2S', 'dimension': '2d', 'definition': 'hd', 'caption': 'false', 'licensedContent': True, 'regionRestriction': {'allowed': ['AE', 'AR', 'AT', 'AW', 'AZ', 'BA', 'BD', 'BE', 'BG', 'BH', 'BM', 'BO', 'BR', 'BY', 'CH', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DE', 'DK', 'DO', 'DZ', 'EC', 'EE', 'EG', 'ES', 'FI', 'FR', 'GB', 'GH', 'GR', 'GT', 'HK', 'HN', 'HR', 'HU', 'ID', 'IE', 'IL', 'IN', 'IQ', 'IS', 'IT', 'JM', 'JO', 'JP', 'KE', 'KH', 'KR', 'KW', 'KY', 'KZ', 'LA', 'LB', 'LI', 'LK', 'LT', 'LU', 'LV', 'LY', 'MA', 'MK', 'MQ', 'MT', 'MX', 'MY', 'NC', 'NG', 'NI', 'NL', 'NO', 'NP', 'NZ', 'OM', 'PA', 'PE', 'PG', 'PH', 'PK', 'PL', 'PT', 'PY', 'QA', 'RE', 'RO', 'RS', 'RU', 'SA', 'SE', 'SG', 'SI', 'SK', 'SN', 'SV', 'TC', 'TH', 'TN', 'TR', 'TW', 'TZ', 'UA', 'UG', 'UY', 'VE', 'VN', 'YE', 'YT', 'ZA', 'ZW']}, 'contentRating': {}, 'projection': 'rectangular'}, 'status': {'uploadStatus': 'processed', 'privacyStatus': 'public', 'license': 'youtube', 'embeddable': True, 'publicStatsViewable': True, 'madeForKids': False}, 'statistics': {'viewCount': '153991671', 'likeCount': '1305756', 'favoriteCount': '0', 'commentCount': '944'}, 'player': {'embedHtml': '<iframe width="480" height="360" src="//www.youtube.com/embed/JKlYOUfviXM" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>'}, 'topicDetails': {'topicCategories': ['https://en.wikipedia.org/wiki/Independent_music', 'https://en.wikipedia.org/wiki/Music', 'https://en.wikipedia.org/wiki/Pop_music']}}], 'pageInfo': {'totalResults': 1, 'resultsPerPage': 1}}

def get_videometa(video_id, youtube=None):
    """Fetch metadata for a YouTube video and return it as a DataFrame.

    NOTE: this redefinition shadows the `get_videometa` defined earlier in
    the notebook; the two differ only in the service-account file they use.

    Parameters
    ----------
    video_id : str
        Value passed as the ``id`` argument of ``videos().list``.
    youtube : optional
        An already-built YouTube Data API client. When omitted, a client is
        built from the service-account file configured below. Passing one in
        lets a caller that loops over many ids reuse a single client instead
        of rebuilding it on every call.

    Returns
    -------
    pandas.DataFrame
        One row per item in the API response with the columns ``id``,
        ``title``, ``description``, ``publishedAt``, ``channelId``,
        ``channelTitle``, ``viewCount``, ``likeCount``, ``duration``,
        ``privacyStatus`` and ``tags``. The frame has zero rows (but the same
        columns) when the response contains no items.
    """
    if youtube is None:
        api_service_name = "youtube"
        api_version = "v3"
        service_account_file = "/Users/saisandeep/GitRepo/serivce_files/youtube-data-analysis-437204-60b449358b29.json"  # Replace with the path to your service account file

        credentials = service_account.Credentials.from_service_account_file(
            service_account_file,
            scopes=["https://www.googleapis.com/auth/youtube.readonly"]
        )

        # Only `build` is imported (`from googleapiclient.discovery import build`),
        # so the bare package name `googleapiclient` is not bound in this notebook.
        youtube = build(api_service_name, api_version, credentials=credentials)

    # Make the API request
    request = youtube.videos().list(
        part="snippet,contentDetails,statistics,status,player,topicDetails",
        id=video_id
    )
    response = request.execute()

    columns = [
        'id', 'title', 'description', 'publishedAt', 'channelId',
        'channelTitle', 'viewCount', 'likeCount', 'duration',
        'privacyStatus', 'tags',
    ]
    data = []
    for item in response.get('items', []):
        snippet = item['snippet']
        # Individual counters can be missing from `statistics` (presumably
        # when the uploader hides them), so read them leniently.
        statistics = item.get('statistics', {})
        data.append({
            'id': item['id'],
            'title': snippet['title'],
            'description': snippet['description'],
            'publishedAt': snippet['publishedAt'],
            'channelId': snippet['channelId'],
            'channelTitle': snippet['channelTitle'],
            'viewCount': statistics.get('viewCount'),
            'likeCount': statistics.get('likeCount'),
            'duration': item['contentDetails']['duration'],
            'privacyStatus': item['status']['privacyStatus'],
            'tags': snippet.get('tags', []),
        })

    # Explicit columns keep the schema stable even for an empty response.
    return pd.DataFrame(data, columns=columns)


In [5]:
# Display whatever frame is currently bound to `df`.
# NOTE(review): the read_csv below is commented out, so the frame shown depends
# on what an earlier cell left in `df` — presumably the contents of
# video_metadata.csv; confirm. Prefer `df.head()` over dumping the whole frame.
# df = pd.read_csv('/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Data/video_metadata.csv',header=0)

df

Unnamed: 0,JKlYOUfviXM,Take Me To Church,"Provided to YouTube by Universal Music Group\n\nTake Me To Church · Hozier\n\nHozier\n\n℗ 2014 Rubyworks Limited, under assignment to Island Records, a division of Universal Music Operations Limited\n\nReleased on: 2014-09-19\n\nProducer, Co- Producer, Associated Performer, Vocals: Hozier\nProducer, Co- Producer: Rob Kirwan\nAssociated Performer, Drums: FIACHRA KINDER\nStudio Personnel, Mixer: Rob Kirwan\nComposer Lyricist: Andrew Hozier-Byrne\n\nAuto-generated by YouTube.",2018-07-26T02:50:43Z,UCA3Zs9A2IeetfGT3Omi9ivw,Hozier - Topic,154386360,1307916,PT4M2S,public,"['Hozier', 'ホージア', 'Take Me To Church']"
0,wgVNUAm5WOc,Memes I Found On The internet🤣 part 50 #shorts...,,2023-12-04T14:30:29Z,UCwZahaEiDZX3uHBF-a6-n9Q,Wtf Bro!! Clips,6741236,429123,PT1M,public,[]
1,K1FlAphL2p8,Stressed Out,Provided to YouTube by Fueled By Ramen\n\nStre...,2017-02-09T12:10:05Z,UCnX0L9QiftAcWdzeBx31xCw,Twenty One Pilots - Topic,450802011,4263799,PT3M23S,public,"['twenty one pilots', 'Twenty One Pilots', 'Bl..."
2,2X7wem572ws,"A chaotic scene unfolded as 25,000 job-seekers...",,2024-07-17T06:05:56Z,UCCgQaXjsJ6EdFPZeTFQoSyw,Brut India,72145,1556,PT52S,public,[]
3,gbSRMwqPBuI,The REALITY of Living in BANGALORE 🏙️ | Ishan ...,The REALITY of Living in BANGALORE 🏙️ | Ishan ...,2024-04-03T10:19:06Z,UCY6N8zZhs2V7gNTUxPuKWoQ,Ishan Sharma,222869,14104,PT42S,public,"['ishan sharma', 'The REALITY of Living in BAN..."
4,Tl8sKU7w_g8,Which GOLD can make you the RICHEST? | Ankur W...,----------------------------------------------...,2024-03-15T12:36:32Z,UCRzYN32xtBf3Yxsx5BvJWJw,warikoo,792592,49395,PT36S,public,"['warikoo', 'ankur warikoo', 'Ankur Warikoo mo..."
...,...,...,...,...,...,...,...,...,...,...,...
9913,VwsTZO_Tcv4,Tera Baap Aaya,Provided to YouTube by Zee Entertainment Enter...,2019-11-05T05:36:30Z,UCz9QXI60wdDuZ1ylt3CtKSg,Farhad Bhiwandiwala - Topic,23937449,176506,PT2M45S,public,"['Farhad Bhiwandiwala', 'Commando 3', 'Tera Ba..."
9914,GI2oC-iJSJU,Rajarshi Nandy Explains Kamakhya Temple’s Unkn...,Check out the Bhairav Series by Rajarshi Nandy...,2024-04-23T16:18:00Z,UCPxMZIFE856tbTfdkdjzTSQ,BeerBiceps,1187113,33235,PT1H14M52S,public,"['podcast', 'indian podcast', 'indian podcasts..."
9915,ErDDPIDMh6Y,Fantasy trading: do they really make money? | ...,Let's talk about fantasy trading (a.k.a. paper...,2024-04-23T12:00:07Z,UCUUlw3anBIkbW9W44Y-eURw,Zero1 by Zerodha,37070,1153,PT12M17S,public,"['paper trading', 'fantasy trading apps', 'bes..."
9916,sHZ9nXbzEtQ,Comedians Roast Tech Bro Who Raps in Mandarin,GET TOUR TICKETS: https://sociallyinept.io\n\n...,2024-01-12T22:04:17Z,UCFe2Kq8Hg15UomoVYdmRg_Q,Socially Inept,36413,625,PT2M54S,public,[]


## HTML File

In [None]:
def parse_watch_history(html_file):
    """Parse a Takeout ``watch-history.html`` export into a DataFrame.

    Parameters
    ----------
    html_file : str
        Path to the HTML export.

    Returns
    -------
    pandas.DataFrame
        One row per history entry with the columns ``title`` (link text),
        ``url`` (link target) and ``time`` (parsed timestamp). A field is
        ``None`` when the entry has no matching watch link or no parsable
        timestamp.

    Raises
    ------
    FileNotFoundError
        If ``html_file`` does not exist.
    """
    if not os.path.exists(html_file):
        raise FileNotFoundError(f"Cannot find {html_file}")

    # Imported here, after the cheap existence check, because the notebook's
    # import cell binds neither name.
    from datetime import datetime
    from bs4 import BeautifulSoup

    with open(html_file, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'lxml')

    # Find all watch history entries
    entries = soup.find_all('div', class_='content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1')

    # Watch links on either youtube.com or music.youtube.com
    watch_link = re.compile(r'https://(?:www|music)\.youtube\.com/watch\?v=.+')

    data = []
    for entry in entries:
        # Extract the link and title from the entry itself
        link = entry.find('a', href=watch_link)
        title = link.get_text(strip=True) if link else None
        url = link['href'] if link else None

        # Extract the timestamp
        # NOTE(review): assumes the timestamp sits in the entry's first <span>
        # and looks like 'Mon, 01 Jan 2023 12:34:56 GMT' — confirm against a
        # real export; unparsable values become None.
        timestamp = None
        timestamp_tag = entry.find('span')
        if timestamp_tag:
            timestamp_str = timestamp_tag.get_text(strip=True)
            try:
                timestamp = datetime.strptime(timestamp_str, '%a, %d %b %Y %H:%M:%S GMT')
            except ValueError:
                timestamp = None

        data.append({
            'title': title,
            'url': url,
            'time': timestamp
        })

    # Explicit columns keep the schema stable when no entries are found.
    return pd.DataFrame(data, columns=['title', 'url', 'time'])

# Path to your watch-history.html file
# NOTE(review): this path has no `Data/` segment, while the JSON history is
# read from `.../YoutubeDataAnalysis/Data/Takeout/...` — confirm which
# location is correct.
WATCH_HISTORY_FILE = '/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Takeout/YouTube and YouTube Music/history/watch-history.html'
# Parse the export; the resulting DataFrame is kept in `temp` for inspection.
temp=parse_watch_history(WATCH_HISTORY_FILE)

In [None]:
# html_file = '/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Takeout/YouTube and YouTube Music/history/tidy-watch-history.html'
# with open(html_file, 'r', encoding='utf-8') as file:
#a
#      soup = BeautifulSoup(file, 'lxml')

# # Find all watch history entries
# entries = soup.find_all('div', class_='content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1')

# df = pd.DataFrame()
# data_dict = {
#     'status': [],
#     'link': [],
#     'watched_ts': []
# }
# for entry in entries:  # Limit to first 10 entries
#     entry_text = entry.text.strip().strip('\n')
#     try:
#         status, watched_info = entry_text.split('\n')[0], entry_text.split('\n')[2]
#         # print(status,watched_info)
#         temp = status.replace('\xa0', ' ').split(' ')[0]
#         data_dict['status'].append(temp)
#         link = re.search(r'(https?://[^\s]+)', status).group(0)
#         data_dict['link'].append(link)
#         watched_ts = datetime.strptime(watched_info.strip(), '%d %b %Y, %H:%M:%S %Z')
#         data_dict['watched_ts'].append(watched_ts)
#     except ValueError as e:
#         print(f"Error processing entry: {e}")
#     except Exception as e:
#         print(f"Unexpected error: {e}")      
#         print(entry_text)  

# df = pd.DataFrame(data_dict)
# pattern = r'.*v=([^&]+)'
# # df['video_id'] = [re.search(pattern, i).group(1) for i in df.link]
# df['video_id'] = df['link'].apply(lambda x: re.search(r'.*v=([^&]+)', x).group(1) if re.search(r'.*v=([^&]+)', x) else None)

#    # print('a',entries[j].a['href'])
#     # "https://music.youtube.com/watch?v=JKlYOUfviXM"  
#     # pattern = r'.*v=([^&]+)'
#     # video_id = re.search(pattern, entries[j].a['href'])
#     # video_id = video_id.group(1)    
#     # if j >10:
#     #     break

In [None]:
# Display the frame currently bound to `df`.
# NOTE(review): no cell in this section assigns `df` (the code above is
# commented out), so the output depends on earlier kernel state — confirm.
df

# Analysis

In [76]:
import ast
import numpy as np
import pandas as pd
import streamlit as st

In [60]:
# Inputs: fetched video metadata and the flattened watch history
df_video_metadata_new = pd.read_csv(
    '/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Data/video_metadata_new.csv',
    header=0,
)
df_watch_history = pd.read_csv(
    '/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Data/watch_history.csv',
    header=0,
)

# Attach metadata to every watch event (left join keeps events without
# metadata), then discard events lacking a video id and repeated
# (time, video_id) pairs.
df_final = (
    df_watch_history
    .merge(
        df_video_metadata_new,
        left_on='video_id',
        right_on='id',
        how='left',
        suffixes=('_watch_history', '_video_metadata'),
    )
    .dropna(subset=['video_id'])
    .drop_duplicates(subset=['time', 'video_id'])
)



In [61]:
# Preview the first rows of the merged watch-history / metadata frame.
df_final.head()

Unnamed: 0,header,title_watch_history,titleUrl,subtitles,time,products,activityControls,video_id,watch_date,id,title_video_metadata,description,publishedAt,channelId,channelTitle,viewCount,likeCount,duration,privacyStatus,tags
0,YouTube Music,Watched Take Me To Church,https://music.youtube.com/watch?v=JKlYOUfviXM,Hozier - Topic,2024-07-20 06:44:15.248000+00:00,YouTube,YouTube watch history,JKlYOUfviXM,2024-07-20,JKlYOUfviXM,Take Me To Church,Provided to YouTube by Universal Music Group\n...,2018-07-26T02:50:43Z,UCA3Zs9A2IeetfGT3Omi9ivw,Hozier - Topic,154386360.0,1307916.0,PT4M2S,public,"['Hozier', 'ホージア', 'Take Me To Church']"
2,YouTube,Watched Memes I Found On The internet🤣 part 50...,https://www.youtube.com/watch?v=wgVNUAm5WOc,Wtf Bro!! Clips,2024-07-20 05:52:12.901000+00:00,YouTube,YouTube watch history,wgVNUAm5WOc,2024-07-20,wgVNUAm5WOc,Memes I Found On The internet🤣 part 50 #shorts...,,2023-12-04T14:30:29Z,UCwZahaEiDZX3uHBF-a6-n9Q,Wtf Bro!! Clips,6741236.0,429123.0,PT1M,public,[]
4,YouTube Music,Watched Stressed Out,https://music.youtube.com/watch?v=K1FlAphL2p8,Twenty One Pilots - Topic,2024-07-20 06:40:52.742000+00:00,YouTube,YouTube watch history,K1FlAphL2p8,2024-07-20,K1FlAphL2p8,Stressed Out,Provided to YouTube by Fueled By Ramen\n\nStre...,2017-02-09T12:10:05Z,UCnX0L9QiftAcWdzeBx31xCw,Twenty One Pilots - Topic,450802011.0,4263799.0,PT3M23S,public,"['twenty one pilots', 'Twenty One Pilots', 'Bl..."
6,YouTube,"Watched A chaotic scene unfolded as 25,000 job...",https://www.youtube.com/watch?v=2X7wem572ws,Brut India,2024-07-20 05:30:35.507000+00:00,YouTube,YouTube watch history,2X7wem572ws,2024-07-20,2X7wem572ws,"A chaotic scene unfolded as 25,000 job-seekers...",,2024-07-17T06:05:56Z,UCCgQaXjsJ6EdFPZeTFQoSyw,Brut India,72145.0,1556.0,PT52S,public,[]
8,YouTube,Watched The REALITY of Living in BANGALORE 🏙️ ...,https://www.youtube.com/watch?v=gbSRMwqPBuI,Ishan Sharma,2024-07-20 05:31:29.099000+00:00,YouTube,YouTube watch history,gbSRMwqPBuI,2024-07-20,gbSRMwqPBuI,The REALITY of Living in BANGALORE 🏙️ | Ishan ...,The REALITY of Living in BANGALORE 🏙️ | Ishan ...,2024-04-03T10:19:06Z,UCY6N8zZhs2V7gNTUxPuKWoQ,Ishan Sharma,222869.0,14104.0,PT42S,public,"['ishan sharma', 'The REALITY of Living in BAN..."


Object `st.streamlit.sidebar.input_text` not found.


In [78]:
# Keep only rows that have a non-empty tag list. `tags` is stored in the CSV
# as the string form of a Python list, e.g. "['a', 'b']" or "[]".
temp = df_final[df_final['tags'].notna()]
temp = temp[temp['tags'].str.strip() != '[]']

# Parse the string back into a real list and emit one row per tag.
# literal_eval is applied to the untouched string: stripping the brackets
# first would corrupt any tag that itself contains '[' or ']'.
temp = temp.assign(tags=temp['tags'].apply(ast.literal_eval)).explode('tags')

# Number of distinct tagged videos watched per day, busiest days first.
# The date column is `watch_date`; the frame has no `date` column.
temp_agg = (
    temp.groupby('watch_date')
    .agg(num_of_videos=('video_id', 'nunique'))
    .reset_index()
    .sort_values(by='num_of_videos', ascending=False)
)


In [83]:
# Write the exploded per-tag frame `temp` to disk.
# NOTE(review): the file is named temp_agg.csv but the frame written is `temp`,
# not `temp_agg` — confirm which one was intended.
temp.to_csv('/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Data/temp_agg.csv',index=False)

# df_final.to_csv('/Users/saisandeep/GitRepo/musical-eureka/YoutubeDataAnalysis/Data/df_final.csv',index=False)

In [81]:
# List the column names of `temp`.
temp.columns

Index(['header', 'title_watch_history', 'titleUrl', 'subtitles', 'time',
       'products', 'activityControls', 'video_id', 'watch_date', 'id',
       'title_video_metadata', 'description', 'publishedAt', 'channelId',
       'channelTitle', 'viewCount', 'likeCount', 'duration', 'privacyStatus',
       'tags'],
      dtype='object')