<a href="https://colab.research.google.com/github/shubhamk2001/YouTube-thumbnail-Scrapper/blob/main/Youtube_thumbnail_Scrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install yt-dlp

Collecting yt-dlp
  Downloading yt_dlp-2025.6.9-py3-none-any.whl.metadata (174 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/174.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m174.1/174.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.3/174.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading yt_dlp-2025.6.9-py3-none-any.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt-dlp
Successfully installed yt-dlp-2025.6.9


In [33]:
import yt_dlp
import pandas as pd
import re
from tqdm import tqdm

def clean_filename(text, max_length=80):
    """Return ``text`` reduced to something safe to use as a file name.

    The characters backslash, slash, ``*``, ``?``, ``:``, double quote,
    ``<``, ``>`` and ``|`` are removed, surrounding whitespace is stripped
    and the result is cut to at most ``max_length`` characters.

    Args:
        text: Value to sanitise. ``None`` and NaN are treated as an empty
            string; any other non-string value is converted with ``str()``.
        max_length: Maximum number of characters kept in the result.

    Returns:
        The sanitised string (possibly empty).
    """
    # Missing metadata shows up as None or as a float NaN once it has been
    # through pandas; NaN is the only value that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Remove illegal characters and truncate
    text = re.sub(r'[\\/*?:"<>|]', "", str(text))
    # Strip once more after truncating: a cut that lands right after a space
    # would otherwise leave trailing whitespace in the file name.
    return text.strip()[:max_length].rstrip()

def search_and_extract_filtered_videos(keyword, max_results=50, min_views=50000, skip_shorts=True):
    """Search YouTube for ``keyword`` and return metadata of the matching videos.

    The search itself is done with a flat (ids only) yt-dlp query; every hit
    is then fetched individually to obtain its full metadata, which is why
    the call takes a few seconds per result.

    Args:
        keyword: Search phrase passed to yt-dlp's ``ytsearch`` pseudo-URL.
        max_results: Number of search hits requested.
        min_views: Videos with fewer views than this are dropped. A video
            without a view count is treated as having 0 views.
        skip_shorts: When true, videos of 60 seconds or less are dropped.
            A video without a duration is treated as 0 seconds long.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Title``, ``Channel``,
        ``Views``, ``Duration``, ``Upload Date``, ``Thumbnail`` and ``URL``,
        sorted by ``Views`` in descending order. The frame has these columns
        even when no video passes the filters.
    """
    columns = ['Title', 'Channel', 'Views', 'Duration', 'Upload Date', 'Thumbnail', 'URL']
    search_query = f"ytsearch{max_results}:{keyword}"

    ydl_opts_search = {
        'quiet': True,
        'skip_download': True,
        'extract_flat': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts_search) as ydl:
        search_info = ydl.extract_info(search_query, download=False) or {}
    # A failed or empty search may carry no 'entries' at all.
    search_results = search_info.get('entries') or []

    # Skip placeholder entries (None) and hits that carry no video id.
    video_urls = [
        f"https://www.youtube.com/watch?v={vid['id']}"
        for vid in search_results
        if vid and vid.get('id')
    ]

    videos_data = []
    ydl_opts_detail = {
        'quiet': True,
        'skip_download': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts_detail) as ydl:
        for url in tqdm(video_urls, desc="Fetching full metadata"):
            try:
                info = ydl.extract_info(url, download=False)
                if not info:
                    continue

                # Skip Shorts (typically < 60s).
                # `or 0` rather than a .get() default: the key can be present
                # with a None value, and None cannot be compared with an int.
                duration = info.get('duration') or 0
                if skip_shorts and duration <= 60:
                    continue

                views = info.get('view_count') or 0
                if views < min_views:
                    continue

                video_info = {
                    'Title': info.get('title'),
                    'Channel': info.get('uploader'),
                    'Views': views,
                    'Duration': info.get('duration_string'),
                    'Upload Date': info.get('upload_date'),
                    'Thumbnail': info.get('thumbnail'),
                    'URL': f"https://www.youtube.com/watch?v={info.get('id')}"
                }
                videos_data.append(video_info)

            except Exception as e:
                # Best effort: one unavailable video must not abort the run.
                print(f"Error extracting {url}: {e}")
                continue

    # Passing the column list keeps 'Views' present when nothing matched, so
    # the sort below does not raise KeyError on an empty result.
    df = pd.DataFrame(videos_data, columns=columns)
    df = df.sort_values(by='Views', ascending=False).reset_index(drop=True)
    return df


In [35]:
# Example usage: search for a keyword and save the filtered results as CSV.
if __name__ == "__main__":
    niche = input("Enter niche keyword: ").strip()
    if not niche:
        raise ValueError("Niche keyword must not be empty.")

    df = search_and_extract_filtered_videos(niche, max_results=30)

    # The keyword is free-form user input; remove characters that are illegal
    # in file names (e.g. '/' or ':') before building the CSV path from it.
    csv_name = f"{clean_filename(niche).replace(' ', '_')}_videos.csv"
    df.to_csv(csv_name, index=False)

Enter niche keyword: blockchain


Fetching full metadata: 100%|██████████| 30/30 [01:44<00:00,  3.49s/it]


In [36]:
# Preview the five most-viewed results (the frame is sorted by Views, descending).
df.head(n=5)

Unnamed: 0,Title,Channel,Views,Duration,Upload Date,Thumbnail,URL
0,But how does bitcoin actually work?,3Blue1Brown,17072084,25:15,20170707,https://i.ytimg.com/vi_webp/bBC-nXj3Ng4/maxres...,https://www.youtube.com/watch?v=bBC-nXj3Ng4
1,How does a blockchain work - Simply Explained,Simply Explained,10203325,5:59,20171113,https://i.ytimg.com/vi_webp/SSo_EIwHSd4/maxres...,https://www.youtube.com/watch?v=SSo_EIwHSd4
2,Money Man - Blockchain (Official Video),Money Man,10024739,2:19,20211105,https://i.ytimg.com/vi/KQ7rn3oi-Pc/maxresdefau...,https://www.youtube.com/watch?v=KQ7rn3oi-Pc
3,1. Introduction for 15.S12 Blockchain and Mone...,MIT OpenCourseWare,7016951,1:02:03,20200123,https://i.ytimg.com/vi_webp/EH6vE97qIP4/maxres...,https://www.youtube.com/watch?v=EH6vE97qIP4
4,Blockchain Expert Explains One Concept in 5 Le...,WIRED,5695864,17:50,20171128,https://i.ytimg.com/vi/hYip_Vuv8J0/maxresdefau...,https://www.youtube.com/watch?v=hYip_Vuv8J0


In [40]:
import os
import requests

def download_thumbnails(df, folder="blockchain_thumbnails", max_images=20):
    """Download the thumbnails of the first ``max_images`` rows of ``df``.

    Each image is stored in ``folder`` as ``"<Channel> - <Title><ext>"``,
    where both name parts are sanitised with ``clean_filename`` and ``ext``
    is taken from the thumbnail URL. Rows without a usable thumbnail URL are
    skipped; a failed download is reported on stdout and does not stop the
    remaining downloads.

    Args:
        df: DataFrame with ``Thumbnail``, ``Title`` and ``Channel`` columns.
        folder: Target directory; created if it does not exist.
        max_images: Number of leading rows of ``df`` to process.

    Returns:
        None.
    """
    # Imported locally so the function does not depend on another cell's imports.
    from urllib.parse import urlparse

    known_extensions = {".jpg", ".jpeg", ".png", ".webp"}

    def as_text(value):
        # None / NaN (missing metadata) become an empty string.
        if value is None or value != value:
            return ""
        return str(value)

    os.makedirs(folder, exist_ok=True)

    subset = df.head(max_images)
    for idx, row in tqdm(subset.iterrows(), total=len(subset), desc="Downloading thumbnails"):
        url = row.get("Thumbnail")
        # NaN is truthy, so a plain `if not url` would let it through.
        if not isinstance(url, str) or not url:
            continue

        title = clean_filename(as_text(row.get('Title')))
        channel = clean_filename(as_text(row.get('Channel')))

        # Thumbnail URLs are frequently WebP (".../vi_webp/<id>/maxresdefault.webp");
        # keep the real extension instead of labelling everything ".jpg".
        ext = os.path.splitext(urlparse(url).path)[1].lower()
        if ext not in known_extensions:
            ext = ".jpg"

        filename = f"{channel} - {title}{ext}"
        filepath = os.path.join(folder, filename)

        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            with open(filepath, 'wb') as f:
                f.write(response.content)
        except Exception as e:
            # Best effort: report and carry on with the next thumbnail.
            print(f"Error downloading {url}: {e}")


In [38]:
# Display the filtered, view-sorted results that the thumbnail download below works from.
df

Unnamed: 0,Title,Channel,Views,Duration,Upload Date,Thumbnail,URL
0,But how does bitcoin actually work?,3Blue1Brown,17072084,25:15,20170707,https://i.ytimg.com/vi_webp/bBC-nXj3Ng4/maxres...,https://www.youtube.com/watch?v=bBC-nXj3Ng4
1,How does a blockchain work - Simply Explained,Simply Explained,10203325,5:59,20171113,https://i.ytimg.com/vi_webp/SSo_EIwHSd4/maxres...,https://www.youtube.com/watch?v=SSo_EIwHSd4
2,Money Man - Blockchain (Official Video),Money Man,10024739,2:19,20211105,https://i.ytimg.com/vi/KQ7rn3oi-Pc/maxresdefau...,https://www.youtube.com/watch?v=KQ7rn3oi-Pc
3,1. Introduction for 15.S12 Blockchain and Mone...,MIT OpenCourseWare,7016951,1:02:03,20200123,https://i.ytimg.com/vi_webp/EH6vE97qIP4/maxres...,https://www.youtube.com/watch?v=EH6vE97qIP4
4,Blockchain Expert Explains One Concept in 5 Le...,WIRED,5695864,17:50,20171128,https://i.ytimg.com/vi/hYip_Vuv8J0/maxresdefau...,https://www.youtube.com/watch?v=hYip_Vuv8J0
5,Watch Crypto expert explain the Blockchain to ...,CNET Highlights,5109262,5:56,20211208,https://i.ytimg.com/vi_webp/pSTNhBlfV_s/maxres...,https://www.youtube.com/watch?v=pSTNhBlfV_s
6,How the blockchain is changing money and busin...,TED,5042868,18:50,20160916,https://i.ytimg.com/vi_webp/Pl8OlkkwRpc/maxres...,https://www.youtube.com/watch?v=Pl8OlkkwRpc
7,Blockchain Technology Explained (2 Hour Course),Coding Tech,4187198,1:54:53,20180207,https://i.ytimg.com/vi_webp/qOVAbKKSH10/maxres...,https://www.youtube.com/watch?v=qOVAbKKSH10
8,Blockchain In 7 Minutes | What Is Blockchain |...,Simplilearn,3502164,7:03,20190227,https://i.ytimg.com/vi_webp/yubzJw0uiE4/maxres...,https://www.youtube.com/watch?v=yubzJw0uiE4
9,What is Blockchain,zlotolow,3429736,13:58,20160609,https://i.ytimg.com/vi/93E_GzvpMA0/maxresdefau...,https://www.youtube.com/watch?v=93E_GzvpMA0


In [41]:
# Fetch the thumbnails of the top rows; the arguments spell out the function's defaults.
download_thumbnails(df, folder="blockchain_thumbnails", max_images=20)

Downloading thumbnails: 100%|██████████| 20/20 [00:01<00:00, 19.42it/s]


In [44]:
import shutil

# Pack the thumbnail folder into blockthumbnails_zip.zip.
# Change root_dir if you downloaded into a folder other than 'blockchain_thumbnails'.
shutil.make_archive(
    base_name='blockthumbnails_zip',
    format='zip',
    root_dir='blockchain_thumbnails',
)


'/content/blockthumbnails_zip.zip'

In [45]:
from google.colab import files

# This will download the zip archive built in the previous cell to your local machine.
# `files` comes from google.colab, so this cell only works when run inside Colab.
files.download('blockthumbnails_zip.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>