# Web Scraping the Hugging Face Models Page (Trending)

### Importing Libraries

In [13]:
from decimal import Decimal, InvalidOperation

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Web scraping using Beautiful Soup

In [14]:
# Parallel result lists: index i of each list describes the same model card.
model_names = []
descriptions = []
downloads = []
likes = []

# Upper bound (seconds) on each HTTP request so a stalled connection cannot
# hang the notebook indefinitely.
REQUEST_TIMEOUT_SECONDS = 30


def _text_after(article, svg_class):
    """Return the stripped text node that directly follows an <svg> icon.

    Parameters
    ----------
    article : bs4.element.Tag
        The model card to search in.
    svg_class : str
        Value passed to ``find(..., class_=...)`` to locate the icon.

    Returns
    -------
    str
        The stripped text following the icon, or ``'N/A'`` when the icon is
        missing or is not followed by a plain text node.
    """
    svg = article.find('svg', class_=svg_class)
    sibling = svg.next_sibling if svg else None
    # Only a text node carries the value; a missing sibling or a sibling that
    # is itself a tag has no usable .strip() and is treated as unavailable.
    return sibling.strip() if isinstance(sibling, str) else 'N/A'


# NOTE(review): the `p` query parameter may be zero-based on huggingface.co, in
# which case this range skips the first results page -- confirm against the site.
for page in range(1, 11):
    url = f'https://huggingface.co/models?p={page}&sort=trending'
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # A network failure on one page should not discard the pages already collected.
        print(f'Failed to retrieve the webpage (page {page}): {exc}')
        continue

    if response.status_code != 200:
        print(f'Failed to retrieve the webpage (page {page}, status {response.status_code})')
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Each model card is an <article class="overview-card-wrapper">.
    for article in soup.find_all('article', class_='overview-card-wrapper'):
        # Model name: text of the first element carrying the 'text-smd' class.
        name_tag = article.select_one('.text-smd')
        model_names.append(name_tag.text if name_tag else 'N/A')

        # Description, downloads and likes are each the text node immediately
        # after a specific <svg> icon. The class strings are matched as written,
        # so they must stay in sync with the page markup.
        descriptions.append(_text_after(article, 'mr-1.5 text-[.8rem]'))
        downloads.append(_text_after(article, 'flex-none w-3 text-gray-400 mr-0.5'))
        likes.append(_text_after(article, 'flex-none w-3 text-gray-400 mr-1'))

# Create the DataFrame after collecting all data
df = pd.DataFrame({
    'Model Name': model_names,
    'Description': descriptions,
    'Downloads': downloads,
    'Likes': likes
})

df.head(100)


Unnamed: 0,Model Name,Description,Downloads,Likes
0,nomic-ai/nomic-embed-vision-v1.5,Image Feature Extraction,614,40
1,Qwen/Qwen2-0.5B-Instruct,Text Generation,11.9k,39
2,yodayo-ai/holodayo-xl-2.1,Text-to-Image,2.65k,38
3,Qwen/Qwen2-57B-A14B-Instruct,Text Generation,20.2k,38
4,microsoft/Phi-3-mini-4k-instruct,Text Generation,1.26M,670
...,...,...,...,...
95,ByteDance/Hyper-SD,Text-to-Image,150k,522
96,Orenguteng/Llama-3-8B-Lexi-Uncensored,Text Generation,8.92k,113
97,Orenguteng/Llama-3-8B-Lexi-Uncensored-GGUF,,54.9k,77
98,jasperai/flash-pixart,Text-to-Image,933,12


### Extracted data info

In [15]:
# Summarize the column dtypes and non-null counts of the scraped frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Model Name   300 non-null    object
 1   Description  300 non-null    object
 2   Downloads    300 non-null    object
 3   Likes        300 non-null    object
dtypes: object(4)
memory usage: 9.5+ KB


### Function to split the model name into multiple columns

In [16]:
def split_column(df, column_name, new_col1, new_col2):
    """Split a string column on its first '/' into two new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``; it is not modified.
    column_name : str
        Name of the string column to split.
    new_col1, new_col2 : str
        Names for the part before and the part after the first '/'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by ``new_col1`` and
        ``new_col2``. Where a value has no '/', ``new_col2`` is missing.
    """
    parts = df[column_name].str.split('/', n=1, expand=True)
    # If no value contains '/' (or the frame is empty) the expanded result has
    # fewer than two columns; pad it so the two-name assignment below is valid.
    parts = parts.reindex(columns=range(2))
    parts.columns = [new_col1, new_col2]
    return pd.concat([df, parts], axis=1)

In [17]:
# Derive 'Organization' and 'Model_Name' from the combined 'Model Name' column.
df = split_column(
    df,
    column_name='Model Name',
    new_col1='Organization',
    new_col2='Model_Name',
)

In [18]:
# Preview the first rows, including the two columns added by split_column.
df.head()

Unnamed: 0,Model Name,Description,Downloads,Likes,Organization,Model_Name
0,nomic-ai/nomic-embed-vision-v1.5,Image Feature Extraction,614,40,nomic-ai,nomic-embed-vision-v1.5
1,Qwen/Qwen2-0.5B-Instruct,Text Generation,11.9k,39,Qwen,Qwen2-0.5B-Instruct
2,yodayo-ai/holodayo-xl-2.1,Text-to-Image,2.65k,38,yodayo-ai,holodayo-xl-2.1
3,Qwen/Qwen2-57B-A14B-Instruct,Text Generation,20.2k,38,Qwen,Qwen2-57B-A14B-Instruct
4,microsoft/Phi-3-mini-4k-instruct,Text Generation,1.26M,670,microsoft,Phi-3-mini-4k-instruct


### Function to convert like and download counts into integers

In [19]:
def convert_count(count):
    """Convert an abbreviated count such as '11.9k' or '1.26M' to an int.

    Parameters
    ----------
    count : str or number
        A plain number ('614') or a number followed by one of the suffixes
        'k' (thousand), 'M' (million) or 'B' (billion). Surrounding whitespace
        and thousands separators (',') are ignored. Numeric input is accepted
        so the conversion can be re-applied to an already converted column.

    Returns
    -------
    int
        The count, truncated toward zero if the scaled value is fractional.

    Raises
    ------
    ValueError
        If the text is not a finite number (for example 'N/A').
    """
    multipliers = {'k': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}

    text = str(count).strip().replace(',', '')
    multiplier = multipliers.get(text[-1:], 1)
    if multiplier != 1:
        text = text[:-1]

    # Decimal arithmetic keeps the scaling exact; binary floats can land just
    # below the intended integer (1.001 * 1000 -> 1000.9999999999999), which
    # int() would then truncate to the wrong value.
    try:
        value = Decimal(text)
    except InvalidOperation:
        raise ValueError(f'cannot convert {count!r} to a count') from None
    if not value.is_finite():
        raise ValueError(f'cannot convert {count!r} to a count')
    return int(value * multiplier)

### Drop N/A values

In [20]:
# Turn the scraper's 'N/A' placeholder into a real missing value, then drop every
# row that has any missing field. The chain builds a new frame instead of
# mutating in place, and .copy() makes the result an independent frame so later
# column assignments do not trigger SettingWithCopyWarning.
df = df.replace('N/A', np.nan).dropna().copy()
df



Unnamed: 0,Model Name,Description,Downloads,Likes,Organization,Model_Name
0,nomic-ai/nomic-embed-vision-v1.5,Image Feature Extraction,614,40,nomic-ai,nomic-embed-vision-v1.5
1,Qwen/Qwen2-0.5B-Instruct,Text Generation,11.9k,39,Qwen,Qwen2-0.5B-Instruct
2,yodayo-ai/holodayo-xl-2.1,Text-to-Image,2.65k,38,yodayo-ai,holodayo-xl-2.1
3,Qwen/Qwen2-57B-A14B-Instruct,Text Generation,20.2k,38,Qwen,Qwen2-57B-A14B-Instruct
4,microsoft/Phi-3-mini-4k-instruct,Text Generation,1.26M,670,microsoft,Phi-3-mini-4k-instruct
...,...,...,...,...,...,...
294,sophosympatheia/Midnight-Miqu-70B-v1.5,Text Generation,9.25k,85,sophosympatheia,Midnight-Miqu-70B-v1.5
295,davidkim205/Rhea-72b-v0.5,Text Generation,4.68k,117,davidkim205,Rhea-72b-v0.5
296,google/gemma-1.1-7b-it,Text Generation,113k,249,google,gemma-1.1-7b-it
297,unum-cloud/uform-gen2-dpo,Image-to-Text,20.3k,35,unum-cloud,uform-gen2-dpo


In [21]:
# Replace the abbreviated download strings with integer counts.
df['Downloads'] = df['Downloads'].map(convert_count)

In [22]:
# Replace the abbreviated like strings with integer counts, then preview the result.
df['Likes'] = df['Likes'].map(convert_count)
df.head(100)

Unnamed: 0,Model Name,Description,Downloads,Likes,Organization,Model_Name
0,nomic-ai/nomic-embed-vision-v1.5,Image Feature Extraction,614,40,nomic-ai,nomic-embed-vision-v1.5
1,Qwen/Qwen2-0.5B-Instruct,Text Generation,11900,39,Qwen,Qwen2-0.5B-Instruct
2,yodayo-ai/holodayo-xl-2.1,Text-to-Image,2650,38,yodayo-ai,holodayo-xl-2.1
3,Qwen/Qwen2-57B-A14B-Instruct,Text Generation,20200,38,Qwen,Qwen2-57B-A14B-Instruct
4,microsoft/Phi-3-mini-4k-instruct,Text Generation,1260000,670,microsoft,Phi-3-mini-4k-instruct
...,...,...,...,...,...,...
110,Alibaba-NLP/gte-large-en-v1.5,Sentence Similarity,665000,92,Alibaba-NLP,gte-large-en-v1.5
112,gradientai/Llama-3-8B-Instruct-Gradient-1048k,Text Generation,32800,613,gradientai,Llama-3-8B-Instruct-Gradient-1048k
114,defog/llama-3-sqlcoder-8b,Text Generation,12300,91,defog,llama-3-sqlcoder-8b
115,NorwAI/NorwAI-Mistral-7B-instruct,Text Generation,352,11,NorwAI,NorwAI-Mistral-7B-instruct


### Save the DataFrame as a CSV file

In [23]:
# Write the frame to model_data.csv in the working directory; index=False leaves
# the row index out of the file.
df.to_csv('model_data.csv', index=False)