In [None]:
!pip install transformers



# obtain a list of the top-20 "Text Classification" and top-20 "Text Generation" models on https://huggingface.co/, ranked based on popularity

In [None]:
import csv
from huggingface_hub import HfApi

# Function to get all models with a specific tag
def get_models_by_tag(tag):
    """Return every Hub model whose tag list contains ``tag`` (case-insensitive).

    Args:
        tag: Tag to look for, e.g. ``"text-generation"``.

    Returns:
        list: The entries yielded by ``HfApi.list_models()`` whose ``tags``
        include ``tag``.
    """
    hf_api = HfApi(endpoint="https://huggingface.co")
    wanted_tag = tag.lower()  # lowercase once instead of once per model

    # NOTE(review): this walks the whole listing and filters client-side;
    # list_models() appears to offer a server-side `filter=` argument that
    # would be much cheaper -- confirm it selects the same models first.
    return [
        model
        for model in hf_api.list_models()
        # `or []` guards against a None `tags`, which would otherwise raise
        # TypeError when iterated.
        if any(wanted_tag == model_tag.lower() for model_tag in (model.tags or []))
    ]

# Get all "text generation" models
text_generation_models = get_models_by_tag("text-generation")

# Sort models by downloads in descending order.
# A None download count is ranked as 0 so the sort cannot fail on a
# None-vs-int comparison.
text_generation_models_sorted = sorted(
    text_generation_models, key=lambda x: x.downloads or 0, reverse=True
)

# Prepare data for CSV: header row followed by the 20 most-downloaded models
csv_data = [["Rank", "Model ID", "Downloads"]]
for i, model in enumerate(text_generation_models_sorted[:20]):
    csv_data.append([i + 1, model.modelId, model.downloads])

# Save to CSV file (newline='' lets the csv module control line endings)
csv_file_path = "top_text_generation_models.csv"
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(csv_data)

In [None]:
# Build the list of the top-20 text-generation model IDs in this cell so the
# print below does not depend on hidden kernel state.
text_generation_model_names = [model.modelId for model in text_generation_models_sorted[:20]]
print("Model names array:", text_generation_model_names)

Model names array: ['distilgpt2', 'gpt2', 'davidkim205/komt-mistral-7b-v1', 'tiiuae/falcon-40b-instruct', 'bigscience/bloom-560m', 'meta-llama/Llama-2-7b-chat-hf', 'petals-team/StableBeluga2', 'facebook/opt-1.3b', 'HuggingFaceM4/tiny-random-LlamaForCausalLM', 'TheBloke/CodeLlama-34B-Instruct-GPTQ', 'facebook/opt-125m', 'microsoft/git-base', 'mistralai/Mistral-7B-Instruct-v0.1', 'mistralai/Mistral-7B-v0.1', 'meta-llama/Llama-2-7b-hf', 'TheBloke/Llama-2-7B-Chat-GPTQ', 'gpt2-medium', 'gpt2-large', 'NousResearch/Nous-Hermes-Llama2-13b', 'ehartford/Samantha-1.11-70b']


 # obtain and compare the number of ML apps ("spaces") for each "Text Classification" and "Text Generation" model obtained in step 1

In [None]:
!pip install requests




In [None]:
!pip install requests beautifulsoup4




## Obtain and compare the number of ML apps ("Spaces") for each "Text Classification" model

In [None]:
import requests
from bs4 import BeautifulSoup
from huggingface_hub import HfApi

# Function to get the number of articles (Space cards) for a model
def get_num_articles(classification_model_name):
    """Count the ``<article>`` cards on the Spaces search page for a model.

    Args:
        classification_model_name: Model ID used as the search term.

    Returns:
        int: Number of ``<article>`` elements inside the results grid, or 0
        when the request fails, the response is not HTML, or the grid is
        not found.
    """
    try:
        # `params=` URL-encodes the search term; the timeout keeps one stalled
        # request from hanging the whole cell.
        response = requests.get(
            "https://huggingface.co/spaces",
            params={"sort": "likes", "search": classification_model_name},
            timeout=30,
        )
    except requests.RequestException as e:
        print(f"Error: {e}")
        return 0

    # .get() avoids a KeyError when the Content-Type header is absent.
    content_type = response.headers.get('content-type', '')
    if response.status_code != 200 or 'text/html' not in content_type:
        print(f"Error: {response.status_code} - {response.text}")
        return 0

    try:
        soup = BeautifulSoup(response.text, 'html.parser')

        # The results grid is located by its exact CSS class string, so this
        # lookup returns nothing if the page markup changes.
        class_div = soup.find('div', class_='grid grid-cols-1 gap-x-4 gap-y-6 md:grid-cols-3 xl:grid-cols-4')
        if not class_div:
            return 0

        # NOTE(review): only cards present in this single response are
        # counted; presumably that is just the first page of results -- confirm.
        return len(class_div.find_all('article', class_=''))
    except Exception as e:
        print(f"Error: {e}")
        return 0

# Function to get all models with a specific tag
def get_models_by_tag(tag):
    """Return every Hub model whose tag list contains ``tag`` (case-insensitive).

    Args:
        tag: Tag to look for, e.g. ``"text-classification"``.

    Returns:
        list: The entries yielded by ``HfApi.list_models()`` whose ``tags``
        include ``tag``.
    """
    hf_api = HfApi(endpoint="https://huggingface.co")
    wanted_tag = tag.lower()  # lowercase once instead of once per model

    # NOTE(review): the whole listing is fetched and filtered client-side;
    # a server-side `filter=` argument to list_models() would presumably be
    # much cheaper -- confirm it selects the same models first.
    return [
        model
        for model in hf_api.list_models()
        # `or []` guards against a None `tags`, which would otherwise raise
        # TypeError when iterated.
        if any(wanted_tag == model_tag.lower() for model_tag in (model.tags or []))
    ]

# Get all "text classification" models
text_classification_models = get_models_by_tag("text-classification")

# Sort models by downloads in descending order.
# A None download count is ranked as 0 so the sort cannot fail on a
# None-vs-int comparison.
text_classification_models_sorted = sorted(
    text_classification_models, key=lambda x: x.downloads or 0, reverse=True
)

# Save model names in an array
classification_model_names = []

# Display information about the top-20 "text classification" models and get
# the number of apps found for each one
for i, model in enumerate(text_classification_models_sorted[:20]):
    classification_model_names.append(model.modelId)

    # Fetch the number of articles for the current model
    num_articles = get_num_articles(model.modelId)

    print(f"{i+1}. {model.modelId} - Downloads: {model.downloads}")
    print(f"   Tags: {', '.join(model.tags)}")
    # NOTE(review): the recorded output shows "Last Modified: None" for every
    # model; the listing call presumably needs to request full metadata for
    # this field to be populated -- confirm.
    print(f"   Last Modified: {model.lastModified}")
    print(f"   Number of apps: {num_articles}\n")

# classification_model_names now holds the IDs of the top 20 text classification models
print("Model names array:", classification_model_names)


1. distilbert-base-uncased-finetuned-sst-2-english - Downloads: 8612633
   Tags: transformers, pytorch, tf, rust, onnx, safetensors, distilbert, text-classification, en, dataset:sst2, dataset:glue, arxiv:1910.01108, doi:10.57967/hf/0181, license:apache-2.0, model-index, autotrain_compatible, endpoints_compatible, has_space, region:us
   Last Modified: None
   Number of apps: 16

2. cardiffnlp/twitter-roberta-base-irony - Downloads: 7199260
   Tags: transformers, pytorch, tf, jax, roberta, text-classification, en, dataset:tweet_eval, arxiv:2010.12421, autotrain_compatible, endpoints_compatible, has_space, region:us
   Last Modified: None
   Number of apps: 2

3. lxyuan/distilbert-base-multilingual-cased-sentiments-student - Downloads: 7116579
   Tags: transformers, pytorch, safetensors, distilbert, text-classification, sentiment-analysis, zero-shot-distillation, distillation, zero-shot-classification, debarta-v3, en, ar, de, es, fr, ja, zh, id, hi, it, ms, pt, dataset:tyqiangz/multiling

## Obtain and compare the number of ML apps ("Spaces") for each "Text Generation" model

In [None]:
import requests
from bs4 import BeautifulSoup
from huggingface_hub import HfApi

# Function to get the number of articles (Space cards) for a model
def get_num_articles(generative_model_name):
    """Count the ``<article>`` cards on the Spaces search page for a model.

    Args:
        generative_model_name: Model ID used as the search term.

    Returns:
        int: Number of ``<article>`` elements inside the results grid, or 0
        when the request fails, the response is not HTML, or the grid is
        not found.
    """
    try:
        # `params=` URL-encodes the search term; the timeout keeps one stalled
        # request from hanging the whole cell.
        response = requests.get(
            "https://huggingface.co/spaces",
            params={"sort": "likes", "search": generative_model_name},
            timeout=30,
        )
    except requests.RequestException as e:
        print(f"Error: {e}")
        return 0

    # .get() avoids a KeyError when the Content-Type header is absent.
    content_type = response.headers.get('content-type', '')
    if response.status_code != 200 or 'text/html' not in content_type:
        print(f"Error: {response.status_code} - {response.text}")
        return 0

    try:
        soup = BeautifulSoup(response.text, 'html.parser')

        # The results grid is located by its exact CSS class string, so this
        # lookup returns nothing if the page markup changes.
        class_div = soup.find('div', class_='grid grid-cols-1 gap-x-4 gap-y-6 md:grid-cols-3 xl:grid-cols-4')
        if not class_div:
            return 0

        # NOTE(review): only cards present in this single response are
        # counted. Recorded runs report exactly 24 for more than one model,
        # which suggests a page-size cap -- confirm before comparing counts.
        return len(class_div.find_all('article', class_=''))
    except Exception as e:
        print(f"Error: {e}")
        return 0

# Function to get all models with a specific tag
def get_models_by_tag(tag):
    """Return every Hub model whose tag list contains ``tag`` (case-insensitive).

    Args:
        tag: Tag to look for, e.g. ``"text-generation"``.

    Returns:
        list: The entries yielded by ``HfApi.list_models()`` whose ``tags``
        include ``tag``.
    """
    hf_api = HfApi(endpoint="https://huggingface.co")
    wanted_tag = tag.lower()  # lowercase once instead of once per model

    matching_models = []
    for model in hf_api.list_models():
        # `or []` guards against a None `tags`, which would otherwise raise
        # TypeError when iterated.
        model_tags = {model_tag.lower() for model_tag in (model.tags or [])}
        if wanted_tag in model_tags:
            matching_models.append(model)
    return matching_models

# Get all "text generation" models
text_generation_models = get_models_by_tag("text-generation")

# Sort models by downloads in descending order.
# A None download count is ranked as 0 so the sort cannot fail on a
# None-vs-int comparison.
text_generation_models_sorted = sorted(
    text_generation_models, key=lambda x: x.downloads or 0, reverse=True
)

# Save model names in an array
generative_model_names = []

# Display information about the top-20 "text generation" models and get the
# number of apps found for each one
for i, model in enumerate(text_generation_models_sorted[:20]):
    generative_model_names.append(model.modelId)

    # Fetch the number of articles for the current model
    num_articles = get_num_articles(model.modelId)

    print(f"{i+1}. {model.modelId} - Downloads: {model.downloads}")
    print(f"   Tags: {', '.join(model.tags)}")
    # NOTE(review): the recorded output shows "Last Modified: None" for every
    # model; the listing call presumably needs to request full metadata for
    # this field to be populated -- confirm.
    print(f"   Last Modified: {model.lastModified}")
    print(f"   Number of apps: {num_articles}\n")

# generative_model_names now holds the IDs of the top 20 text generation models
print("Model names array:", generative_model_names)


1. distilgpt2 - Downloads: 33709069
   Tags: transformers, pytorch, tf, jax, tflite, rust, coreml, safetensors, gpt2, text-generation, exbert, en, dataset:openwebtext, arxiv:1910.01108, arxiv:2201.08542, arxiv:2203.12574, arxiv:1910.09700, arxiv:1503.02531, license:apache-2.0, model-index, co2_eq_emissions, autotrain_compatible, endpoints_compatible, has_space, text-generation-inference, region:us
   Last Modified: None
   Number of apps: 24

2. gpt2 - Downloads: 16197257
   Tags: transformers, pytorch, tf, jax, tflite, rust, onnx, safetensors, gpt2, text-generation, exbert, en, doi:10.57967/hf/0039, license:mit, autotrain_compatible, endpoints_compatible, has_space, text-generation-inference, region:us
   Last Modified: None
   Number of apps: 24

3. davidkim205/komt-mistral-7b-v1 - Downloads: 2913985
   Tags: transformers, pytorch, mistral, text-generation, finetuned, en, ko, arxiv:2308.06502, arxiv:2308.06259, autotrain_compatible, endpoints_compatible, has_space, text-generation-in

In [None]:
# Display the collected text-generation model IDs
generative_model_names

['distilgpt2',
 'gpt2',
 'davidkim205/komt-mistral-7b-v1',
 'tiiuae/falcon-40b-instruct',
 'bigscience/bloom-560m',
 'meta-llama/Llama-2-7b-chat-hf',
 'petals-team/StableBeluga2',
 'facebook/opt-1.3b',
 'HuggingFaceM4/tiny-random-LlamaForCausalLM',
 'TheBloke/CodeLlama-34B-Instruct-GPTQ',
 'facebook/opt-125m',
 'microsoft/git-base',
 'mistralai/Mistral-7B-Instruct-v0.1',
 'mistralai/Mistral-7B-v0.1',
 'meta-llama/Llama-2-7b-hf',
 'TheBloke/Llama-2-7B-Chat-GPTQ',
 'gpt2-medium',
 'gpt2-large',
 'NousResearch/Nous-Hermes-Llama2-13b',
 'ehartford/Samantha-1.11-70b']

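## CSV for classification model

**TODO:** this section has no code cell. `classification_model_apps.csv` is read by the "classification model apps size" cell further down, but no visible cell in this notebook writes it.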

## csv for generative model

In [None]:
import csv
import requests
from bs4 import BeautifulSoup
from huggingface_hub import HfApi

# Relies on `generative_model_names`, which is iterated further down in this cell.

# Create a list to store one {'generative_Model', 'Combined Name'} row per (model, app) pair
generative_model_apps_list = []

# Function to get the combined names for all apps associated with a model
def get_combined_names_for_model(generative_model_name):
    """Scrape the Spaces search page and return ``"creator/app"`` strings.

    Args:
        generative_model_name: Model ID used as the search term.

    Returns:
        list[str] | None: One ``f"{creator_name}/{app_name}"`` string per
        matched card (possibly empty), or None when the request fails, the
        response is not HTML, or parsing raises.
    """
    try:
        # `params=` URL-encodes the search term; the timeout keeps one stalled
        # request from hanging the whole cell.
        response = requests.get(
            "https://huggingface.co/spaces",
            params={"sort": "likes", "search": generative_model_name},
            timeout=30,
        )
    except requests.RequestException as e:
        print(f"Error: {e}")
        return None

    # .get() avoids a KeyError when the Content-Type header is absent.
    content_type = response.headers.get('content-type', '')
    if response.status_code != 200 or 'text/html' not in content_type:
        print(f"Error: {response.status_code} - {response.text}")
        return None

    try:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Card titles and creator links are located by their exact CSS class strings.
        h4_elements = soup.find_all('h4', class_='z-40 max-w-full truncate text-center font-bold leading-tight text-blue-50 text-base')
        a_elements = soup.find_all('a', class_='truncate font-mono text-sm text-black')

        # The two lists are paired purely by position; zip() stops at the
        # shorter one, so a card missing either element shifts later pairs.
        # NOTE(review): the <h4> text is presumably the card's display title
        # rather than its repository name -- confirm before building URLs from it.
        return [
            f"{a_element.text.strip()}/{h4_element.text.strip()}"
            for h4_element, a_element in zip(h4_elements, a_elements)
        ]
    except Exception as e:
        print(f"Error: {e}")
        return None

# Collect one CSV row per (model, app) pair
for generative_model_name in generative_model_names:
    generative_combined_names = get_combined_names_for_model(generative_model_name)
    if not generative_combined_names:
        # None (lookup failed) or empty list (nothing matched): no rows to add
        continue
    generative_model_apps_list.extend(
        {'generative_Model': generative_model_name, 'Combined Name': combined_name_value}
        for combined_name_value in generative_combined_names
    )

# Write the collected rows, header first, to the CSV file
fieldnames = ['generative_Model', 'Combined Name']
csv_file_path = 'generative_model_apps.csv'

with open(csv_file_path, mode='w', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(generative_model_apps_list)

print(f"CSV file '{csv_file_path}' has been created with model names and combined names.")


CSV file 'generative_model_apps.csv' has been created with model names and combined names.


In [None]:
import csv
import requests
from bs4 import BeautifulSoup
from huggingface_hub import HfApi

# Function to get the number of articles (Space cards) for a model
def get_num_articles(generative_model_name):
    """Count the ``<article>`` cards on the Spaces search page for a model.

    Args:
        generative_model_name: Model ID used as the search term.

    Returns:
        int: Number of ``<article>`` elements inside the results grid, or 0
        when the request fails, the response is not HTML, or the grid is
        not found.
    """
    try:
        # `params=` URL-encodes the search term; the timeout keeps one stalled
        # request from hanging the whole cell.
        response = requests.get(
            "https://huggingface.co/spaces",
            params={"sort": "likes", "search": generative_model_name},
            timeout=30,
        )
    except requests.RequestException as e:
        print(f"Error: {e}")
        return 0

    # .get() avoids a KeyError when the Content-Type header is absent.
    content_type = response.headers.get('content-type', '')
    if response.status_code != 200 or 'text/html' not in content_type:
        print(f"Error: {response.status_code} - {response.text}")
        return 0

    try:
        soup = BeautifulSoup(response.text, 'html.parser')

        # The results grid is located by its exact CSS class string, so this
        # lookup returns nothing if the page markup changes.
        class_div = soup.find('div', class_='grid grid-cols-1 gap-x-4 gap-y-6 md:grid-cols-3 xl:grid-cols-4')
        if not class_div:
            return 0

        # NOTE(review): only cards present in this single response are
        # counted; presumably that is just the first page of results -- confirm.
        return len(class_div.find_all('article', class_=''))
    except Exception as e:
        print(f"Error: {e}")
        return 0

# Function to get all models with a specific tag
def get_models_by_tag(tag):
    """Return every Hub model whose tag list contains ``tag`` (case-insensitive).

    Args:
        tag: Tag to look for, e.g. ``"text-generation"``.

    Returns:
        list: The entries yielded by ``HfApi.list_models()`` whose ``tags``
        include ``tag``.
    """
    hf_api = HfApi(endpoint="https://huggingface.co")
    wanted_tag = tag.lower()  # lowercase once instead of once per model

    matching_models = []
    for model in hf_api.list_models():
        # `or []` guards against a None `tags`, which would otherwise raise
        # TypeError when iterated.
        model_tags = {model_tag.lower() for model_tag in (model.tags or [])}
        if wanted_tag in model_tags:
            matching_models.append(model)
    return matching_models

# Create a list to store model and app details (one dict per CSV row written at the end of this cell)
model_app_details_list = []

# Function to get the creator/app details for all apps associated with a model
def get_combined_names_for_model(generative_model_name):
    """Scrape the Spaces search page and return one dict per matched card.

    Args:
        generative_model_name: Model ID used as the search term.

    Returns:
        list[dict] | None: Dicts with keys ``'App ID'`` (text of the creator
        link) and ``'App Name'`` (text of the card's ``<h4>``), possibly an
        empty list; None when the request fails, the response is not HTML,
        or parsing raises.
    """
    try:
        # `params=` URL-encodes the search term; the timeout keeps one stalled
        # request from hanging the whole cell.
        response = requests.get(
            "https://huggingface.co/spaces",
            params={"sort": "likes", "search": generative_model_name},
            timeout=30,
        )
    except requests.RequestException as e:
        print(f"Error: {e}")
        return None

    # .get() avoids a KeyError when the Content-Type header is absent.
    content_type = response.headers.get('content-type', '')
    if response.status_code != 200 or 'text/html' not in content_type:
        print(f"Error: {response.status_code} - {response.text}")
        return None

    try:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Card titles and creator links are located by their exact CSS class strings.
        h4_elements = soup.find_all('h4', class_='z-40 max-w-full truncate text-center font-bold leading-tight text-blue-50 text-base')
        a_elements = soup.find_all('a', class_='truncate font-mono text-sm text-black')

        # The two lists are paired purely by position; zip() stops at the
        # shorter one, so a card missing either element shifts later pairs.
        return [
            {'App ID': a_element.text.strip(), 'App Name': h4_element.text.strip()}
            for h4_element, a_element in zip(h4_elements, a_elements)
        ]
    except Exception as e:
        print(f"Error: {e}")
        return None

# Get all "text generation" models
text_generative_models = get_models_by_tag("text-generation")

# Sort models by downloads in descending order.
# A None download count is ranked as 0 so the sort cannot fail on a
# None-vs-int comparison.
text_generative_models_sorted = sorted(
    text_generative_models, key=lambda x: x.downloads or 0, reverse=True
)

# Iterate through the top-20 models and collect the details of every app found.
# The app count is not needed for this CSV, so get_num_articles() is not
# called here (it would cost one extra HTTP request per model).
for model in text_generative_models_sorted[:20]:
    model_id = model.modelId
    # 'Model Name' is filled with the same value as 'Model ID'.
    model_name = model.modelId
    combined_names = get_combined_names_for_model(model.modelId)

    # None (lookup failed) and [] (no apps matched) both contribute no rows.
    if combined_names:
        for app_details in combined_names:
            app_details.update({'Model ID': model_id, 'Model Name': model_name})
            model_app_details_list.append(app_details)

# Save the data to a CSV file
csv_file_path = 'generative_model_app_details.csv'
fieldnames = ['Model ID', 'Model Name', 'App ID', 'App Name']

with open(csv_file_path, mode='w', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(model_app_details_list)

print(f"CSV file '{csv_file_path}' has been created with model and app details.")


CSV file 'generative_model_app_details.csv' has been created with model and app details.


# obtain and compare the source code size of the ML apps ("spaces") obtained in step 2 (HINT: check the "Files" tab at the top-right of a given space's page)

## classification model apps size

In [None]:
import csv
import requests
from bs4 import BeautifulSoup

# Function to extract size from HTML content
def get_size_from_html(html_content):
    """Sum the human-readable file sizes found in a page's text nodes.

    Every text node that starts with ``<number> <unit>`` (unit being
    Bytes, kB, MB, GB or TB, case-insensitive) is converted to bytes and
    added up. Sizes shown in kB and above are rounded by the page, so the
    total is an approximation.

    Args:
        html_content: HTML source as a string.

    Returns:
        int: Approximate total in bytes; 0 when no size text is found.
    """
    import re  # local import keeps this cell self-contained

    # Multipliers assume decimal (SI) units -- TODO confirm against the page.
    unit_factors = {
        'byte': 1, 'bytes': 1,
        'kb': 10**3, 'mb': 10**6, 'gb': 10**9, 'tb': 10**12,
    }
    size_pattern = re.compile(
        r'^\s*(\d+(?:\.\d+)?)\s*(bytes?|kb|mb|gb|tb)\b', re.IGNORECASE
    )

    soup = BeautifulSoup(html_content, 'html.parser')
    byte_sum = 0.0
    # `string=True` is the supported spelling; `text=True` is deprecated and
    # emits a warning on current Beautiful Soup releases.
    for text_node in soup.find_all(string=True):
        size_match = size_pattern.match(text_node)
        if size_match:
            amount = float(size_match.group(1))
            byte_sum += amount * unit_factors[size_match.group(2).lower()]
    return int(round(byte_sum))

# Function to fetch HTML content of a given URL
def get_html_content(url):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address to request.

    Returns:
        str | None: The body when the status code is 200; None for any other
        status or when the request itself fails (timeout, connection error).
    """
    try:
        # The timeout stops one unresponsive page from stalling the whole loop.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Reported to callers the same way as a non-200 status.
        return None
    return response.text if response.status_code == 200 else None

# Input: (model, app) pairs; output: the same pairs plus a size column
csv_file_path = 'classification_model_apps.csv'  # Replace with the path to your CSV file
output_csv_file_path = 'classification_app_size.csv'  # New CSV file for app sizes

# The input is opened first, so a missing input file fails before the output
# file is created.
with open(csv_file_path, 'r') as file, open(output_csv_file_path, 'w', newline='') as output_file:
    fieldnames = ['classification_Model', 'Combined Name', 'App Size (bytes)']
    writer = csv.DictWriter(output_file, fieldnames=fieldnames)
    writer.writeheader()

    reader = csv.DictReader(file)
    for row in reader:
        model_name = row['classification_Model']
        combined_name = row['Combined Name']

        # Turn every space into a dash to build the URL path segment
        formatted_combined_name = '-'.join(combined_name.split(' '))
        url = 'https://huggingface.co/spaces/' + formatted_combined_name + '/tree/main'

        html_content = get_html_content(url)
        if html_content is None:
            # Pages that could not be fetched are reported and left out of the output CSV
            print(f'Error fetching content for Model: {model_name}, Combined Name: {combined_name}')
            continue

        app_size = get_size_from_html(html_content)
        size_row = {'classification_Model': model_name, 'Combined Name': combined_name, 'App Size (bytes)': app_size}
        writer.writerow(size_row)
        print(f'classification_Model: {model_name}, Combined Name: {combined_name}, App Size: {app_size} bytes')


  for tag in soup.find_all(text=True):


classification_Model: distilbert-base-uncased-finetuned-sst-2-english, Combined Name: DogManTC/Distilbert Base Uncased Finetuned Sst 2 English, App Size: 1291 bytes
classification_Model: distilbert-base-uncased-finetuned-sst-2-english, Combined Name: cmorato/Distilbert Base Uncased Finetuned Sst 2 English, App Size: 370 bytes
classification_Model: distilbert-base-uncased-finetuned-sst-2-english, Combined Name: rcuchoa/Distilbert Base Uncased Finetuned Sst 2 English, App Size: 393 bytes
classification_Model: distilbert-base-uncased-finetuned-sst-2-english, Combined Name: leolinardi/Distilbert Base Uncased Finetuned Sst 2 English, App Size: 373 bytes
classification_Model: distilbert-base-uncased-finetuned-sst-2-english, Combined Name: Offdutyninja/Distilbert Base Uncased Finetuned Sst 2 English, App Size: 373 bytes
classification_Model: distilbert-base-uncased-finetuned-sst-2-english, Combined Name: Jayod/Distilbert Base Uncased Finetuned Sst 2 English, App Size: 375 bytes
classification

## generative model apps size

In [None]:
import csv
import requests
from bs4 import BeautifulSoup

# Function to extract size from HTML content
def get_size_from_html(html_content):
    """Sum the human-readable file sizes found in a page's text nodes.

    Every text node that starts with ``<number> <unit>`` (unit being
    Bytes, kB, MB, GB or TB, case-insensitive) is converted to bytes and
    added up. Sizes shown in kB and above are rounded by the page, so the
    total is an approximation.

    Args:
        html_content: HTML source as a string.

    Returns:
        int: Approximate total in bytes; 0 when no size text is found.
    """
    import re  # local import keeps this cell self-contained

    # Multipliers assume decimal (SI) units -- TODO confirm against the page.
    unit_factors = {
        'byte': 1, 'bytes': 1,
        'kb': 10**3, 'mb': 10**6, 'gb': 10**9, 'tb': 10**12,
    }
    size_pattern = re.compile(
        r'^\s*(\d+(?:\.\d+)?)\s*(bytes?|kb|mb|gb|tb)\b', re.IGNORECASE
    )

    soup = BeautifulSoup(html_content, 'html.parser')
    byte_sum = 0.0
    # `string=True` is the supported spelling; `text=True` is deprecated and
    # emits a warning on current Beautiful Soup releases.
    for text_node in soup.find_all(string=True):
        size_match = size_pattern.match(text_node)
        if size_match:
            amount = float(size_match.group(1))
            byte_sum += amount * unit_factors[size_match.group(2).lower()]
    return int(round(byte_sum))

# Function to fetch HTML content of a given URL
def get_html_content(url):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address to request.

    Returns:
        str | None: The body when the status code is 200; None for any other
        status or when the request itself fails (timeout, connection error).
    """
    try:
        # The timeout stops one unresponsive page from stalling the whole loop.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Reported to callers the same way as a non-200 status.
        return None
    return response.text if response.status_code == 200 else None

# Input: (model, app) pairs; output: the same pairs plus a size column
csv_file_path = 'generative_model_apps.csv'  # Replace with the path to your CSV file
output_csv_file_path = 'generative_app_size.csv'  # New CSV file for app sizes

# The input is opened first, so a missing input file fails before the output
# file is created.
with open(csv_file_path, 'r') as file, open(output_csv_file_path, 'w', newline='') as output_file:
    fieldnames = ['generative_Model', 'Combined Name', 'App Size (bytes)']
    writer = csv.DictWriter(output_file, fieldnames=fieldnames)
    writer.writeheader()

    reader = csv.DictReader(file)
    for row in reader:
        model_name = row['generative_Model']
        combined_name = row['Combined Name']

        # Turn every space into a dash to build the URL path segment.
        # NOTE(review): the recorded run shows several of these URLs failing
        # to fetch; 'Combined Name' presumably holds a display title rather
        # than the repository name -- confirm.
        formatted_combined_name = '-'.join(combined_name.split(' '))
        url = 'https://huggingface.co/spaces/' + formatted_combined_name + '/tree/main'
        print(url)

        html_content = get_html_content(url)
        if html_content is None:
            # Pages that could not be fetched are reported and left out of the output CSV
            print(f'Error fetching content for Model: {model_name}, Combined Name: {combined_name}')
            continue

        app_size = get_size_from_html(html_content)
        size_row = {'generative_Model': model_name, 'Combined Name': combined_name, 'App Size (bytes)': app_size}
        writer.writerow(size_row)
        print(f'generative_Model: {model_name}, Combined Name: {combined_name}, App Size: {app_size} bytes')


https://huggingface.co/spaces/Norod78/Shibing624-Code-Autocomplete-Distilgpt2-Python/tree/main
Error fetching content for Model: distilgpt2, Combined Name: Norod78/Shibing624 Code Autocomplete Distilgpt2 Python
https://huggingface.co/spaces/ICML2022/DunnBC22-distilgpt2-2k-Clean-Medical-Articles-Causal-Language-Model/tree/main
Error fetching content for Model: distilgpt2, Combined Name: ICML2022/DunnBC22-distilgpt2-2k Clean Medical Articles Causal Language Model
https://huggingface.co/spaces/Rexuint/Aman-mehra-gpt2-medium-finetune-squad-ep-2.5-lr-3e-06-wd-0.0002-glb-Sd-1-data-Sd-0/tree/main
Error fetching content for Model: gpt2-medium, Combined Name: Rexuint/Aman-mehra-gpt2-medium-finetune-squad-ep-2.5-lr-3e-06-wd-0.0002-glb Sd-1-data Sd-0
https://huggingface.co/spaces/heegyu/Caseyhahn-Gpt2-Medium-Finetuned-Genius-Lyrics-Updated-Data-Large/tree/main
Error fetching content for Model: gpt2-medium, Combined Name: heegyu/Caseyhahn Gpt2 Medium Finetuned Genius Lyrics Updated Data Large
http