In [1]:
import pandas as pd
# NOTE: `logging` below is the transformers logging helper, not the stdlib `logging` module.
from transformers.utils import logging
# Restrict transformers log output to error-level messages for the rest of the notebook.
logging.set_verbosity_error()

In [2]:
from huggingface_hub import HfApi

### Find the most-downloaded model on the Hub, then load and save that model

In [3]:
# Hub client class
from huggingface_hub import HfApi

# Client object used to query the Hub
api = HfApi()

# Request text-classification models ordered by download count (descending),
# keeping only the first hit
models = api.list_models(task="text-classification", sort="downloads", direction=-1, limit=1)

# Materialise the query result as a list
model_list = list(models)

# Display the metadata of the top model
print(model_list)


[ModelInfo(id='1231czx/llama3_it_ultra_list_and_bold500', author=None, sha=None, created_at=datetime.datetime(2024, 9, 3, 12, 55, 17, tzinfo=datetime.timezone.utc), last_modified=None, private=False, disabled=None, downloads=25854145, downloads_all_time=None, gated=None, gguf=None, inference=None, likes=2, library_name='transformers', tags=['transformers', 'safetensors', 'llama', 'text-classification', 'arxiv:1910.09700', 'autotrain_compatible', 'text-generation-inference', 'endpoints_compatible', 'region:us'], pipeline_tag='text-classification', mask_token=None, card_data=None, widget_data=None, model_index=None, config=None, transformers_info=None, trending_score=None, siblings=None, spaces=None, safetensors=None)]


In [None]:
# Download a text-classification checkpoint from the Hub and store a local copy.
from transformers import AutoModel, AutoModelForSequenceClassification

# Hub repository ID of the model to download
modelId = "1231czx/llama3_it_ultra_list_and_bold500"

# Load with the sequence-classification auto class. The checkpoint is tagged
# `text-classification`; plain `AutoModel` would build only the base network, so
# the copy written below would be missing the classification head.
model = AutoModelForSequenceClassification.from_pretrained(modelId)

# Local target folder; "/" in the repository ID is replaced by "_" so the ID
# maps to a single directory name under models/
save_directory = f"models/{modelId.replace('/', '_')}"

# Write the model (including the classification head) to the target folder
model.save_pretrained(save_directory)

### Datasets: 

#### Hugging Face built the `datasets` package for interacting with datasets. It offers many convenient functions, including `load_dataset_builder`, which we just used. After inspecting a dataset to ensure it's the right one for your project, it's time to load the dataset! For this, we can leverage the input parameters of `load_dataset` to specify which parts of a dataset to load, e.g. the "train" split for English Wikipedia articles.

The `load_dataset` function from the `datasets` package is already loaded for you. Note: the `load_dataset` function was modified for the purpose of this exercise.


In [7]:
# Dataset helpers from the datasets package
from datasets import load_dataset_builder, load_dataset

# Hub ID of the dataset, shared by the builder and the loader below
dataset_id = "derenrich/wikidata-en-descriptions-small"

# Builder object used to inspect the dataset's metadata
reviews_builder = load_dataset_builder(dataset_id)

# The feature mapping lists the dataset's columns
print(reviews_builder.info.features)

# Load only the "train" split
wikipedia = load_dataset(dataset_id, split="train")

# Report how many rows the split contains
print(f"The length of the dataset is {len(wikipedia)}")

{'output': Value(dtype='string', id=None), 'qid': Value(dtype='string', id=None), 'name': Value(dtype='string', id=None), 'input': Value(dtype='string', id=None), 'instruction': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None)}


The length of the dataset is 1560888


In [8]:
# Keep only the rows whose "text" field contains the substring "football"
filtered = wikipedia.filter(lambda record: "football" in record["text"])

# One-row sample dataset built from the first matching row
example = filtered.select(range(1))

# Show the text of that sample row
print(example[0]["text"])

Instruction: Produce a short Wikidata description for this Wikipedia article ### Input: The 1964–65 DDR-Oberliga was the 16th season of the DDR-Oberliga, the first tier of league football in East Germany. The league was contested by fourteen teams. National People's Army club ASK Vorwärts Berlin won the championship, the club's fourth of six national East German championships all up. Bernd Bauchspieß of BSG Chemie Leipzig was the league's top scorer with 14 goals, becoming the first player to finish as top scorer on three occasions. For the third time the title East German Footballer of the year was awarded, going to Horst Weigang of SC Leipzig. On the strength of the 1964–65 title Vorwärts qualified for the 1965–66 European Cup where the club was knocked out by Manchester United in the first round. Seventh-placed club SC Aufbau Magdeburg qualified for the 1965–66 European Cup Winners' Cup as the seasons FDGB-Pokal winner and was knocked out by West Ham United in the quarter-finals. Fo

In [9]:
# Print specific columns for all rows
for row in filtered:
    print(f"QID: {row['qid']}, Name: {row['name']}, Text: {row['text']}")
    break


QID: Q563628, Name: 1964–65 DDR-Oberliga, Text: Instruction: Produce a short Wikidata description for this Wikipedia article ### Input: The 1964–65 DDR-Oberliga was the 16th season of the DDR-Oberliga, the first tier of league football in East Germany. The league was contested by fourteen teams. National People's Army club ASK Vorwärts Berlin won the championship, the club's fourth of six national East German championships all up. Bernd Bauchspieß of BSG Chemie Leipzig was the league's top scorer with 14 goals, becoming the first player to finish as top scorer on three occasions. For the third time the title East German Footballer of the year was awarded, going to Horst Weigang of SC Leipzig. On the strength of the 1964–65 title Vorwärts qualified for the 1965–66 European Cup where the club was knocked out by Manchester United in the first round. Seventh-placed club SC Aufbau Magdeburg qualified for the 1965–66 European Cup Winners' Cup as the seasons FDGB-Pokal winner and was knocked 

### Pipeline

In [11]:
# Pipeline factory from transformers
from transformers import pipeline

# Sentence to classify with both pipelines
input_text = "I love using the Hugging Face library!"

# Pipeline chosen by task name only (the library's default model is used)
task_pipeline = pipeline(task="sentiment-analysis")

# Pipeline chosen by naming an explicit pre-trained checkpoint
model_pipeline = pipeline(model="distilbert-base-uncased-finetuned-sst-2-english")

# Run the same text through both pipelines
task_output = task_pipeline(input_text)
model_output = model_pipeline(input_text)

# Show the two results side by side
print(f"Default Sentiment Analysis Output: {task_output}")
print(f"Model-Specific Sentiment Analysis Output: {model_output}")


Default Sentiment Analysis Output: [{'label': 'POSITIVE', 'score': 0.9992846846580505}]
Model-Specific Sentiment Analysis Output: [{'label': 'POSITIVE', 'score': 0.9992846846580505}]


In [12]:
# Create a named entity recognition pipeline that merges the tokens of each
# entity into a single grouped result. `aggregation_strategy="simple"` is the
# supported replacement for the deprecated `grouped_entities=True` flag and
# produces the same grouping.
ner_pipeline = pipeline("ner", aggregation_strategy="simple")

# Analyze named entities in a sentence
result = ner_pipeline("Hugging Face Inc. is a company based in New York.")
print(result)

config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]



[{'entity_group': 'ORG', 'score': np.float32(0.994665), 'word': 'Hugging Face Inc', 'start': 0, 'end': 16}, {'entity_group': 'LOC', 'score': np.float32(0.9983915), 'word': 'New York', 'start': 40, 'end': 48}]


In [4]:
# Auto classes and the pipeline factory
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Checkpoint shared by the model and its tokenizer
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# Fetch the classification model and the matching tokenizer
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Assemble the sentiment pipeline from the explicit model and tokenizer objects
sentiment_analysis = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)

# Sentence to classify
input_text = "I love using the Hugging Face library!"

# Run the prediction
output = sentiment_analysis(input_text)

# Report only the label of the first prediction
print(f"Sentiment using AutoClasses: {output[0]['label']}")


Sentiment using AutoClasses: POSITIVE




In [2]:
# Tokenizer auto class
from transformers import AutoTokenizer

# Fetch the tokenizer for the uncased DistilBERT checkpoint
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Grab the tokenizer's normalizer and apply it to the raw string
normalizer = tokenizer.backend_tokenizer.normalizer
output = normalizer.normalize_str("How are you???")

# Show the normalized text
print(output)

how are you???


In [4]:
# Concrete tokenizer classes for the two models being compared
from transformers import GPT2Tokenizer, DistilBertTokenizer

# Sentence to tokenize with both tokenizers
input_text = "Pineapple on pizza is pretty good, I guess."

# GPT-2: fetch the tokenizer and split the sentence into tokens
gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_tokens = gpt_tokenizer.tokenize(text=input_text)

# DistilBERT: same two steps with its own tokenizer
distil_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
distil_tokens = distil_tokenizer.tokenize(text=input_text)

# Show both token lists for comparison
print(f"GPT tokenizer: {gpt_tokens}")
print(f"DistilBERT tokenizer: {distil_tokens}")


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



GPT tokenizer: ['P', 'ine', 'apple', 'Ġon', 'Ġpizza', 'Ġis', 'Ġpretty', 'Ġgood', ',', 'ĠI', 'Ġguess', '.']
DistilBERT tokenizer: ['pine', '##apple', 'on', 'pizza', 'is', 'pretty', 'good', ',', 'i', 'guess', '.']


In [None]:
# Pipeline factory from transformers
from transformers import pipeline

# Text-classification pipeline backed by the grammar-checker checkpoint
classifier = pipeline(task="text-classification", model="abdulmatinomotoso/English_Grammar_Checker")

# Classify a short sentence
output = classifier("I will walk dog")

# Show the prediction
print(output)


In [2]:
# Pipeline factory from transformers
from transformers import pipeline

# Zero-shot classification pipeline backed by the BART MNLI checkpoint
classifier = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")

# Sentence to classify and the labels it may be assigned
text = "The government is planning to introduce new regulations to control the spread of misinformation."
candidate_labels = ["politics", "science", "sports"]

# Score the sentence against the candidate labels
output = classifier(text, candidate_labels)

# Report the first label and score from the result
print(f"Top Label: {output['labels'][0]} with score: {output['scores'][0]:.4f}")


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Top Label: politics with score: 0.7835


In [1]:
# Pipeline factory from transformers
from transformers import pipeline

# Summarization pipeline backed by the t5-small-booksum checkpoint
summarizer = pipeline(task="summarization", model="cnicu/t5-small-booksum")

# Passage to summarize
original_text = """
The solar system consists of the Sun and the objects that orbit it, including eight planets, their moons, and other non-stellar objects. The Sun is the central object, providing light and heat necessary for life on Earth. The planets, in order of their distance from the Sun, are Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Each planet has its own unique characteristics and moons, with Jupiter and Saturn being the largest and having the most moons.
"""

# Run the summarizer on the passage
summary_text = summarizer(original_text)

# Character counts of the input passage and of the generated summary
print(f"Original text length: {len(original_text)}")
print(f"Summary length: {len(summary_text[0]['summary_text'])}")

# The generated summary itself
print(f"Summary: {summary_text[0]['summary_text']}")


config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Your max_length is set to 200, but your input_length is only 113. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


Original text length: 475
Summary length: 304
Summary: the solar system consists of the Sun and the objects that orbit it, including eight planets, their moons, and other non-stellar objects. The Sun is the central object, providing light and heat necessary for life on Earth. The planets are Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune.


In [2]:
# Pipeline factory from transformers
from transformers import pipeline

# Passage to summarize
original_text = """
The solar system consists of the Sun and the objects that orbit it, including eight planets, their moons, and other non-stellar objects. 
The Sun is the central object, providing light and heat necessary for life on Earth. 
The planets, in order of their distance from the Sun, are Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. 
Each planet has its own unique characteristics and moons, with Jupiter and Saturn being the largest and having the most moons.
"""

# Checkpoint used by both summarizers below
summarization_model = "cnicu/t5-small-booksum"

# Short variant: output limited to between 1 and 10 tokens
short_summarizer = pipeline(task="summarization", model=summarization_model, min_length=1, max_length=10)
short_summary_text = short_summarizer(original_text)

print("Short Summary:")
print(short_summary_text[0]["summary_text"])

# Long variant: output between 50 and 150 tokens
long_summarizer = pipeline(task="summarization", model=summarization_model, min_length=50, max_length=150)
long_summary_text = long_summarizer(original_text)

print("\nLong Summary:")
print(long_summary_text[0]["summary_text"])


Short Summary:
the solar system consists of the Sun and


Your max_length is set to 150, but your input_length is only 113. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)



Long Summary:
the solar system consists of the Sun and the objects that orbit it, including eight planets, their moons, and other non-stellar objects. The Sun is the central object, providing light and heat necessary for life on Earth. The planets are Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune.
