In [None]:
!pip install transformers

In [None]:
from transformers import pipeline

In [None]:
#-------------------------------------------------------#
#                NLP TASKS                              #
#-------------------------------------------------------#

'''
1. Text Classification: Assigning a category to a piece of text
Sentiment analysis
Spam detection
'''

classifier = pipeline("text-classification")

'''
2. Token classification: Assigning a label to each token in a sequence
Named entity recognition (NER)
Part-of-speech tagging
'''

token_classifier = pipeline("token-classification")

'''
3. Question answering: Answering a question based on a context
'''

question_answerer = pipeline("question-answering")


'''
4. Text generation: Generating new text based on a given prompt
Language modeling
Story generation
'''

generator = pipeline("text-generation")


'''
5. Summarization: Generating a concise summary of a longer text
'''

summarizer = pipeline("summarization")


'''
5. Translation: Converting text from one language to another
'''

translator = pipeline("translation",model="Helsinki-NLP/opus-mt-en-fr")


'''
6. Text2Text generation: Generating new text based on a given prompt, General purpose text summarization including
translation and summarization
'''

text2text_generator = pipeline("text2text-generation")


'''
7. Fill-mask: Filling in the blanks in a given text, predicting missing words in a sentence
'''

unmasker = pipeline("fill-mask")


'''
8. Feature extraction: Extracting hidden states or features from text.
'''

feature_extractor = pipeline("feature-extraction")


'''
9. Sentence similarity: Measuring the similarity between two sentences
'''

similarity = pipeline("sentence-similarity")



#-------------------------------------------------------#
#                COMPUTER VISION TASKS                  #
#-------------------------------------------------------#

'''
1. Image classification: Classifying the main content of an image
'''

image_classifier = pipeline("image-classification")


'''
2. Object detection: Identifying and localizing objects in an image
'''

object_detector = pipeline("object-detection")


'''
3. Image segmentation: Assigning a label to each pixel in an image
'''

image_segmenter = pipeline("image-segmentation")


'''
4. Image generation: Generating new images based on a given prompt (Using DALL-E or similar models)
'''

image_generator = pipeline("image-generation")


#-------------------------------------------------------#
#                SPEECH PROCESSING TASKS                #
#-------------------------------------------------------#

'''
1. Automatic speech recognition(ASR): Converting spoken language into text
'''
speech_recognizer = pipeline("automatic-speech-recognition")


'''
2. Text-to-speech(TTS): Converting text into spoken language
'''

text_to_speech = pipeline("text-to-speech")

'''
3. Speech translation: Converting speech from one language to another
'''

speech_translator = pipeline("speech-translation")


'''
4. Audio classification: Classifying the main content of an audio file
'''

audio_classifier = pipeline("audio-classification")


'''
5. Audio transcription: Converting spoken language into text
'''
audio_recognizer = pipeline("audio-transcription")


#-------------------------------------------------------#
#                MULTIMODAL TASKS                       #
#-------------------------------------------------------#

'''
1. Image captioning: Generating a descriptive caption for an image
'''

image_captioner = pipeline("image-to-text")


'''
2. Visual question answering (VQA) : Answering a question about an image
'''

visual_question_answerer = pipeline("visual-question-answering")


#-------------------------------------------------------#
#                OTHER TASKS                            #
#-------------------------------------------------------#

'''
1. Table question answering: Answering a question about a table
'''

table_question_answerer = pipeline("table-question-answering")


'''
2. Document question answering: Extracting answers from documents like PDF
'''

document_question_answerer = pipeline("document-question-answering")


'''
3. Time series forecasting: Predicting future values of a time series
'''

time_series_forecaster = pipeline("time-series-forecasting")


'''
4. Anomaly detection: Identifying unusual patterns in a dataset
'''

anomaly_detector = pipeline("anomaly-detection")
'''


# **NLP TASKS**

## **Sentiment analysis**

In [None]:
from transformers import pipeline

In [None]:
classifier = pipeline("sentiment-analysis")

result = classifier("I hate you")[0]
print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


label: NEGATIVE, with score: 0.9991


In [None]:
pipeline(task = 'sentiment-analysis')('I was very confused with the new batman movie')

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'label': 'NEGATIVE', 'score': 0.9988514184951782}]

In [None]:
pipeline(task = 'sentiment-analysis', model = 'facebook/bart-large-mnli')\
                            ('I was very confused with the new batman movie')

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


[{'label': 'neutral', 'score': 0.9624118804931641}]

## **Batch Sentiment Analysis**

In [None]:
classifier = pipeline('sentiment-analysis')

task_list = ['I really like autoencoders, best model for anomaly detection', \
             'I am not sure if we can actually evaluate LLMS.', \
             'PassiveAggressive is the name of linear regression model that so many people dont know.' , \
             'I hate long meetings.']

classifier(task_list)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9984299540519714},
 {'label': 'NEGATIVE', 'score': 0.9995476603507996},
 {'label': 'NEGATIVE', 'score': 0.997748076915741},
 {'label': 'NEGATIVE', 'score': 0.9969879984855652}]

In [None]:
classifier = pipeline('sentiment-analysis', model='SamLowe/roberta-base-go_emotions')

task_list = ['I really like autoencoders, best model for anomaly detection', \
             'I am not sure if we can actually evaluate LLMS.', \
             'PassiveAggressive is the name of linear regression model that so many people dont know.' , \
             'I hate long meetings.']

classifier(task_list)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Device set to use cpu
  return forward_call(*args, **kwargs)


[{'label': 'admiration', 'score': 0.8294069170951843},
 {'label': 'confusion', 'score': 0.8987987041473389},
 {'label': 'neutral', 'score': 0.7055688500404358},
 {'label': 'anger', 'score': 0.772042453289032}]

## **Text Generation**

In [None]:
from transformers import pipeline

text_generator = pipeline('text-generation' , model = 'distilbert/distilgpt2')
generated_text = text_generator('Today is a rainy day in London',
                                truncation = True,
                                num_return_sequences=2)
print('Generated text:\n', generated_text[0]['generated_text'])

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated text:
 Today is a rainy day in London.


The day after the storm came to an end, the BBC has been reporting that the rain was expected to continue until late Wednesday morning.
The storm, which will be expected to be the heaviest ever recorded in London, was predicted to have hit about 40,000 homes, according to the BBC.
The BBC is reporting that the storm was expected to reach its peak on Thursday night, giving it a 10% chance of hitting the UK on Thursday.
The day after the storm came to an end, the BBC says that the BBC will now report on the storm.
The BBC's meteorologist John Goss, who has been in charge of the BBC, says that the storm had also been "a result of a very strong breeze" and that "the cold wind was mostly quite strong."
"The storm has a very strong wind around the back of the city and the storm has a very strong wind over the city, which will be very strong for the whole of London," he told BBC News.


## **Question Answering**

In [None]:
from transformers import pipeline

qa_model = pipeline('question-answering')
question = 'What is my job?'
context = 'I am developing AI models with Python'
answer = qa_model(question=question , context = context)
answer

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


{'score': 0.8267208337783813,
 'start': 5,
 'end': 25,
 'answer': 'developing AI models'}

## **Tokenization**

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification

In [None]:
'''
model_name1 = 'distilbert-base-uncased-finetuned-sst-2-english'
my_tokenizer1 = DistilBertTokenizer.from_pretrained(model_name1)
my_model1 = DistilBertForSequenceClassification.from_pretrained(model_name1)

classifier = pipeline('sentiment-analysis')
res = classifier('I was not so happy with the Batman movie')
res

'''


model_name2 = 'nlptown/bert-base-multilingual-uncased-sentiment'
my_tokenizer2 = AutoTokenizer.from_pretrained(model_name2)
my_model2 = AutoModelForSequenceClassification.from_pretrained(model_name2)

classifier = pipeline('sentiment-analysis' ,model = my_model2 , tokenizer = my_tokenizer2)
res = classifier('I was not so happy with the Batman movie')
res

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

Device set to use cpu
  return forward_call(*args, **kwargs)


[{'label': '2 stars', 'score': 0.4681621491909027}]

In [None]:
from transformers import AutoTokenizer

# Load a pre trained tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# Example text
text = "I was not so happy with the batman movie"

# Tokenize the text
tokens = tokenizer.tokenize(text)

# Print the tokens
print("Tokens: ", tokens)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Tokens:  ['I', 'was', 'not', 'so', 'happy', 'with', 'the', 'bat', '##man', 'movie']


In [None]:
# Convert tokens into input ID's
input_ids = tokenizer.convert_tokens_to_ids(tokens)

# Print the input IDs
print("Input IDs: ", input_ids)

Input IDs:  [146, 1108, 1136, 1177, 2816, 1114, 1103, 7693, 1399, 2523]


In [None]:
# Encode the text (tokenization+converting into Input ID's)
encoded_input = tokenizer(text)

# Print the encoded input
print("Encoded Input: ", encoded_input)


Encoded Input:  {'input_ids': [101, 146, 1108, 1136, 1177, 2816, 1114, 1103, 7693, 1399, 2523, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
# Decode the text
decoded_output = tokenizer.decode(encoded_input['input_ids'])

# Print the decoded output
print("Decoded Output: ", decoded_output)

Decoded Output:  [CLS] I was not so happy with the batman movie [SEP]


# **Fine Tuning IMDB**

In [1]:
!pip install datasets # if you want to use hugging face datasets so you have to download this library



## **Loading and Preparing the dataset**

In [2]:
from datasets import load_dataset
dataset = load_dataset('imdb')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

## **Pre Processing the data**

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=64)
    # padding: Matlb pehly maximum length ka sentence dhond kar uske baad baki short sentences ko oske brabar karna hy 0 laga kar

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

## **Setting up the training arguements**

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir='./results',
         eval_strategy='epoch',
         learning_rate = 2e-5,
         per_device_train_batch_size=16,
         per_device_eval_batch_size=16,
         num_train_epochs=0.3,
         weight_decay=0.01)

training_args


## **Initialize the Model**

In [11]:
# Loading the pre trained model and defining the training process

from transformers import AutoModelForSequenceClassification, Trainer

# load the pre trained model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2) # beacause we are having 2 labels +ve,-ve

# Initialize the Trainer
trainer = Trainer (
    model = model,
    args = training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['test']
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## **Train the model**

In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,No log,0.417272


TrainOutput(global_step=469, training_loss=0.4720841918164479, metrics={'train_runtime': 188.4887, 'train_samples_per_second': 39.79, 'train_steps_per_second': 2.488, 'total_flos': 246798169927680.0, 'train_loss': 0.4720841918164479, 'epoch': 0.3000639795265515})

## **Evaluate The model**

In [13]:
results = trainer.evaluate()
results

{'eval_loss': 0.4172724187374115,
 'eval_runtime': 86.6094,
 'eval_samples_per_second': 288.652,
 'eval_steps_per_second': 18.047,
 'epoch': 0.3000639795265515}

## **Save the fine tuned model**

In [14]:
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

('./fine-tuned-model/tokenizer_config.json',
 './fine-tuned-model/special_tokens_map.json',
 './fine-tuned-model/vocab.txt',
 './fine-tuned-model/added_tokens.json',
 './fine-tuned-model/tokenizer.json')

In [15]:
# Testing the model with a movie review
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
model_path = './fine-tuned-model'

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

classifier = pipeline('sentiment-analysis', model = model, tokenizer = tokenizer)

review = "The movie was absolutely fantastic, I loved the story and acting!"
result = classifier(review)

print(result)

Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.9710847735404968}]
