<a href="https://colab.research.google.com/github/ravinnd3/Generative-AI-Full-Course/blob/main/HuggingFace_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Shell out to `nvidia-smi` to show which NVIDIA GPU (if any) this runtime has.
!nvidia-smi

In [None]:
!pip install transformers

In [None]:
from transformers import pipeline

In [None]:
# Build a sentiment-analysis pipeline using the library's default model
classifier = pipeline("sentiment-analysis")

# The pipeline accepts a list of sentences and returns one prediction per sentence
sentences = ["We are very happy to introduce pipeline to the transformers repository."]
results = classifier(sentences)

# Report each predicted label with its score rounded to 4 decimal places
for prediction in results:
    label = prediction['label']
    score = round(prediction['score'], 4)
    print(f"label: {label}, with score: {score}")

In [None]:
# Create a fresh sentiment-analysis pipeline, then classify a single sentence.
# The bare last expression makes the prediction the cell's displayed output.
sentiment_pipeline = pipeline(task="sentiment-analysis")
sentiment_pipeline("I was confused with Barbie movie")

### Tokenization


In [None]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification

In [None]:
# Hub identifier of the checkpoint to load (going by its name, a DistilBERT
# model fine-tuned on SST-2 for English sentiment classification).
model = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english'

# Load the tokenizer and the sequence-classification model from that same
# checkpoint so their vocabularies agree.
mytokenizer = AutoTokenizer.from_pretrained(model)
mymodel = AutoModelForSequenceClassification.from_pretrained(model)

In [None]:
# Assemble a sentiment pipeline from the explicitly loaded model and tokenizer
# instead of relying on the pipeline's default checkpoint.
custom_classifier = pipeline(
    "sentiment-analysis",
    model=mymodel,
    tokenizer=mytokenizer,
)

# Classify one sentence and print the raw prediction
print(custom_classifier("I was confused with Barbie movie"))

### Visualizing the vector generated by the tokenizer of a transformer model

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer of the pre-trained "bert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


# Example text
text = "I was not so happy with Barbie Movie"

# Tokenize the text: split it into the tokenizer's string tokens
tokens = tokenizer.tokenize(text)

# Print the tokens
print(tokens)



In [None]:
# Convert the string tokens produced above into their integer vocabulary IDs
input_ids = tokenizer.convert_tokens_to_ids(tokens)

# Print the input IDs
print("input_ids", input_ids)

In [None]:
# Encode the text in one step (tokenize + convert to input IDs).
# NOTE: calling the tokenizer returns the whole encoding rather than a bare list
# of IDs (per the Hugging Face tokenizer docs), so despite its name `input_ids`
# now holds that encoding object; the IDs themselves sit under its "input_ids" key.

input_ids = tokenizer(text)

# Print the encoding
print("input_ids", input_ids)

In [None]:
# Decode the token IDs back into text.
# `input_ids` holds the full encoding returned by calling the tokenizer on
# `text` (a mapping, not a list), so extract the ID list before decoding;
# passing the whole mapping to `decode` raises a TypeError.
decoded_text = tokenizer.decode(input_ids["input_ids"])

# Print the decoded text
print("Decoded Text:", decoded_text)

### Fine-tuning on the IMDB dataset from Hugging Face

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the dataset registered as "imdb"; later cells use its "train" and "test" splits
dataset = load_dataset("imdb")

In [None]:
# Display a summary of the loaded dataset object
dataset

## Preprocess the Data

In [None]:
from transformers import AutoTokenizer

# Tokenizer used to preprocess the dataset. Token IDs are only meaningful to a
# model loaded from this same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is padded to the tokenizer's maximum length and truncated
    if it is longer, so all encoded examples come out the same length.
    """
    encoded = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
    )
    return encoded


# Apply the tokenizer to the whole dataset, one batch of examples per call
tokenized_datasets = dataset.map(tokenize_function, batched=True)



In [None]:
# Display a summary of the tokenized dataset object
tokenized_datasets

In [None]:
# Inspect a single tokenized training example (index 1)
tokenized_datasets['train'][1]

# Set up the training arguments

In [None]:
# Show the current working directory; the relative paths used below resolve against it
%pwd

In [None]:
from transformers import TrainingArguments

# Hyperparameters and bookkeeping options for the fine-tuning run
training_args = TrainingArguments(
    output_dir="./content/results",    # output directory for the run
    eval_strategy="epoch",             # evaluate at the end of every epoch
    learning_rate=2e-5,                # learning rate
    per_device_train_batch_size=16,    # batch size for training
    per_device_eval_batch_size=16,     # batch size for evaluation
    num_train_epochs=3,                # number of training epochs
    weight_decay=0.01,                 # strength of weight decay
)

# Display the resulting configuration
training_args

### Initialise the model

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer

# Load the pretrained model with a 2-label sequence-classification head.
# The checkpoint must be the one the dataset was tokenized with
# ("bert-base-cased"): the cased and uncased BERT checkpoints use different
# vocabularies, so mixing them feeds the model token IDs that point at the
# wrong embeddings.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

# Initialize the Trainer with the training arguments and the tokenized splits
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=tokenized_datasets["train"],
                  eval_dataset=tokenized_datasets["test"])

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell
trainer.train()

# Evaluate the model

In [None]:
# Evaluate with the Trainer (it was given the tokenized "test" split as its
# eval dataset) and print the returned metrics
results = trainer.evaluate()
print(results)

## Save the model and trained tokeniser

In [None]:
# Write the model and the tokenizer into the same directory so they can later
# be reloaded together from one path.
save_dir = "./content/fine_tuned_models"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)