In [None]:
!pip install transformers

In [None]:
from transformers import pipeline

In [None]:
#-------------------------------------------------------#
#        CATALOGUE OF `pipeline(...)` TASKS             #
#-------------------------------------------------------#
# Reference cell: one `pipeline(...)` call per task family.
#
# NOTE: every call that omits `model=` makes the library fall back to its
# default checkpoint for that task, so running the whole cell downloads many
# models. Run only the lines you actually need.
#
# Task names that `transformers.pipeline` does not accept are kept below as
# comments (prefixed with "not a pipeline task") instead of live calls, because
# an unknown task identifier makes `pipeline` raise.

#-------------------------------------------------------#
#                NLP TASKS                              #
#-------------------------------------------------------#

# 1. Text classification: assigning a category to a piece of text
#    (sentiment analysis, spam detection).
classifier = pipeline("text-classification")

# 2. Token classification: assigning a label to each token in a sequence
#    (named entity recognition, part-of-speech tagging).
token_classifier = pipeline("token-classification")

# 3. Question answering: answering a question based on a context.
question_answerer = pipeline("question-answering")

# 4. Text generation: generating new text from a prompt
#    (language modeling, story generation).
generator = pipeline("text-generation")

# 5. Summarization: generating a concise summary of a longer text.
summarizer = pipeline("summarization")

# 6. Translation: converting text from one language to another.
#    The checkpoint fixes the language pair (English -> French).
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr")

# 7. Text2Text generation: general-purpose text-to-text generation from a
#    prompt (covers tasks such as translation and summarization).
text2text_generator = pipeline("text2text-generation")

# 8. Fill-mask: predicting the missing (masked) words in a sentence.
unmasker = pipeline("fill-mask")

# 9. Feature extraction: extracting hidden states / features from text.
feature_extractor = pipeline("feature-extraction")

# 10. Sentence similarity: measuring the similarity between two sentences.
#     Not a pipeline task. Sentence embeddings are usually computed with the
#     separate `sentence-transformers` package (or with the feature-extraction
#     pipeline above) and compared with cosine similarity.
# similarity = pipeline("sentence-similarity")


#-------------------------------------------------------#
#                COMPUTER VISION TASKS                  #
#-------------------------------------------------------#

# 1. Image classification: classifying the main content of an image.
image_classifier = pipeline("image-classification")

# 2. Object detection: identifying and localizing objects in an image.
object_detector = pipeline("object-detection")

# 3. Image segmentation: assigning a label to each pixel in an image.
image_segmenter = pipeline("image-segmentation")

# 4. Image generation: generating new images from a prompt
#    (DALL-E or similar models).
#    Not a pipeline task. Text-to-image models are typically run through the
#    separate `diffusers` package.
# image_generator = pipeline("image-generation")


#-------------------------------------------------------#
#                SPEECH PROCESSING TASKS                #
#-------------------------------------------------------#

# 1. Automatic speech recognition (ASR): converting spoken language into text.
speech_recognizer = pipeline("automatic-speech-recognition")

# 2. Text-to-speech (TTS): converting text into spoken language.
text_to_speech = pipeline("text-to-speech")

# 3. Speech translation: converting speech from one language to another.
#    Not a pipeline task. NOTE(review): a multilingual ASR checkpoint that
#    supports translation could be used through the ASR task instead - confirm
#    which model is intended.
# speech_translator = pipeline("speech-translation")

# 4. Audio classification: classifying the main content of an audio file.
audio_classifier = pipeline("audio-classification")

# 5. Audio transcription: converting spoken language into text.
#    "audio-transcription" is not a pipeline task; transcription is what the
#    ASR pipeline does, so the existing recognizer is reused under this name.
audio_recognizer = speech_recognizer


#-------------------------------------------------------#
#                MULTIMODAL TASKS                       #
#-------------------------------------------------------#

# 1. Image captioning: generating a descriptive caption for an image.
image_captioner = pipeline("image-to-text")

# 2. Visual question answering (VQA): answering a question about an image.
visual_question_answerer = pipeline("visual-question-answering")


#-------------------------------------------------------#
#                OTHER TASKS                            #
#-------------------------------------------------------#

# 1. Table question answering: answering a question about a table.
table_question_answerer = pipeline("table-question-answering")

# 2. Document question answering: extracting answers from documents like PDF.
document_question_answerer = pipeline("document-question-answering")

# 3. Time series forecasting: predicting future values of a time series.
#    Not a pipeline task.
# time_series_forecaster = pipeline("time-series-forecasting")

# 4. Anomaly detection: identifying unusual patterns in a dataset.
#    Not a pipeline task.
# anomaly_detector = pipeline("anomaly-detection")


# **NLP TASKS**

## **Sentiment analysis**

In [None]:
from transformers import pipeline

In [None]:
# Build the sentiment pipeline without naming a model, i.e. with the task's
# default checkpoint.
classifier = pipeline(task="sentiment-analysis")

# The call returns a list; only its first entry is kept and reported.
result = classifier("I hate you")[0]
print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

In [None]:
# Create the default sentiment pipeline and apply it in one expression;
# the prediction is shown as the cell output.
pipeline('sentiment-analysis')('I was very confused with the new batman movie')

In [None]:
# facebook/bart-large-mnli is a natural-language-inference checkpoint; run as a
# plain 'sentiment-analysis' classifier it answers with NLI labels, not with a
# sentiment. It is therefore driven through the zero-shot-classification task
# with explicit sentiment labels to choose from.
pipeline(task='zero-shot-classification', model='facebook/bart-large-mnli')(
    'I was very confused with the new batman movie',
    candidate_labels=['positive', 'negative', 'neutral'],
)

## **Batch Sentiment Analysis**

In [None]:
# Default sentiment pipeline, applied to several sentences in a single call.
classifier = pipeline(task='sentiment-analysis')

# Sentences to score (a list literal needs no line-continuation backslashes).
task_list = [
    'I really like autoencoders, best model for anomaly detection',
    'I am not sure if we can actually evaluate LLMS.',
    'PassiveAggressive is the name of linear regression model that so many people dont know.',
    'I hate long meetings.',
]

classifier(task_list)

In [None]:
# Same batch as before, scored with the SamLowe/roberta-base-go_emotions
# checkpoint instead of the task's default model.
classifier = pipeline(task='sentiment-analysis', model='SamLowe/roberta-base-go_emotions')

task_list = [
    'I really like autoencoders, best model for anomaly detection',
    'I am not sure if we can actually evaluate LLMS.',
    'PassiveAggressive is the name of linear regression model that so many people dont know.',
    'I hate long meetings.',
]

classifier(task_list)

## **Text Generation**

In [None]:
from transformers import pipeline

# Text generator backed by the distilgpt2 checkpoint.
text_generator = pipeline(task='text-generation', model='distilbert/distilgpt2')

# Two continuations of the prompt are requested ...
generated_text = text_generator(
    'Today is a rainy day in London',
    num_return_sequences=2,
    truncation=True,
)

# ... but only the first one is printed.
print('Generated text:\n', generated_text[0]['generated_text'])

## **Question Answering**

In [None]:
from transformers import pipeline

# Question-answering pipeline with the task's default checkpoint.
qa_model = pipeline(task='question-answering')

context = 'I am developing AI models with Python'
question = 'What is my job?'

# Ask the question against the given context; the result is shown as cell output.
answer = qa_model(context=context, question=question)
answer

## **Tokenization**

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification

In [None]:
# Disabled variant, kept for reference only (explicit DistilBERT classes):
#
# model_name1 = 'distilbert-base-uncased-finetuned-sst-2-english'
# my_tokenizer1 = DistilBertTokenizer.from_pretrained(model_name1)
# my_model1 = DistilBertForSequenceClassification.from_pretrained(model_name1)
#
# classifier = pipeline('sentiment-analysis')
# res = classifier('I was not so happy with the Batman movie')
# res

# Active variant: load the model and its tokenizer explicitly through the Auto*
# classes, then hand both objects to the pipeline.
model_name2 = 'nlptown/bert-base-multilingual-uncased-sentiment'
my_model2 = AutoModelForSequenceClassification.from_pretrained(model_name2)
my_tokenizer2 = AutoTokenizer.from_pretrained(model_name2)

classifier = pipeline(task='sentiment-analysis', tokenizer=my_tokenizer2, model=my_model2)
res = classifier('I was not so happy with the Batman movie')
res

In [None]:
from transformers import AutoTokenizer

# Pre-trained tokenizer of the 'bert-base-cased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# Sentence reused by the following tokenization cells.
text = "I was not so happy with the batman movie"

# Split the sentence into tokens (no id conversion yet) and show them.
tokens = tokenizer.tokenize(text)
print("Tokens: ", tokens)

In [None]:
# Convert the tokens produced in the previous cell into their input ids.
# Relies on `tokenizer` and `tokens` defined by the tokenization cell above.
input_ids = tokenizer.convert_tokens_to_ids(tokens)

# Print the input IDs
print("Input IDs: ", input_ids)

In [None]:
# Encode the text in one step: calling the tokenizer directly performs the
# tokenization and the conversion into input ids together.
encoded_input = tokenizer(text)

# Print the encoded input
print("Encoded Input: ", encoded_input)


In [None]:
# Decode the input ids of the encoded sentence back into a string.
decoded_output = tokenizer.decode(encoded_input['input_ids'])

# Print the decoded output
print("Decoded Output: ", decoded_output)

# **Fine Tuning IMDB**

In [None]:
!pip install datasets # if you want to use hugging face datasets so you have to download this library

## **Loading and Preparing the dataset**

In [None]:
from datasets import load_dataset
# Load the 'imdb' dataset; later cells read its 'text' column and its
# 'train' / 'test' splits.
dataset = load_dataset('imdb')

In [None]:
# Display the loaded dataset object as the cell output.
dataset

## **Pre Processing the data**

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')


def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Padding means extending shorter sequences with padding tokens so that all
    sequences share one length. With padding='max_length' and max_length=64
    every sequence is padded to 64 tokens, and truncation=True cuts longer
    sequences down to that same length.
    """
    encoding_options = dict(padding='max_length', truncation=True, max_length=64)
    return tokenizer(examples['text'], **encoding_options)


# batched=True hands the function batches of rows instead of single rows.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Display the tokenized dataset object as the cell output.
tokenized_datasets

## **Setting up the training arguments**

In [None]:
from transformers import TrainingArguments

# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',            # where training outputs are written
    eval_strategy='epoch',
    num_train_epochs=0.3,              # a fraction of one epoch, to keep the run short
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

training_args


## **Initialize the Model**

In [None]:
# Load the pre-trained model and set up the training process.

import numpy as np
from transformers import AutoModelForSequenceClassification, Trainer

# Human-readable names for the two IMDB classes (0 = negative, 1 = positive).
# Storing them in the model config makes predictions come out with these names
# instead of the generic LABEL_0 / LABEL_1.
ID2LABEL = {0: 'NEGATIVE', 1: 'POSITIVE'}
LABEL2ID = {name: idx for idx, name in ID2LABEL.items()}

# Load the pre-trained model with a 2-class head, because there are two labels
# (negative / positive).
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=2,
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)


def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation pass.

    Args:
        eval_pred: the (predictions, label_ids) pair the Trainer passes to
            ``compute_metrics``; the predictions are the model logits.

    Returns:
        dict with a single ``'accuracy'`` entry (fraction of correct predictions).
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    return {'accuracy': float((predicted == labels).mean())}


# Initialize the Trainer. Without compute_metrics, evaluate() would report the
# loss only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)


## **Train the model**

In [None]:
# Run the fine-tuning with the Trainer configured above; the value returned by
# train() is shown as the cell output.
trainer.train()

## **Evaluate the model**

In [None]:
# Evaluate on the Trainer's eval_dataset (the 'test' split passed at construction)
# and display the returned metrics.
results = trainer.evaluate()
results

## **Save the fine tuned model**

In [None]:
# Save the model and the tokenizer into the same directory so that both can be
# reloaded from that single path later on.
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

In [None]:
# Try the saved model on one movie review.
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

model_path = './fine-tuned-model'

# Reload both artifacts from the directory they were saved to.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

classifier = pipeline(task='sentiment-analysis', tokenizer=tokenizer, model=model)

review = "The movie was absolutely fantastic, I loved the story and acting!"
result = classifier(review)

print(result)

# **Arxiv Project**

In [None]:
!pip install arxiv

In [2]:
import arxiv # Research paper related Library
import pandas as pd

In [None]:
# Fetch the most recently submitted AI-related papers from arXiv.
#
# The arXiv query syntax documents upper-case boolean operators (AND / OR /
# ANDNOT) and double quotes around multi-word phrases, so the query is written
# in that form instead of free text joined by a lower-case 'or'.
query = 'all:"artificial intelligence" OR all:"machine learning" OR all:ai'
search = arxiv.Search(query=query, max_results=10, sort_by=arxiv.SortCriterion.SubmittedDate)

# Search.results() is deprecated in current releases of the arxiv package;
# results are obtained through a Client instead.
client = arxiv.Client()

# One record per paper, keeping only the fields used later in the notebook.
papers = [
    {
        'published': result.published,
        'title': result.title,
        'abstract': result.summary,
        'category': result.categories,
    }
    for result in client.results(search)
]

# Convert into a DataFrame
df = pd.DataFrame(papers)

# Show full cell contents (abstracts are long) instead of truncated text.
pd.set_option('display.max_colwidth', None)
df.head(10)

In [None]:
! pip install transformers
from transformers import pipeline

In [None]:
# Use the abstract of the first fetched paper (row label 0) as the input text.
abstract = df.loc[0, 'abstract']

# Summarization pipeline backed by the facebook/bart-large-cnn checkpoint.
summarizer = pipeline(task="summarization", model='facebook/bart-large-cnn')

# Run the summarizer; the result is inspected in a later cell.
summarization_result = summarizer(abstract)

In [None]:
# Mount Google Drive at /content/drive (only available inside Google Colab).
# NOTE(review): no visible cell reads from or writes to the mounted drive -
# confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [8]:
# Display the 'summary_text' field of the first entry of the summarization result.
summarization_result[0]['summary_text']

'Genie Envisioner is a unified world foundation platform for robotic manipulation. It integrates policy learning, evaluation, and simulation within a single video-generative framework. GE-Base is a large-scale, instruction-conditioned video diffusion model that captures dynamics of real-world robotic interactions in structured latent space.'