***

1. Basic

In [1]:
# Import necessary libraries
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Fetch, in order, every NLTK resource the examples in this cell rely on
for resource_name in (
    'punkt',
    'stopwords',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'wordnet',
):
    nltk.download(resource_name)

# Sample text fed to the tokenization, preprocessing and NER examples below
text = "Hello! My name is John. I'm learning NLP. NLP is an exciting field of AI."

# Tokenization
def tokenization_example(text):
    """Print the word-level and sentence-level NLTK tokenizations of ``text``.

    Args:
        text: Raw string to tokenize.
    """
    print("\nTokenization:")
    # Both tokenizations are computed before anything is printed.
    word_tokens, sentence_list = word_tokenize(text), sent_tokenize(text)
    for label, tokens in (("Words:", word_tokens), ("Sentences:", sentence_list)):
        print(label, tokens)

# Text Preprocessing: Lowercasing and Removing Punctuation
def preprocessing_example(text):
    """Lowercase ``text`` and drop every character that is neither a word
    character nor whitespace, printing and returning the result.

    Args:
        text: Raw string to normalize.

    Returns:
        The normalized string.
    """
    print("\nText Preprocessing:")
    non_word_chars = re.compile(r'[^\w\s]')
    cleaned = non_word_chars.sub('', text.lower())
    print("Processed Text:", cleaned)
    return cleaned

# Stop Words Removal
def stopwords_removal_example(text):
    """Tokenize ``text`` and drop tokens found in NLTK's English stop-word list.

    Matching is an exact, case-sensitive string comparison against the list,
    so callers are expected to normalize case beforehand if they need to.

    Args:
        text: String to tokenize and filter.

    Returns:
        List of tokens, in original order, that are not stop words.
    """
    print("\nStop Words Removal:")
    # Load the list once and keep it as a set: the previous version called
    # stopwords.words('english') again for every token and scanned it linearly.
    english_stopwords = set(stopwords.words('english'))
    filtered_words = [
        word for word in word_tokenize(text) if word not in english_stopwords
    ]
    print("Filtered Words:", filtered_words)
    return filtered_words

# POS Tagging
def pos_tagging_example(words):
    """Print the (token, tag) pairs produced by NLTK's ``pos_tag`` for ``words``.

    Args:
        words: Sequence of token strings to tag.
    """
    print("\nPOS Tagging:")
    print("POS Tags:", pos_tag(words))

# Named Entity Recognition (NER)
def ner_example(text):
    """Print the named-entity chunk tree NLTK derives from ``text``.

    The text is word-tokenized, POS-tagged and then passed to ``ne_chunk``.

    Args:
        text: Raw string to analyse.
    """
    print("\nNamed Entity Recognition:")
    tagged_tokens = pos_tag(word_tokenize(text))
    entity_tree = ne_chunk(tagged_tokens)
    print(entity_tree)

# Stemming and Lemmatization
def stemming_and_lemmatization_example(words):
    """Print the Porter stem and the WordNet lemma of every token in ``words``.

    Every token is lemmatized with ``pos='v'``.

    Args:
        words: Iterable of token strings.
    """
    print("\nStemming and Lemmatization:")
    stemmer, wordnet = PorterStemmer(), WordNetLemmatizer()
    for token in words:
        stem = stemmer.stem(token)
        lemma = wordnet.lemmatize(token, pos='v')
        print(f"{token} -> Stem: {stem}, Lemma: {lemma}")

# Text Encoding
def text_encoding_example(documents):
    """Print Bag-of-Words and TF-IDF encodings of ``documents``.

    For each encoding the dense matrix is printed, followed by the fitted
    vectorizer's feature names.

    Args:
        documents: Iterable of raw text documents.
    """
    print("\nText Encoding:")
    # (heading, vectorizer class) pairs; each vectorizer is created only when
    # its turn comes, so Bag of Words is fully printed before TF-IDF starts.
    encodings = (
        ("Bag of Words:\n", CountVectorizer),
        ("\nTF-IDF:\n", TfidfVectorizer),
    )
    for heading, vectorizer_cls in encodings:
        encoder = vectorizer_cls()
        matrix = encoder.fit_transform(documents)
        print(heading, matrix.toarray())
        print("Feature Names:", encoder.get_feature_names_out())

# Cosine Similarity
def cosine_similarity_example(documents):
    """Print the pairwise cosine-similarity matrix of ``documents``.

    Documents are vectorized with a default ``TfidfVectorizer`` first.

    Args:
        documents: Iterable of raw text documents.
    """
    print("\nCosine Similarity:")
    doc_vectors = TfidfVectorizer().fit_transform(documents)
    pairwise_scores = cosine_similarity(doc_vectors)
    print("Cosine Similarity Matrix:\n", pairwise_scores)

# Three short documents used by the text-encoding and cosine-similarity examples
documents = [
    "I love programming in Python. Python is great for NLP.",
    "NLP is an exciting field. It involves processing text and speech.",
    "Text preprocessing is an important step in NLP."
]

# Run the examples in pipeline order; later steps consume earlier results
tokenization_example(text)
# The lowercased, punctuation-free text is what stop-word removal receives
processed_text = preprocessing_example(text)
filtered_words = stopwords_removal_example(processed_text)
# POS tagging works on the filtered tokens
pos_tagging_example(filtered_words)
# NER is given the original text, not the preprocessed version
ner_example(text)
stemming_and_lemmatization_example(filtered_words)
text_encoding_example(documents)
cosine_similarity_example(documents)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...



Tokenization:
Words: ['Hello', '!', 'My', 'name', 'is', 'John', '.', 'I', "'m", 'learning', 'NLP', '.', 'NLP', 'is', 'an', 'exciting', 'field', 'of', 'AI', '.']
Sentences: ['Hello!', 'My name is John.', "I'm learning NLP.", 'NLP is an exciting field of AI.']

Text Preprocessing:
Processed Text: hello my name is john im learning nlp nlp is an exciting field of ai

Stop Words Removal:
Filtered Words: ['hello', 'name', 'john', 'im', 'learning', 'nlp', 'nlp', 'exciting', 'field', 'ai']

POS Tagging:
POS Tags: [('hello', 'NN'), ('name', 'NN'), ('john', 'NN'), ('im', 'NN'), ('learning', 'VBG'), ('nlp', 'JJ'), ('nlp', 'JJ'), ('exciting', 'VBG'), ('field', 'NN'), ('ai', 'NN')]

Named Entity Recognition:
(S
  (GPE Hello/NN)
  !/.
  My/PRP$
  name/NN
  is/VBZ
  (PERSON John/NNP)
  ./.
  I/PRP
  'm/VBP
  learning/VBG
  (ORGANIZATION NLP/NNP)
  ./.
  (ORGANIZATION NLP/NNP)
  is/VBZ
  an/DT
  exciting/JJ
  field/NN
  of/IN
  (ORGANIZATION AI/NNP)
  ./.)

Stemming and Lemmatization:
hello -> Stem: 

2. Intermediate

In [2]:
# Import necessary libraries
import nltk
import spacy
from gensim.models import Word2Vec, KeyedVectors
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GRU
from transformers import BertTokenizer, TFBertModel
import numpy as np
import re

# Fetch, in order, the NLTK resources requested by this cell
for resource_name in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords'):
    nltk.download(resource_name)

# Load the small English spaCy pipeline
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "Natural Language Processing (NLP) is a field of Artificial Intelligence (AI) that focuses on the interaction between computers and humans through natural language."

# Advanced Embeddings
def word2vec_example(sentences):
    """Train a Word2Vec model on ``sentences`` and print the vector for 'language'.

    Args:
        sentences: Iterable of tokenized sentences (lists of strings); it must
            contain the token 'language', which is looked up after training.
    """
    print("\nWord2Vec Embeddings:")
    hyperparams = dict(vector_size=100, window=5, min_count=1, workers=4)
    trained = Word2Vec(sentences, **hyperparams)
    language_vector = trained.wv['language']
    print("Word2Vec vector for 'language':", language_vector)

# Two pre-tokenized, lowercase sentences used as the Word2Vec training corpus;
# both contain "language", the token whose vector word2vec_example() prints
sentences = [
    ["natural", "language", "processing", "nlp", "field", "artificial", "intelligence"],
    ["focuses", "interaction", "computers", "humans", "natural", "language"]
]

# Load pre-trained word vectors
def load_pretrained_word_vectors():
    """Load binary word2vec-format vectors from disk and print the one for 'language'.

    The file 'GoogleNews-vectors-negative300.bin' is read from the current
    working directory.
    """
    print("\nLoading Pre-trained Word Vectors:")
    vectors_path = 'GoogleNews-vectors-negative300.bin'
    pretrained = KeyedVectors.load_word2vec_format(vectors_path, binary=True)
    print("Pre-trained vector for 'language':", pretrained['language'])

# RNNs, LSTMs, and GRUs
def _build_recurrent_classifier(recurrent_layer, vocab_size, embedding_dim, input_length):
    """Assemble and compile Input -> Embedding -> ``recurrent_layer`` -> sigmoid Dense."""
    model = Sequential([
        # An explicit Input layer replaces Embedding's deprecated `input_length`
        # argument and ensures the model is built before summary() is called.
        tf.keras.Input(shape=(input_length,), dtype='int32'),
        Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        recurrent_layer,
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model


def rnn_lstm_gru_example():
    """Build and summarize three tiny binary classifiers: SimpleRNN, LSTM and GRU.

    A three-word sample sentence is cleaned, indexed and encoded only to derive
    the vocabulary size and sequence length; the models are compiled and their
    summaries printed, but nothing is trained.
    """
    print("\nRNN, LSTM, and GRU Example:")

    # Sample data: strip punctuation, lowercase, split on whitespace
    tokens = re.sub(r'[^\w\s]', '', "NLP is fun.").lower().split()
    # dict.fromkeys keeps first-occurrence order, so word ids are identical on
    # every run; iterating a set of strings is not stable between interpreter
    # runs.
    word_index = {word: i for i, word in enumerate(dict.fromkeys(tokens))}
    sequences = np.array([[word_index[word] for word in tokens]])

    # Model parameters
    vocab_size = len(word_index)
    embedding_dim = 8
    input_length = sequences.shape[1]

    # One (heading, recurrent layer) pair per architecture; everything else
    # about the three models is identical.
    variants = (
        ("RNN Model Summary:", tf.keras.layers.SimpleRNN(10)),
        ("\nLSTM Model Summary:", LSTM(10)),
        ("\nGRU Model Summary:", GRU(10)),
    )
    for heading, recurrent_layer in variants:
        model = _build_recurrent_classifier(
            recurrent_layer, vocab_size, embedding_dim, input_length
        )
        print(heading)
        model.summary()

# Attention Mechanism
def attention_mechanism_example():
    """Run Keras ``Attention`` on a random tensor and print the output shapes.

    A random (1, 5, 10) input is projected by two separate Dense layers into a
    query and a value, which are then passed to the attention layer.
    """
    print("\nAttention Mechanism Example:")

    # Sample input: one batch item, 5 timesteps, 10 features
    timesteps, feature_dim = 5, 10
    sample_batch = tf.random.normal((1, timesteps, feature_dim))

    # Attention layer plus independent query/value projections
    attention_layer = tf.keras.layers.Attention()
    query_proj = tf.keras.layers.Dense(feature_dim)(sample_batch)
    value_proj = tf.keras.layers.Dense(feature_dim)(sample_batch)
    context, scores = attention_layer(
        [query_proj, value_proj], return_attention_scores=True
    )

    print("Context Vector Shape:", context.shape)
    print("Attention Weights Shape:", scores.shape)

# Transformers and BERT
def transformers_bert_example():
    """Encode one sentence with 'bert-base-uncased' and print the hidden-state shape.

    Both the tokenizer and the TensorFlow model are loaded via
    ``from_pretrained``.
    """
    print("\nTransformers and BERT Example:")

    # Tokenizer first, then model, both from the same checkpoint
    checkpoint = 'bert-base-uncased'
    bert_tokenizer = BertTokenizer.from_pretrained(checkpoint)
    bert_model = TFBertModel.from_pretrained(checkpoint)

    # Tokenize the sample sentence into TensorFlow tensors and run the model
    encoded = bert_tokenizer("NLP is a fascinating field.", return_tensors='tf')
    hidden_states = bert_model(encoded).last_hidden_state

    print("BERT Model Outputs Shape:", hidden_states.shape)

# Run the intermediate examples in order
word2vec_example(sentences)
# Left disabled because it reads 'GoogleNews-vectors-negative300.bin' from disk;
# uncomment the following line after downloading that model file
# load_pretrained_word_vectors()
rnn_lstm_gru_example()
attention_mechanism_example()
transformers_bert_example()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Word2Vec Embeddings:
Word2Vec vector for 'language': [-5.3622725e-04  2.3643136e-04  5.1033497e-03  9.0092728e-03
 -9.3029495e-03 -7.1168090e-03  6.4588725e-03  8.9729885e-03
 -5.0154282e-03 -3.7633716e-03  7.3805046e-03 -1.5334714e-03
 -4.5366134e-03  6.5540518e-03 -4.8601604e-03 -1.8160177e-03
  2.8765798e-03  9.9187379e-04 -8.2852151e-03 -9.4488179e-03
  7.3117660e-03  5.0702621e-03  6.7576934e-03  7.6286553e-04
  6.3508903e-03 -3.4053659e-03 -9.4640139e-04  5.7685734e-03
 -7.5216377e-03 -3.9361035e-03 -7.5115822e-03 -9.3004224e-04
  9.5381187e-03 -7.3191668e-03 -2.3337686e-03 -1.9377411e-03
  8.0774371e-03 -5.9308959e-03  4.5162440e-05 -4.7537340e-03
 -9.6035507e-03  5.0072931e-03 -8.7595852e-03 -4.3918253e-03
 -3.5099984e-05 -2.9618145e-04 -7.6612402e-03  9.6147433e-03
  4.9820580e-03  9.2331432e-03 -8.1579173e-03  4.4957981e-03
 -4.1370760e-03  8.2453608e-04  8.4986202e-03 -4.4621765e-03
  4.5175003e-03 -6.7869602e-03 -3.5484887e-03  9.3985079e-03
 -1.5776526e-03  3.2137157e-04 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

BERT Model Outputs Shape: (1, 9, 768)


3. Advanced

In [3]:
# Import necessary libraries
import nltk
import spacy
from gensim.models import Word2Vec
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GRU, Attention
from transformers import BertTokenizer, TFBertModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import re
from nltk.translate.bleu_score import sentence_bleu
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

# Fetch, in order, the NLTK resources requested by this cell
for resource_name in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords'):
    nltk.download(resource_name)

# Load the small English spaCy pipeline
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "Natural Language Processing (NLP) is a field of Artificial Intelligence (AI) that focuses on the interaction between computers and humans through natural language."

# Sequence-to-Sequence Models (Encoder-Decoder)
def seq2seq_example():
    """Build and summarize a toy LSTM encoder-decoder for English -> French pairs.

    The model is only constructed and compiled; no training or inference is
    run. A single Keras ``Tokenizer`` is fitted on the source and target texts
    together, so both sides share one vocabulary and one ``vocab_size``.
    """
    print("\nSequence-to-Sequence Example:")
    # Example sentence pairs for translation
    input_texts = ["Hello", "How are you?", "Goodbye"]
    target_texts = ["Bonjour", "Comment ça va?", "Au revoir"]

    # Tokenization (shared vocabulary across both languages)
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(input_texts + target_texts)
    input_sequences = tokenizer.texts_to_sequences(input_texts)
    target_sequences = tokenizer.texts_to_sequences(target_texts)

    # Padding: pad at the end so every sequence in a set has equal length
    input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, padding='post')
    target_sequences = tf.keras.preprocessing.sequence.pad_sequences(target_sequences, padding='post')

    # Encoder-Decoder Model
    vocab_size = len(tokenizer.word_index) + 1  # +1 accounts for index 0 used by padding
    embedding_dim = 8
    input_length = input_sequences.shape[1]
    target_length = target_sequences.shape[1]

    # Encoder: keep only the final hidden and cell states
    encoder_inputs = tf.keras.layers.Input(shape=(input_length,))
    encoder_embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(encoder_inputs)
    encoder_lstm = LSTM(50, return_state=True)
    _, state_h, state_c = encoder_lstm(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder: starts from the encoder states and emits one softmax over the
    # vocabulary per target timestep
    decoder_inputs = tf.keras.layers.Input(shape=(target_length,))
    decoder_embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(decoder_inputs)
    decoder_lstm = LSTM(50, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
    decoder_dense = Dense(vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    model = tf.keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # summary() prints the table itself and returns None, so wrapping it in
    # print() emitted a stray "None" line after the summary.
    model.summary()

# Text Summarization using T5
def text_summarization_example():
    """Summarize a fixed two-sentence passage with 't5-small' and print the result.

    The passage is prefixed with "summarize: " and decoded with beam search
    (5 beams, at most 50 tokens, early stopping).
    """
    print("\nText Summarization Example:")
    checkpoint = 't5-small'
    t5_model = TFT5ForConditionalGeneration.from_pretrained(checkpoint)
    t5_tokenizer = T5Tokenizer.from_pretrained(checkpoint)

    article = "Natural Language Processing (NLP) is a field of Artificial Intelligence (AI) that focuses on the interaction between computers and humans through natural language. The ultimate objective of NLP is to enable computers to understand, interpret, and generate human languages in a way that is valuable."
    prompt_ids = t5_tokenizer.encode("summarize: " + article, return_tensors="tf")
    generated_ids = t5_model.generate(
        prompt_ids, max_length=50, num_beams=5, early_stopping=True
    )
    # Only the first (and only) generated sequence is decoded
    decoded_summary = t5_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print("Summary:", decoded_summary)

# Machine Translation using MarianMT
def machine_translation_example():
    """Translate one English sentence with 'Helsinki-NLP/opus-mt-en-fr' and print it.

    Unlike the other examples in this cell, the tokenizer output is requested
    as PyTorch tensors (``return_tensors="pt"``).
    """
    print("\nMachine Translation Example:")
    from transformers import MarianMTModel, MarianTokenizer

    checkpoint = 'Helsinki-NLP/opus-mt-en-fr'
    marian_tokenizer = MarianTokenizer.from_pretrained(checkpoint)
    marian_model = MarianMTModel.from_pretrained(checkpoint)

    source_sentence = "NLP is a fascinating field."
    batch = marian_tokenizer(source_sentence, return_tensors="pt", padding=True)
    generated = marian_model.generate(**batch)

    # Decode every generated sequence; only the first one is printed
    decoded = []
    for token_ids in generated:
        decoded.append(marian_tokenizer.decode(token_ids, skip_special_tokens=True))

    print("Translation:", decoded[0])

# Topic Modeling using LDA
def topic_modeling_example(n_topics=2, n_top_terms=5):
    """Fit LDA on four sample documents and print the top terms of each topic.

    Args:
        n_topics: Number of LDA components to fit. Defaults to 2, the value
            that was previously hard-coded.
        n_top_terms: How many highest-weighted terms to print per topic.
            Defaults to 5, matching the previous hard-coded slice.
    """
    print("\nTopic Modeling Example:")
    documents = [
        "I love reading books about AI and NLP.",
        "Natural language processing is an interesting field.",
        "Deep learning is a key technology in AI.",
        "I enjoy studying machine learning and artificial intelligence."
    ]

    # NOTE(review): LDA is usually fit on raw term counts (CountVectorizer);
    # TF-IDF weights are kept here so the printed topics do not change.
    # Confirm which input is intended.
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(documents)

    # Fixed random_state keeps the fitted topics reproducible
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(X)

    terms = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(lda.components_):
        # argsort is ascending, so reverse it and keep the first n_top_terms
        top_indices = topic.argsort()[::-1][:n_top_terms]
        print(f"Topic {idx + 1}:")
        print(" ".join(terms[i] for i in top_indices))

# Run the advanced examples in order; each one is self-contained and takes no arguments
seq2seq_example()
text_summarization_example()
machine_translation_example()
topic_modeling_example()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Sequence-to-Sequence Example:
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 3)]                  0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 3)]                  0         []                            
                                                                                                  
 embedding_3 (Embedding)     (None, 3, 8)                 96        ['input_1[0][0]']             
                                                                                                  
 embedding_4 (Embedding)     (None, 3, 8)                 96        ['input_2[0][0]']             
                                                               

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Summary: natural language processing (NLP) focuses on the interaction between computers and humans through natural language. the ultimate objective of NLP is to enable computers to understand, interpret, and generate human languages.

Machine Translation Example:


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Translation: Le NLP est un domaine fascinant.

Topic Modeling Example:
Topic 1:
learning deep key technology machine
Topic 2:
reading nlp love books language


****