In [1]:
pip install nltk





[notice] A new release of pip is available: 23.2.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [3]:
# Sample text
text = "John Doe works at OpenAI in San Francisco. He loves programming and exploring new AI technologies."


# Tokenization:

# Splitting text into individual words or phrases.
# Example: "Natural Language Processing" -> ["Natural", "Language", "Processing"]

In [4]:
# Tokenization
tokens = word_tokenize(text)
print("Tokens:", tokens)

Tokens: ['John', 'Doe', 'works', 'at', 'OpenAI', 'in', 'San', 'Francisco', '.', 'He', 'loves', 'programming', 'and', 'exploring', 'new', 'AI', 'technologies', '.']


# Stop Words Removal:

# Removing common words that do not contribute much to the meaning (e.g., "and", "the").

In [5]:
# Stop Words Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print("Filtered Tokens:", filtered_tokens)

Filtered Tokens: ['John', 'Doe', 'works', 'OpenAI', 'San', 'Francisco', '.', 'loves', 'programming', 'exploring', 'new', 'AI', 'technologies', '.']


# Stemming and Lemmatization:

# Reducing words to their root form.
# Stemming: "running" -> "run"
# Lemmatization: "running" -> "run" (more accurate, considers context)


In [6]:

# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("Stemmed Tokens:", stemmed_tokens)

Stemmed Tokens: ['john', 'doe', 'work', 'openai', 'san', 'francisco', '.', 'love', 'program', 'explor', 'new', 'ai', 'technolog', '.']


# Part-of-Speech (POS) Tagging:

# Identifying the grammatical category of each word.
# Example: "Natural" (Adjective), "Language" (Noun), "Processing" (Verb)

In [8]:
# POS Tagging
pos_tags = pos_tag(tokens)
print("POS Tags:", pos_tags)

POS Tags: [('John', 'NNP'), ('Doe', 'NNP'), ('works', 'VBZ'), ('at', 'IN'), ('OpenAI', 'NNP'), ('in', 'IN'), ('San', 'NNP'), ('Francisco', 'NNP'), ('.', '.'), ('He', 'PRP'), ('loves', 'VBZ'), ('programming', 'VBG'), ('and', 'CC'), ('exploring', 'VBG'), ('new', 'JJ'), ('AI', 'NNP'), ('technologies', 'NNS'), ('.', '.')]


# POS Tags Explained
NNP: Proper noun, singular
VBZ: Verb, 3rd person singular present
IN: Preposition or subordinating conjunction
.: Punctuation mark, period
PRP: Personal pronoun
VBG: Verb, gerund or present participle
CC: Coordinating conjunction
JJ: Adjective
NNS: Noun, plural

# The POS tags help in understanding the syntactic structure of the sentence. They can be used for various NLP tasks such as:

# Parsing: Building a syntactic parse tree of the sentence.
# Information Extraction: Identifying entities, relations, and events in text.
# Text-to-Speech: Determining the correct pronunciation based on word context.
# Machine Translation: Ensuring correct grammatical structure in translations.

# Named Entity Recognition (NER):

# Identifying and classifying named entities in text (e.g., names, dates, locations).

In [9]:

# Named Entity Recognition (NER)
named_entities = ne_chunk(pos_tags)
print("Named Entities:", named_entities)

Named Entities: (S
  (PERSON John/NNP)
  (ORGANIZATION Doe/NNP)
  works/VBZ
  at/IN
  (ORGANIZATION OpenAI/NNP)
  in/IN
  (GPE San/NNP Francisco/NNP)
  ./.
  He/PRP
  loves/VBZ
  programming/VBG
  and/CC
  exploring/VBG
  new/JJ
  AI/NNP
  technologies/NNS
  ./.)


In [10]:
# Display Named Entities
named_entities.draw()

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents
documents = [
    "The cat sat on the mat",
    "The dog ate my homework",
    "Cats and dogs are great pets"
]

# Create the Bag of Words model
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Convert the matrix to a dense format and print
print("Bag of Words Matrix:\n", X.toarray())

# Print the feature names (words)
print("Feature Names:\n", vectorizer.get_feature_names_out())


Bag of Words Matrix:
 [[0 0 0 1 0 0 0 0 0 1 0 1 0 1 2]
 [0 0 1 0 0 1 0 0 1 0 1 0 0 0 1]
 [1 1 0 0 1 0 1 1 0 0 0 0 1 0 0]]
Feature Names:
 ['and' 'are' 'ate' 'cat' 'cats' 'dog' 'dogs' 'great' 'homework' 'mat' 'my'
 'on' 'pets' 'sat' 'the']


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
documents = [
    "The cat sat on the mat",
    "The dog ate my homework",
    "Cats and dogs are great pets"
]

# Create the TF-IDF model
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

# Convert the matrix to a dense format and print
print("TF-IDF Matrix:\n", X.toarray())

# Print the feature names (words)
print("Feature Names:\n", vectorizer.get_feature_names_out())


TF-IDF Matrix:
 [[0.         0.         0.         0.39798027 0.         0.
  0.         0.         0.         0.39798027 0.         0.39798027
  0.         0.39798027 0.60534851]
 [0.         0.         0.46735098 0.         0.         0.46735098
  0.         0.         0.46735098 0.         0.46735098 0.
  0.         0.         0.35543247]
 [0.40824829 0.40824829 0.         0.         0.40824829 0.
  0.40824829 0.40824829 0.         0.         0.         0.
  0.40824829 0.         0.        ]]
Feature Names:
 ['and' 'are' 'ate' 'cat' 'cats' 'dog' 'dogs' 'great' 'homework' 'mat' 'my'
 'on' 'pets' 'sat' 'the']


In [17]:
pip install gensim

Collecting gensimNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for gensim from https://files.pythonhosted.org/packages/ab/b0/d58dc405fd60ab546ca714321235dc2d455b2dc06bfb4fc1092940c749fc/gensim-4.3.2-cp310-cp310-win_amd64.whl.metadata
  Downloading gensim-4.3.2-cp310-cp310-win_amd64.whl.metadata (8.5 kB)
Downloading gensim-4.3.2-cp310-cp310-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.2/24.0 MB 6.9 MB/s eta 0:00:04
    --------------------------------------- 0.5/24.0 MB 6.7 MB/s eta 0:00:04
   - -------------------------------------- 0.7/24.0 MB 6.0 MB/s eta 0:00:04
   - -------------------------------------- 0.9/24.0 MB 5.8 MB/s eta 0:00:04
   - -------------------------------------- 1.1/24.0 MB 6.3 MB/s eta 0:00:04
   - -------------------------------------- 1.2/24.0 MB 5.1 MB/s eta 0:00:05
   -- ------------------------------------- 1.


[notice] A new release of pip is available: 23.2.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
import gensim
from gensim.models import Word2Vec

# Sample sentences
sentences = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "dog", "ate", "my", "homework"],
    ["cats", "and", "dogs", "are", "great", "pets"]
]

# Train the Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Get the vector for a specific word
vector = model.wv['cat']
print("Word Vector for 'cat':\n", vector)

# Find most similar words
similar_words = model.wv.most_similar('cat')
print("Words most similar to 'cat':\n", similar_words)


Word Vector for 'cat':
 [ 7.6966463e-03  9.1206422e-03  1.1355019e-03 -8.3250795e-03
  8.4250160e-03 -3.6962307e-03  5.7421732e-03  4.3915794e-03
  9.6899448e-03 -9.2934975e-03  9.2084054e-03 -9.2815282e-03
 -6.9077122e-03 -9.1021946e-03 -5.5471100e-03  7.3688962e-03
  9.1644777e-03 -3.3253515e-03  3.7230505e-03 -3.6252034e-03
  7.8814710e-03  5.8668759e-03  2.0861626e-07 -3.6286747e-03
 -7.2243060e-03  4.7686161e-03  1.4529788e-03 -2.6131857e-03
  7.8378068e-03 -4.0496145e-03 -9.1489861e-03 -2.2554707e-03
  1.2514711e-04 -6.6392552e-03 -5.4866159e-03 -8.4997769e-03
  9.2298733e-03  7.4240281e-03 -2.9524326e-04  7.3676636e-03
  7.9507884e-03 -7.8357337e-04  6.6120909e-03  3.7675237e-03
  5.0768424e-03  7.2529912e-03 -4.7393893e-03 -2.1855331e-03
  8.7312341e-04  4.2362059e-03  3.3043313e-03  5.0958274e-03
  4.5864857e-03 -8.4385090e-03 -3.1838394e-03 -7.2367596e-03
  9.6814223e-03  5.0065992e-03  1.7084122e-04  4.1129780e-03
 -7.6561309e-03 -6.2946510e-03  3.0763936e-03  6.5346383e-03


In [20]:
pip install transformers

Collecting transformersNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/6a/dc/23c26b7b0bce5aaccf2b767db3e9c4f5ae4331bd47688c1f2ef091b23696/transformers-4.42.4-py3-none-any.whl.metadata
  Downloading transformers-4.42.4-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.6 kB ? eta -:--:--
     ------------------------------------- -- 41.0/43.6 kB 1.9 MB/s eta 0:00:01
     ------------------------------------- -- 41.0/43.6 kB 1.9 MB/s eta 0:00:01
     -------------------------------------- 43.6/43.6 kB 355.2 kB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Obtaining dependency information for huggingface-hub<1.0,>=0.23.2 from https://files.pythonhosted.org/packages/69/d6/73f9d1b7c4da5f0544bc17680d0fa9932445423b90cd38e1ee77d001a4f5/huggingface_hub-0.23.4-py3-none-any.whl.metadata
  Downloading huggingface_hub-0


[notice] A new release of pip is available: 23.2.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [24]:
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer

# Load pre-trained model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Initialize the pipeline for text generation
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Generate text
prompt = "The future of AI is"
generated_text = generator(prompt, max_length=50, num_return_sequences=1)
print("Generated Text:\n", generated_text)

# BERT for masked word prediction
from transformers import BertTokenizer, BertForMaskedLM
import torch

# Load pre-trained model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Encode text and create mask
# text = "The quick brown fox [MASK] over the lazy dog."
text = "Dang! I’m out fishing and a huge trout just [MASK] my line!"
input_ids = tokenizer.encode(text, return_tensors='pt')
mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]

# Predict masked word
with torch.no_grad():
    output = model(input_ids)
    mask_token_logits = output.logits[0, mask_token_index, :]
    mask_token_id = torch.argmax(mask_token_logits, dim=1)
    predicted_token = tokenizer.decode(mask_token_id)

print("Predicted masked word:\n", predicted_token)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text:
 [{'generated_text': 'The future of AI is in the hands of people for free. And with a computer, computers are actually more powerful. You need a smart machine to do everything, or you need a computer to solve problems with a human. And now, with AI'}]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Predicted masked word:
 hit


In [None]:
# Print the feature names (words)
print("Feature Names:\n", vectorizer.get_feature_names_out()
      

In [1]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

max_features = 20000  # Only consider the top 20k words
maxlen = 200  # Only consider the first 200 words of each movie review

def get_model():
    # Input for variable-length sequences of integers
    inputs = keras.Input(shape=(None,), dtype="int32")
    # Embed each integer in a 128-dimensional vector
    x = layers.Embedding(max_features, 128)(inputs)
    # Add 2 bidirectional LSTMs
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
    x = layers.Bidirectional(layers.LSTM(64))(x)
    # Add a classifier
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)
    return model

model = get_model()
model.summary()



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 128)         2560000   
                                                                 
 bidirectional (Bidirection  (None, None, 128)         98816     
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               98816     
 onal)                                                           
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 2757761 (10.52 MB)
Trainable params: 2757761 

In [2]:

(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(
    num_words=max_features
)

x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_val = keras.preprocessing.sequence.pad_sequences(x_val, maxlen=maxlen)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [3]:
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])
model.fit(x_train, y_train, batch_size=32, epochs=3, validation_data=(x_val, y_val))


Epoch 1/3


Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x1d42f6b71f0>