In [None]:
# 1. What is the primary goal of Natural Language Processing (NLP)?
# To enable computers to understand, interpret, and generate human language in a way that is both meaningful and useful.

# 2. What does "tokenization" refer to in text processing?
# Tokenization is the process of breaking down text into smaller units called tokens, such as words or sentences, for easier analysis.

# 3. What is the difference between lemmatization and stemming?

# Stemming: Cuts off word endings to reduce words to their root form, which may not be a valid word (e.g., "running" → "run").

# Lemmatization: Reduces words to their base or dictionary form (lemma) using vocabulary and morphological analysis (e.g., "running" → "run").

# 4. What is the role of regular expressions (regex) in text processing?
# Regex is used to search, match, and manipulate specific patterns of text efficiently, such as extracting email addresses or cleaning unwanted characters.

# 5. What is Word2Vec and how does it represent words in a vector space?
# Word2Vec is a neural network-based method that represents words as dense vectors in a continuous vector space, capturing semantic relationships between words.

# 6. How does frequency distribution help in text analysis?
# It helps identify how often each word appears, allowing analysis of common terms, trends, and important keywords in text data.

# 7. Why is text normalization important in NLP?
# Normalization ensures consistency by converting text to a standard format (e.g., lowercasing, removing punctuation), improving model performance.

# 8. What is the difference between sentence tokenization and word tokenization?

# Sentence tokenization splits text into sentences.

# Word tokenization splits text into words or tokens.

# 9. What are co-occurrence vectors in NLP?
# Vectors that represent how frequently words appear together in a context window, used to capture word relationships.

# 10. What is the significance of lemmatization in improving NLP tasks?
# It improves accuracy by grouping different forms of a word into one, reducing sparsity in the data.

# 11. What is the primary use of word embeddings in NLP?
# To convert words into numerical vectors capturing semantic meaning for machine learning models.

# 12. What is an annotator in NLP?
# A tool or module that adds metadata like part-of-speech tags or named entities to raw text for further analysis.

# 13. What are the key steps in text processing before applying machine learning models?
# Tokenization, stopword removal, normalization, stemming/lemmatization, vectorization/embedding.

# 14. What is the history of NLP and how has it evolved?
# From rule-based systems to statistical models to deep learning and transformer architectures for better language understanding.

# 15. Why is sentence processing important in NLP?
# Sentences provide context and structure crucial for tasks like parsing, sentiment analysis, and machine translation.

# 16. How do word embeddings improve the understanding of language semantics in NLP?
# They capture relationships between words based on context, allowing models to understand similarity and analogy.

# 17. How does the frequency distribution of words help in text classification?
# By highlighting key discriminative words that distinguish between classes.

# 18. What are the advantages of using regex in text cleaning?
# Efficient pattern matching and extraction of relevant parts of text with minimal code.

# 19. What is the difference between Word2Vec and Doc2Vec?

# Word2Vec generates embeddings for words.

# Doc2Vec generates embeddings for entire documents or sentences.

# 20. Why is understanding text normalization important in NLP?
# It reduces variability in text, making patterns easier to learn by models.

# 21. How does word count help in text analysis?
# It provides a simple quantitative measure of term importance and document length.

# 22. How does lemmatization help in NLP tasks like search engines and chatbots?
# It improves retrieval and understanding by matching words to their base form.

# 23. What is the purpose of using Doc2Vec in text processing?
# To generate fixed-length vector representations for variable-length documents for classification or clustering.

# 24. What is the importance of sentence processing in NLP?
# It helps in understanding context, relationships, and intent at a higher level than words.

# 25. What is text normalization, and what are the common techniques used in it?
# Text normalization standardizes text; common techniques include lowercasing, removing punctuation, and expanding contractions.

# 26. Why is word tokenization important in NLP?
# It breaks text into meaningful units (words) that are easier to analyze.

# 27. How does sentence tokenization differ from word tokenization in NLP?
# Sentence tokenization separates text into sentences; word tokenization splits sentences into words.

# 28. What is the primary purpose of text processing in NLP?
# To convert raw text into a structured format suitable for modeling.

# 29. What are the key challenges in NLP?
# Ambiguity, context understanding, language variability, sarcasm, and idiomatic expressions.

# 30. How do co-occurrence vectors represent relationships between words?
# By capturing which words appear near each other in text, indicating semantic or syntactic connections.

# 31. What is the role of frequency distribution in text analysis?
# Identifies important and frequent words for understanding and feature selection.

# 32. What is the impact of word embeddings on NLP tasks?
# They improve model accuracy by providing dense, semantically rich input features.

# 33. What is the purpose of using lemmatization in text preprocessing?
# To reduce word forms to a common base, reducing noise and improving learning.

In [None]:
# practical question /

In [1]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')
text = "NLP helps computers understand human language."
tokens = word_tokenize(text)
print(tokens)


['NLP', 'helps', 'computers', 'understand', 'human', 'language', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from nltk.tokenize import sent_tokenize

text = "NLP is amazing. It powers chatbots and voice assistants."
sentences = sent_tokenize(text)
print(sentences)


['NLP is amazing.', 'It powers chatbots and voice assistants.']


In [3]:
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
filtered = [word for word in tokens if word.lower() not in stop_words]
print(filtered)


['NLP', 'helps', 'computers', 'understand', 'human', 'language', '.']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
words = ["running", "flies", "easily", "fairly"]
stemmed = [stemmer.stem(word) for word in words]
print(stemmed)


['run', 'fli', 'easili', 'fairli']


In [5]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
words = ["running", "better", "flies"]
lemmatized = [lemmatizer.lemmatize(word) for word in words]
print(lemmatized)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...


['running', 'better', 'fly']


In [6]:
import string

text = "NLP, is FUN!!"
normalized = text.lower().translate(str.maketrans('', '', string.punctuation))
print(normalized)


nlp is fun


In [7]:
import string

text = "NLP, is FUN!!"
normalized = text.lower().translate(str.maketrans('', '', string.punctuation))
print(normalized)


nlp is fun


In [8]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = ["I love NLP", "Doc2Vec converts text to vector"]
tagged = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(documents)]

model = Doc2Vec(tagged, vector_size=50, window=2, min_count=1, epochs=100)
print(model.dv[0])  # vector for the first document


ModuleNotFoundError: No module named 'gensim'

In [9]:
nltk.download('averaged_perceptron_tagger')

text = "NLP is fun and educational"
tokens = word_tokenize(text)
pos_tags = nltk.pos_tag(tokens)
print(pos_tags)


[('NLP', 'NNP'), ('is', 'VBZ'), ('fun', 'NN'), ('and', 'CC'), ('educational', 'JJ')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [10]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Barack Obama was born in Hawaii and was the president of the USA.")

for ent in doc.ents:
    print(ent.text, ent.label_)


ModuleNotFoundError: No module named 'spacy'