In [None]:
Q1. Write a unique paragraph (5–6 sentences) about your favorite topic.
Let's say your favorite topic is technology.
Paragraph:
Technology has transformed the world into a global village. With the rise of artificial intelligence and machine learning, machines are becoming more intelligent every day. Smartphones have become essential tools for both communication and productivity. Cloud computing allows people to access their data from anywhere in the world. Innovations like virtual reality and augmented reality are redefining entertainment and education.

Q1 Solution:
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import Counter

# Ensure required NLTK data is downloaded.
# Recent NLTK releases (3.9+) load the tokenizer models from the 'punkt_tab'
# resource rather than the older pickled 'punkt' package, so both are fetched;
# without 'punkt_tab', word_tokenize/sent_tokenize raise LookupError there.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

paragraph = """Technology has transformed the world into a global village. With the rise of artificial intelligence and machine learning, machines are becoming more intelligent every day. Smartphones have become essential tools for both communication and productivity. Cloud computing allows people to access their data from anywhere in the world. Innovations like virtual reality and augmented reality are redefining entertainment and education."""

# 1. Lowercase and remove punctuation
# The pattern drops every character that is neither a word character nor
# whitespace.
lowered = paragraph.lower()
cleaned = re.sub(r'[^\w\s]', '', lowered)

# 2. Tokenization
# Words are taken from the cleaned text; sentences are taken from the original
# paragraph because the cleaned text no longer contains sentence punctuation.
word_tokens = word_tokenize(cleaned)
sent_tokens = sent_tokenize(paragraph)

# 3. Split vs word_tokenize
# Show the first ten tokens produced by each approach for comparison.
split_tokens = cleaned.split()
print("Split tokens:", split_tokens[:10])
print("Word_tokenize tokens:", word_tokens[:10])

# 4. Remove stopwords
# A set gives constant-time membership tests inside the comprehension.
stop_words = set(stopwords.words('english'))
filtered_words = [w for w in word_tokens if w not in stop_words]

# 5. Word frequency (excluding stopwords)
freq_dist = Counter(filtered_words)
print("Word Frequency Distribution (no stopwords):", freq_dist)


Q2. Using the same paragraph from Q1, extract the alphabetic words, remove stop words, and compare stemming with lemmatization:
Q2 Solution:
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Fetch the WordNet resources used by the lemmatizer below.
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource)

# 1. Extract alphabetic words only
alpha_pattern = re.compile(r'\b[a-zA-Z]+\b')
alphabetic_words = alpha_pattern.findall(cleaned)

# 2. Remove stop words
filtered_alpha = list(
    filter(lambda token: token not in stop_words, alphabetic_words)
)

# 3. Stemming
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, filtered_alpha))

# 4. Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, filtered_alpha))

# 5. Comparison (first ten results of each technique)
print("Stemmed Words:", stemmed[:10])
print("Lemmatized Words:", lemmatized[:10])

# Explanation:
print(
    "\nExplanation: Stemming is faster but less accurate "
    "(e.g., 'machines' becomes 'machin'), while Lemmatization gives "
    "meaningful roots (e.g., 'machines' → 'machine'). Prefer lemmatization "
    "for tasks needing correct grammar or human readability."
)

Q3. Choose three short texts and analyze them with Bag of Words (BoW) and TF-IDF.
Q3 Solution:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

texts = [
    "The phone has excellent battery life and a sleek design.",
    "The camera quality is poor, and the phone lags frequently.",
    "Great value for money and smooth user interface."
]

# 1. Bag of Words: raw term counts per document
count_vect = CountVectorizer()
bow_matrix = count_vect.fit_transform(texts)
print("Bag of Words:\n", bow_matrix.toarray())

# 2. TF-IDF: densify once and reuse the array for printing and ranking
tfidf_vect = TfidfVectorizer()
tfidf_matrix = tfidf_vect.fit_transform(texts)
tfidf_dense = tfidf_matrix.toarray()
print("TF-IDF Matrix:\n", tfidf_dense)

# 3. Top 3 keywords
import numpy as np
feature_names = tfidf_vect.get_feature_names_out()
for doc_number, weights in enumerate(tfidf_dense, start=1):
    print(f"\nTop keywords for Text {doc_number}:")
    # Reverse the ascending argsort, then keep the three largest weights.
    ranked = np.argsort(weights)[::-1][:3]
    for term_idx in ranked:
        print(f"{feature_names[term_idx]}: {weights[term_idx]:.3f}")


Q4. Compare two technologies
Q4 Solution:
text1 = "Artificial Intelligence allows machines to mimic human intelligence and perform tasks autonomously."
text2 = "Blockchain is a decentralized ledger technology that provides secure and transparent transactions."

# Preprocessing
def preprocess(text):
    """Lowercase *text*, strip punctuation, and return its word tokens."""
    normalized = text.lower()
    without_punct = re.sub(r'[^\w\s]', '', normalized)
    return word_tokenize(without_punct)

tokens1, tokens2 = preprocess(text1), preprocess(text2)

# a. Jaccard Similarity: shared unique tokens over all unique tokens
set1 = set(tokens1)
set2 = set(tokens2)
shared_count = len(set1.intersection(set2))
combined_count = len(set1.union(set2))
jaccard_sim = shared_count / combined_count
print("Jaccard Similarity:", jaccard_sim)

# b. Cosine Similarity between the TF-IDF vectors of the two raw texts
from sklearn.metrics.pairwise import cosine_similarity

tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit_transform([text1, text2])
first_vec, second_vec = tfidf_matrix[0:1], tfidf_matrix[1:2]
cos_sim = cosine_similarity(first_vec, second_vec)[0, 0]
print("Cosine Similarity:", cos_sim)

# c. Analysis
print(
    "Analysis: Cosine Similarity is better for long texts with different "
    "vocabularies but similar context, while Jaccard is good for short texts "
    "or simple word comparisons."
)

Q5. Write and analyze a product review
Q5 Solution:
from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt

review = "I loved the customer support and the user-friendly interface. The product was amazing!"

# 1. Sentiment Analysis
blob = TextBlob(review)
sentiment_scores = blob.sentiment
polarity = sentiment_scores.polarity
subjectivity = sentiment_scores.subjectivity
print("Polarity:", polarity, "Subjectivity:", subjectivity)

# 2. Classification
# Start from the neutral label and override it only when the polarity falls
# outside the [-0.1, 0.1] band.
sentiment = "Neutral"
if polarity > 0.1:
    sentiment = "Positive"
elif polarity < -0.1:
    sentiment = "Negative"
print("Sentiment:", sentiment)

# 3. Word Cloud for positive reviews
if sentiment == "Positive":
    cloud_builder = WordCloud(width=600, height=400, background_color='white')
    wordcloud = cloud_builder.generate(review)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud for Positive Review')
    plt.show()

Q6. Generate text using an LSTM or Dense model.
Q6 Solution:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
import numpy as np

# Sample paragraph
train_text = "Deep learning is a subset of machine learning that uses neural networks to learn from data and make predictions."

# 1. Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts([train_text])
# One extra slot beyond the number of distinct words in the vocabulary.
total_words = len(tokenizer.word_index) + 1

# 2. Create input sequences: every prefix of the token list with >= 2 tokens
tokens = tokenizer.texts_to_sequences([train_text])[0]
input_sequences = [tokens[:end] for end in range(2, len(tokens) + 1)]

# Pad sequences on the left so they all share the longest length
max_len = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='pre')

# The last column is the word to predict; everything before it is the context.
X, y = input_sequences[:, :-1], np.array(input_sequences[:, -1])

# 3. Simple model
model = Sequential([
    Embedding(total_words, 10, input_length=max_len-1),
    LSTM(50),
    Dense(total_words, activation='softmax'),
])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
model.fit(X, y, epochs=500, verbose=0)

# Text generation
# Repeatedly feed the growing seed text to the trained model and append the
# word whose index has the highest predicted probability.
seed = "deep"
next_words = 3
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed])[0]
    token_list = pad_sequences([token_list], maxlen=max_len-1, padding='pre')
    probabilities = model.predict(token_list, verbose=0)
    # A single sample is predicted, so take the first (only) argmax and turn
    # it into a plain int instead of comparing a 1-element array to ints.
    predicted = int(np.argmax(probabilities, axis=-1)[0])
    # index_word is the tokenizer's index -> word mapping; it replaces a
    # linear scan over word_index.
    output_word = tokenizer.index_word.get(predicted, "")
    if not output_word:
        # No word has this index (e.g. the padding slot). The seed would stay
        # unchanged and yield the same prediction again, so stop here rather
        # than appending blanks.
        break
    seed += " " + output_word

print("Generated Text:", seed)
