In [1]:

import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import string

nltk.download('punkt')
nltk.download('stopwords')

text = """Technology is evolving rapidly, shaping how we live and work. From smartphones to AI-powered assistants, the digital world continues to transform our daily lives. Cloud computing enables real-time data access from anywhere. Innovations in renewable energy promise a sustainable future. The pace of change is both exciting and challenging."""

clean_text = re.sub(rf"[{re.escape(string.punctuation)}]", "", text.lower())

sentences = sent_tokenize(clean_text)
words_tokenize = word_tokenize(clean_text)
split_words = clean_text.split()

print("Split:", split_words)
print("Word Tokenize:", words_tokenize)

stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words_tokenize if word not in stop_words]

freq_dist = Counter(filtered_words)
print("Word Frequencies:", freq_dist)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\prana/nltk_data'
    - 'D:\\anacoonda\\nltk_data'
    - 'D:\\anacoonda\\share\\nltk_data'
    - 'D:\\anacoonda\\lib\\nltk_data'
    - 'C:\\Users\\prana\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [3]:

from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')

alpha_words = re.findall(r'\b[a-zA-Z]+\b', clean_text)
filtered = [word for word in alpha_words if word not in stop_words]

stemmer = PorterStemmer()
stems = [stemmer.stem(word) for word in filtered]

lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(word) for word in filtered]

print("Stemmed:", stems)
print("Lemmatized:", lemmas)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prana\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


NameError: name 'stop_words' is not defined

In [5]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

texts = [
    "iPhone 15 has an amazing camera but is very expensive.",
    "The battery life of this laptop is outstanding.",
    "This coffee tastes great and keeps me energized."
]

count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(texts)

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(texts)

feature_names = np.array(tfidf.get_feature_names_out())
for i, row in enumerate(tfidf_matrix.toarray()):
    top_indices = row.argsort()[-3:][::-1]
    print(f"Top keywords in text {i+1}:", feature_names[top_indices])


Top keywords in text 1: ['very' 'iphone' 'amazing']
Top keywords in text 2: ['the' 'outstanding' 'of']
Top keywords in text 3: ['energized' 'tastes' 'and']


In [7]:

from sklearn.metrics.pairwise import cosine_similarity

text1 = "Artificial Intelligence allows machines to mimic human behavior."
text2 = "Blockchain ensures secure and decentralized transactions."

def preprocess(text):
    return set(re.findall(r'\b[a-zA-Z]+\b', text.lower()))

set1 = preprocess(text1)
set2 = preprocess(text2)

jaccard = len(set1 & set2) / len(set1 | set2)

vec = TfidfVectorizer()
tfidf_vecs = vec.fit_transform([text1, text2])
cosine = cosine_similarity(tfidf_vecs[0:1], tfidf_vecs[1:2])[0][0]

print("Jaccard:", jaccard)
print("Cosine:", cosine)


Jaccard: 0.0
Cosine: 0.0


In [9]:

from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt

review = "This product is absolutely wonderful. The design is sleek, and the performance is outstanding."

blob = TextBlob(review)
polarity = blob.polarity
subjectivity = blob.subjectivity

if polarity > 0.1:
    sentiment = "Positive"
elif polarity < -0.1:
    sentiment = "Negative"
else:
    sentiment = "Neutral"

print("Polarity:", polarity, "| Sentiment:", sentiment)

if sentiment == "Positive":
    wordcloud = WordCloud().generate(review)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()


ModuleNotFoundError: No module named 'textblob'

In [11]:

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
import numpy as np

paragraph = """Machine learning enables computers to learn from data and make decisions. It powers applications like image recognition, speech processing, and predictive analytics. This technology continues to evolve rapidly."""

tokenizer = Tokenizer()
tokenizer.fit_on_texts([paragraph])
word_index = tokenizer.word_index
total_words = len(word_index) + 1

input_sequences = []
for line in paragraph.split('.'):
    tokens = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(tokens)):
        input_sequences.append(tokens[:i+1])

max_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_len)

X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = np.eye(total_words)[y]

model = Sequential()
model.add(Embedding(total_words, 10, input_length=max_len-1))
model.add(LSTM(50))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, epochs=300, verbose=0)

seed_text = "machine"
for _ in range(3):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_len-1)
    predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)[0]
    output_word = [word for word, index in tokenizer.word_index.items() if index == predicted]
    seed_text += " " + output_word[0] if output_word else ""

print("Generated text:", seed_text)


ModuleNotFoundError: No module named 'keras'