In [19]:
import nltk
import os
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Setup NLTK data path
# The directory is derived from the current user's home instead of one
# machine's absolute path, so the notebook runs unchanged for any user.
nltk_data_dir = os.path.join(os.path.expanduser('~'), 'nltk_data')

# Make sure the directory exists so the os.listdir call below has something
# to list even if every download fails (e.g. when offline).
os.makedirs(nltk_data_dir, exist_ok=True)

# Register the directory only once: re-running this cell must not keep
# appending duplicate entries to NLTK's search path.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Downloads (same resources, same order as before)
for package in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(package, download_dir=nltk_data_dir)

print("NLTK Data Files:", os.listdir(nltk_data_dir))

NLTK Data Files: ['tokenizers', 'taggers', 'corpora']


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shivrajchaudar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shivrajchaudar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shivrajchaudar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shivrajchaudar/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [20]:
# Two-sentence sample passage used by the tokenization, stop-word, stemming
# and tagging cells. The sentences are separated by a single newline.
text = (
    "Albert Einstein was a theoretical physicist who developed the theory of relativity.\n"
    "He was born in Germany in 1879 and is considered one of the most influential physicists."
)

In [21]:
# Split the passage two ways up front: into sentences, and into word-level
# tokens (punctuation marks come out as their own tokens).
sentences, words = sent_tokenize(text), word_tokenize(text)

# Sentence Tokenization
print("Sentence Tokenization:", sentences)

# Word Tokenization
print("Word Tokenization:", words)

Sentence Tokenization: ['Albert Einstein was a theoretical physicist who developed the theory of relativity.', 'He was born in Germany in 1879 and is considered one of the most influential physicists.']
Word Tokenization: ['Albert', 'Einstein', 'was', 'a', 'theoretical', 'physicist', 'who', 'developed', 'the', 'theory', 'of', 'relativity', '.', 'He', 'was', 'born', 'in', 'Germany', 'in', '1879', 'and', 'is', 'considered', 'one', 'of', 'the', 'most', 'influential', 'physicists', '.']


In [22]:
# Removing stopwords
# Build the English stop-word lookup as a set for fast membership checks.
stop_words = set()
stop_words.update(stopwords.words('english'))

# Keep a token only when its lower-cased form is not a stop word; the token
# itself is kept in its original casing. Punctuation tokens are not stop
# words, so they pass through unchanged.
filtered_words = list(
    filter(lambda token: token.lower() not in stop_words, words)
)
print("Filtered Words:", filtered_words)

Filtered Words: ['Albert', 'Einstein', 'theoretical', 'physicist', 'developed', 'theory', 'relativity', '.', 'born', 'Germany', '1879', 'considered', 'one', 'influential', 'physicists', '.']


In [23]:
# Stemming
# Apply the Porter stemmer to every filtered token.
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, filtered_words))
print("Stemmed Words:", stemmed)

# Lemmatization
# NOTE(review): lemmatize() is called without a part-of-speech argument, so
# each token is handled under the lemmatizer's default POS. In the recorded
# output 'physicists' becomes 'physicist' while verb forms such as
# 'developed' and 'considered' are left as-is.
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, filtered_words))
print("Lemmatized Words:", lemmatized)

Stemmed Words: ['albert', 'einstein', 'theoret', 'physicist', 'develop', 'theori', 'rel', '.', 'born', 'germani', '1879', 'consid', 'one', 'influenti', 'physicist', '.']
Lemmatized Words: ['Albert', 'Einstein', 'theoretical', 'physicist', 'developed', 'theory', 'relativity', '.', 'born', 'Germany', '1879', 'considered', 'one', 'influential', 'physicist', '.']


In [24]:
# POS Tagging
# Produces a list of (token, tag) pairs for the stop-word-filtered tokens.
# NOTE(review): the tagger only sees the filtered tokens, not the original
# sentences, so it has less context to work with -- e.g. 'theory' is tagged
# 'JJ' in the recorded output. Consider tagging `words` before filtering.
pos_tags = nltk.pos_tag(tokens=filtered_words)
print("POS Tagging:", pos_tags)

POS Tagging: [('Albert', 'NNP'), ('Einstein', 'NNP'), ('theoretical', 'JJ'), ('physicist', 'NN'), ('developed', 'VBD'), ('theory', 'JJ'), ('relativity', 'NN'), ('.', '.'), ('born', 'VBN'), ('Germany', 'NNP'), ('1879', 'CD'), ('considered', 'VBD'), ('one', 'CD'), ('influential', 'JJ'), ('physicists', 'NNS'), ('.', '.')]


In [25]:
# Sample dataset
docs = ["Einstein developed the theory of relativity.",
        "Physics is the study of matter and energy.",
        "Relativity changed how we understand space and time."]

# Labels for the sample data (e.g., 0 = biography, 1 = science)
labels = [0, 1, 1]

# Splitting manually (small sample): the first n_train documents are used for
# training and the remainder is held out for evaluation.
n_train = 2

# TF-IDF Vectorization
# Fit the vocabulary and IDF weights on the training documents ONLY, so that
# nothing from the held-out document leaks into the features the model is
# trained on. All documents are then projected into that training vocabulary;
# words seen only in the held-out document are ignored by transform().
vectorizer = TfidfVectorizer()
vectorizer.fit(docs[:n_train])
X = vectorizer.transform(docs)

X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = labels[:n_train], labels[n_train:]

# Model Training
model = MultinomialNB()
model.fit(X_train, y_train)

# Prediction
# With two training documents and a single held-out document the accuracy
# below can only be 0.0 or 1.0; treat it as a pipeline smoke test, not as a
# meaningful quality estimate.
y_pred = model.predict(X_test)
print("Predicted Labels:", y_pred)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Predicted Labels: [0]
Accuracy: 0.0
