<h3>NLP: Text Classification with TF-IDF and Naive Bayes</h3>

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FunctionTransformer

# Build the stemmer and tokenizer up front; neither depends on the download below.
# TreebankWordTokenizer is chosen because it does not need the "punkt" resource.
stemmer = PorterStemmer()
tokenizer = TreebankWordTokenizer()

# Fetch only the stopwords corpus, then keep the English list as a set
# so that membership checks during preprocessing are cheap.
nltk.download("stopwords")
stop_words = {*stopwords.words("english")}

def preprocess(texts):
    """Turn each document into a space-joined string of stemmed tokens.

    Every text is lowercased and split with the module-level ``tokenizer``.
    Tokens that are not purely alphabetic, or that appear in the module-level
    ``stop_words`` collection, are discarded; the remaining tokens are passed
    through ``stemmer.stem`` and joined with single spaces.

    Args:
        texts: Iterable of strings (each item must support ``.lower()``).

    Returns:
        list[str]: One cleaned string per input text, in input order.
    """
    def _normalize(document):
        # Lowercase before tokenizing so the stop-word lookup is case-insensitive.
        words = tokenizer.tokenize(document.lower())
        kept = (w for w in words if w.isalpha() and w not in stop_words)
        return " ".join(stemmer.stem(w) for w in kept)

    return [_normalize(document) for document in texts]

# Text-classification pipeline, applied in order:
#   clean -> custom preprocessing, tfidf -> TF-IDF features, model -> Naive Bayes
nlp_pipeline = Pipeline(
    steps=[
        ("clean", FunctionTransformer(preprocess)),
        ("tfidf", TfidfVectorizer()),
        ("model", MultinomialNB()),
    ]
)

# Toy training set written as (text, label) pairs.
# NOTE(review): labels appear to encode sentiment (1 = positive, 0 = negative) - confirm.
training_examples = [
    ("I love this product", 1),
    ("This is the worst item ever", 0),
    ("Really happy with the purchase", 1),
    ("I hate this so much", 0),
]
texts = [text for text, label in training_examples]
labels = [label for text, label in training_examples]

# Fit the whole pipeline on the toy data
nlp_pipeline.fit(texts, labels)

# Classify a single unseen sentence and show the predicted label
test_text = ["I really love this phone"]
predicted = nlp_pipeline.predict(test_text)
print("Prediction:", predicted[0])


Prediction: 1


[nltk_data] Downloading package stopwords to C:\Users\SAKSHI
[nltk_data]     BADOLA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
