## This code demonstrates a simple NLP application: data preprocessing, tokenization, stemming or lemmatization, and text classification with TF-IDF features (a Bag of Words `CountVectorizer` can be substituted). The example uses the scikit-learn, nltk, and pandas libraries.

The application is based on sentiment analysis, where we classify movie reviews as positive or negative.

In [1]:
pip install nltk scikit-learn pandas




In [2]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this script relies on: tokenizer data, the
# stopword list and WordNet. Both 'punkt' and 'punkt_tab' are requested;
# presumably so word_tokenize works across NLTK versions - confirm.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(_nltk_resource, quiet=True)

# Toy sentiment dataset kept as (review, label) pairs so each text sits next
# to its label; 1 = Positive, 0 = Negative.
_labelled_reviews = [
    ("I love this movie", 1),
    ("This was a terrible movie", 0),
    ("Best movie ever!", 1),
    ("Worst movie I have ever seen", 0),
    ("I enjoyed this movie", 1),
    ("It was a horrible movie", 0),
    ("Amazing movie", 1),
    ("I hated this movie", 0),
    ("This movie was fantastic", 1),
    ("Terrible movie", 0),
]
data = {
    'text': [review for review, _ in _labelled_reviews],
    'label': [sentiment for _, sentiment in _labelled_reviews],
}

# Tabular view of the dataset with 'text' and 'label' columns
df = pd.DataFrame(data)

# Shared preprocessing helpers, built once and reused by preprocess_text:
# the English stopword set (a set for fast membership tests), a Porter
# stemmer and a WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

def preprocess_text(text, use_stemming=True, use_lemmatization=False):
    """Normalize a raw review into a space-separated string of tokens.

    The text is lowercased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic (punctuation, numbers, mixed tokens) or that
    appear in the module-level ``stop_words`` set are dropped. The remaining
    tokens are then optionally reduced to a base form.

    Args:
        text: The raw input string.
        use_stemming: Apply the Porter stemmer to each token. Defaults to
            True. Ignored when ``use_lemmatization`` is True.
        use_lemmatization: Apply the WordNet lemmatizer to each token.
            Defaults to False. Takes precedence over ``use_stemming``.

    Returns:
        The processed tokens joined by single spaces (an empty string when
        every token is filtered out).
    """
    tokens = word_tokenize(text.lower())

    # Keep alphabetic, non-stopword tokens only
    tokens = [token for token in tokens
              if token.isalpha() and token not in stop_words]

    # Lemmatization is checked first: because use_stemming defaults to True,
    # testing it first would make use_lemmatization=True a silent no-op
    # unless the caller also remembered to pass use_stemming=False.
    if use_lemmatization:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    elif use_stemming:
        tokens = [stemmer.stem(token) for token in tokens]

    return ' '.join(tokens)

# Clean every review with the stemming variant of the preprocessor
# (extra keyword arguments to Series.apply are forwarded to the function).
df['processed_text'] = df['text'].apply(preprocess_text, use_stemming=True)

# Convert the cleaned reviews into TF-IDF features (CountVectorizer is a
# drop-in alternative) and pick out the labels.
# NOTE(review): the vectorizer is fitted on all rows before the split below,
# so the held-out rows contribute to the vocabulary and IDF weights.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['processed_text'])
y = df['label']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the split shapes so an overly small training set is easy to spot
print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Train a multinomial Naive Bayes classifier (fit returns the estimator)
model = MultinomialNB().fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Classify an unseen review: it must go through the same preprocessing and
# the already-fitted vectorizer (transform, not fit_transform).
new_text = "I absolutely loved this film!"
processed_new_text = preprocess_text(new_text, use_stemming=True)
X_new = vectorizer.transform([processed_new_text])
prediction = model.predict(X_new)
print(f"Sentiment of the new review: {'Positive' if prediction[0] == 1 else 'Negative'}")

Training set size: (7, 12)
Test set size: (3, 12)
Accuracy: 66.67%
Sentiment of the new review: Positive
