The NLTK movie_reviews dataset is a well-known text corpus provided by the Natural Language Toolkit (NLTK) that is often used for sentiment analysis and other natural language processing tasks. Here’s an overview:


Collection Size:
Contains 2,000 movie reviews in total.

Labeling:
Reviews are divided equally into two categories, "pos" (positive) and "neg" (negative), with 1,000 reviews in each, which makes the corpus well suited to binary sentiment classification tasks.

Data Source:
The corpus is the polarity dataset (v2.0) compiled by Bo Pang and Lillian Lee. The reviews were originally drawn from the IMDb archive of the rec.arts.movies.reviews newsgroup and have been pre-processed and organized for research purposes.

Format:
Each review is stored as a plain-text file. In the NLTK corpus, reviews are accessed via file IDs (for example, movie_reviews.raw(fileid) returns a review's raw text as a string), and each file belongs to exactly one category (either "pos" or "neg").

In [None]:
import random
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import movie_reviews

# Fetch the movie_reviews corpus unless it is already installed.
nltk.download("movie_reviews", quiet=True)

# File IDs of the reviews, split by sentiment category.
pos_ids = movie_reviews.fileids("pos")
neg_ids = movie_reviews.fileids("neg")

# Number of reviews drawn from each class (adjust as needed).
sample_size = 100

# Draw the positive sample first, then the negative one, without replacement.
sample_pos = random.sample(pos_ids, sample_size)
sample_neg = random.sample(neg_ids, sample_size)

# Raw review text, positives followed by negatives, plus a parallel label list.
documents = [movie_reviews.raw(fid) for fid in sample_pos + sample_neg]
labels = ["pos"] * len(sample_pos) + ["neg"] * len(sample_neg)

# TF-IDF features with English stop words removed and at most 1000 features.
vectorizer = TfidfVectorizer(max_features=1000, stop_words="english")
tfidf_matrix = vectorizer.fit_transform(documents)

# Densify the sparse TF-IDF matrix before handing it to PCA.
tfidf_dense = tfidf_matrix.toarray()

# Project the features onto two principal components for a 2-D plot.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(tfidf_dense)

# Colour used for each sentiment class: red for positive, blue for negative.
color_map = {"pos": "red", "neg": "blue"}

# Plot the 2-D PCA projection. Each class is drawn as its own scatter series
# so that it gets a legend entry; previously the colour meaning was only
# discoverable from the sparse point annotations below.
plt.figure(figsize=(10, 8))
for sentiment, color in color_map.items():
    # Row indices of the reviews carrying this label.
    rows = [i for i, lab in enumerate(labels) if lab == sentiment]
    plt.scatter(
        pca_result[rows, 0],
        pca_result[rows, 1],
        color=color,
        alpha=0.6,
        label=sentiment,
    )

# Optionally annotate a few points with their corresponding label.
for i, (x, y) in enumerate(pca_result):
    if i % 15 == 0:  # annotate every 15th point to avoid clutter
        plt.annotate(labels[i], (x, y), textcoords="offset points", xytext=(5, 5))

plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.title("PCA of NLTK Movie Reviews (Sampled)")
plt.legend(title="Label")
plt.grid(True)
plt.show()


In [None]:
import random
import nltk
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import movie_reviews

# Fetch the movie_reviews corpus unless it is already installed.
nltk.download("movie_reviews", quiet=True)

# File IDs of the reviews, split by sentiment category.
pos_ids = movie_reviews.fileids("pos")
neg_ids = movie_reviews.fileids("neg")

# Number of reviews drawn from each class.
sample_size = 50

# Draw the positive sample first, then the negative one, without replacement.
sample_pos = random.sample(pos_ids, sample_size)
sample_neg = random.sample(neg_ids, sample_size)

# Parallel lists: full review text, sentiment label, and a short hover snippet.
documents = []
labels = []
hover_text = []

# Walk the positive sample, then the negative one, filling all three lists.
for sentiment, sampled_ids in (("pos", sample_pos), ("neg", sample_neg)):
    for fileid in sampled_ids:
        review_text = movie_reviews.raw(fileid)
        documents.append(review_text)
        labels.append(sentiment)
        # Hover snippet: first 150 characters on one line, then an ellipsis.
        snippet = review_text[:150].replace("\n", " ")
        hover_text.append(snippet + "...")

# TF-IDF features with English stop words removed and at most 1000 features.
vectorizer = TfidfVectorizer(max_features=1000, stop_words="english")
tfidf_matrix = vectorizer.fit_transform(documents)
tfidf_dense = tfidf_matrix.toarray()

# Project the dense features onto two principal components for visualization.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(tfidf_dense)

# Tabulate the two PCA coordinates, then attach each review's label and snippet.
results_df = pd.DataFrame(
    pca_result,
    columns=["PCA Component 1", "PCA Component 2"],
)
results_df["Label"] = labels
results_df["Review Snippet"] = hover_text

# Interactive scatter plot: points coloured by label, snippet shown on hover.
fig = px.scatter(
    results_df,
    x="PCA Component 1",
    y="PCA Component 2",
    color="Label",
    hover_data=["Review Snippet"],
    title="Interactive PCA of NLTK Movie Reviews",
)

# Marker styling and axis/hover layout.
fig.update_traces(marker={"size": 10, "opacity": 0.7})
fig.update_layout(
    xaxis_title="PCA Component 1",
    yaxis_title="PCA Component 2",
    hovermode="closest",
)

# Save a standalone HTML copy of the figure, then display it.
fig.write_html("pca_movie_reviews.html")
fig.show()
