# Step-by-Step Exploratory Data Analysis (EDA)
## Step 1: Load the dataset

This gives an overview of the dataset's structure: number of entries, data types, etc.

In [None]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

# Fetch the 'all' subset of 20 Newsgroups, asking the loader to strip
# headers, footers and quotes from each post
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

# Assemble the frame: raw text, numeric label, then the label's readable name
label_names = newsgroups.target_names
df = pd.DataFrame({'text': newsgroups.data, 'target': newsgroups.target})
df['target_names'] = [label_names[label] for label in newsgroups.target]

# Print the frame's structural summary
df.info()

In [None]:
# Preview the first rows of the frame (head() is called with its default row count)
print(df.head())

In [None]:
from tabulate import tabulate

# Render the last rows as a 'psql'-format text table, using the column names as headers
last_rows = df.tail()
print(tabulate(last_rows, headers='keys', tablefmt='psql'))

## Step 2: Check for missing or null values

You might find some missing text data. If there are missing values, decide whether to fill or drop them depending on the extent of the missing data.

In [None]:
# Count the null entries in each column
df.isna().sum()

## Step 3: Summary of text lengths

This provides insights into the distribution of text lengths. You may use visualizations like histograms to better understand the variation in document lengths.

In [None]:
# Store each document's length (len of the text string) in a new column
df['text_length'] = df['text'].map(len)

# Descriptive statistics of the document lengths
df['text_length'].describe()

## Step 4: Visualize text length distribution

This histogram gives you an idea of the typical document length and if there are any extremely short or long documents.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of document lengths (50 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['text_length'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Document Lengths')
ax.set_xlabel('Text Length')
ax.set_ylabel('Frequency')
plt.show()

## Step 5: Analyze the distribution of topics (categories)
This count plot helps visualize the balance (or imbalance) between the different categories/topics in the dataset.

In [None]:
# Plot the number of documents per topic.
# The column is named through `x=` together with `data=df`: current seaborn
# treats the first positional argument of countplot as `data`, so a bare
# Series passed positionally is not interpreted as the x variable.
plt.figure(figsize=(12,6))
sns.countplot(data=df, x='target_names')
plt.xticks(rotation=90)  # topic names are long; rotate them so they do not overlap
plt.title('Distribution of Topics in 20 Newsgroups')
plt.xlabel('Topic')
plt.ylabel('Number of Documents')
plt.show()

## Step 6: Most frequent words (before preprocessing)

This returns the 20 most common words across all documents. This gives you a sense of which terms are frequently used before any text cleaning is applied.

In [None]:
from collections import Counter
import re

# Word-frequency helper shared by the raw-text and cleaned-text analyses
def get_most_common_words(texts, num_words=20):
    """Return the `num_words` most frequent words across `texts`.

    The texts are joined with spaces and lowercased; every character that is
    neither an ASCII letter nor whitespace is deleted (not replaced), and the
    result is split on whitespace before counting.

    Returns:
        A list of ``(word, count)`` pairs, most frequent first.
    """
    corpus = ' '.join(texts).lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', '', corpus)
    return Counter(letters_only.split()).most_common(num_words)

# Top 20 words over the raw (not yet cleaned) text column
common_words = get_most_common_words(texts=df['text'], num_words=20)
common_words

## Step 7: Word Cloud Visualization

In [None]:
from wordcloud import WordCloud

# Join every document into one string and build the cloud from it
all_text = ' '.join(df['text'])
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(all_text)

# Draw the cloud as an image with the axes hidden
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of the 20 Newsgroups Dataset')
plt.show()

## Preprocess Text

In [None]:
import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Keep the stop words in a set: preprocess_text tests every token of every
# document for membership, and a set lookup avoids scanning a list each time
stop_words = set(stopwords.words('english'))

# Define text preprocessing function
def preprocess_text(text):
    """Return `text` reduced to lowercase letter-only tokens without stop words.

    Args:
        text: Raw document string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Replace every non-letter with a space so words joined by punctuation split apart
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Apply the preprocessing function to the dataset
df['cleaned_text'] = df['text'].apply(preprocess_text)

# Display the first few cleaned texts
df[['text', 'cleaned_text']].head()


In [None]:
# Top 20 words over the cleaned text column
common_words_cleaned = get_most_common_words(texts=df['cleaned_text'], num_words=20)
common_words_cleaned

In [None]:
from wordcloud import WordCloud

# Join the cleaned documents into one string and build the cloud from it
all_cleaned_text = ' '.join(df['cleaned_text'])
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud_cleaned = cloud_builder.generate(all_cleaned_text)

# Draw the cloud as an image with the axes hidden
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud_cleaned, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of Cleaned 20 Newsgroups Dataset')
plt.show()


## Preprocess Text (2.0) - This time we will remove tokens that are 1 or 2 characters long.

In [None]:
# Second version of the cleaner: additionally drops tokens of 1-2 characters
def preprocess_text(text):
    """Return `text` as lowercase letter-only tokens, minus stop words and short tokens.

    Non-letters are replaced by spaces, the text is lowercased and split on
    whitespace; a token is kept only if it is not in `stop_words` and is
    longer than two characters. The kept tokens are joined by single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    kept = []
    for token in letters_only.lower().split():
        if token in stop_words or len(token) <= 2:
            continue
        kept.append(token)
    return ' '.join(kept)

# Overwrite the cleaned column using the updated cleaner
df['cleaned_text'] = df['text'].map(preprocess_text)

# Show raw and cleaned text side by side for the first rows
df.loc[:, ['text', 'cleaned_text']].head()

In [None]:
# Top 20 words over the re-cleaned text column
common_words_cleaned = get_most_common_words(texts=df['cleaned_text'], num_words=20)
common_words_cleaned

In [None]:
# Join the re-cleaned documents into one string and build the cloud from it
all_cleaned_text = ' '.join(df['cleaned_text'])
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud_cleaned = cloud_builder.generate(all_cleaned_text)

# Draw the cloud as an image with the axes hidden
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud_cleaned, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of Cleaned 20 Newsgroups Dataset')
plt.show()


# In-class Exercise #1 (15 mins)

## 1. Define a function that gets the most common words by ``category``
## 2. Define a function that generates a word cloud by ``category``

In [None]:
# Define a function to get most common words by category
def get_common_words_by_category(category_name, num_words=10):
    """Return the most frequent cleaned-text words of one category.

    Args:
        category_name: Value of the `target_names` column to select.
        num_words: How many words to return; defaults to 10, the value that
            was previously fixed inside the function.

    Returns:
        A list of ``(word, count)`` pairs as produced by
        `get_most_common_words`.
    """
    category_text = df[df['target_names'] == category_name]['cleaned_text']
    return get_most_common_words(category_text, num_words=num_words)

# Example: Most common words in 'sci.space' category
get_common_words_by_category('sci.space')

In [None]:
# Per-category variant of the word cloud plots above
def generate_wordcloud_by_category(category_name):
    """Display a word cloud built from the cleaned text of `category_name`.

    Rows whose `target_names` equals `category_name` are selected, their
    `cleaned_text` values are concatenated with spaces, and the resulting
    cloud is shown with a title naming the category. Nothing is returned.
    """
    in_category = df['target_names'] == category_name
    category_text = df.loc[in_category, 'cleaned_text'].str.cat(sep=' ')

    cloud_builder = WordCloud(width=800, height=400, background_color='white')
    wordcloud = cloud_builder.generate(category_text)

    # Draw the cloud as an image with the axes hidden
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'Word Cloud for Category: {category_name}')
    plt.show()

# Example: Generate word cloud for 'sci.space' category
generate_wordcloud_by_category(category_name='sci.space')

In [None]:
# Same plot for the 'comp.graphics' category
generate_wordcloud_by_category(category_name='comp.graphics')


In [None]:

# Same plot for the 'talk.politics.misc' category
generate_wordcloud_by_category(category_name='talk.politics.misc')

## Step 8: Analyze text by category
This gives insights into which categories have longer or shorter documents on average.

In [None]:
# Mean text length per category, sorted from longest to shortest
avg_length_by_category = (
    df.groupby('target_names')['text_length']
    .mean()
    .sort_values(ascending=False)
)

# Bar chart of those means, one bar per category
fig, ax = plt.subplots(figsize=(12, 6))
avg_length_by_category.plot(kind='bar', ax=ax)
ax.set_title('Average Text Length by Category')
ax.set_ylabel('Average Text Length')
plt.show()

## Step 9: Top Words by Category
Let's take a look at the most frequent words in each category after preprocessing.

In [None]:
from collections import Counter

# Word counts restricted to a single category
def get_top_words_by_category(category_name, num_words=10):
    """Return the `num_words` most frequent cleaned-text tokens of a category.

    Rows whose `target_names` equals `category_name` are selected; their
    `cleaned_text` values are joined with spaces, split on whitespace and
    counted. The result is a list of ``(word, count)`` pairs, most frequent
    first.
    """
    in_category = df['target_names'] == category_name
    tokens = ' '.join(df.loc[in_category, 'cleaned_text']).split()
    return Counter(tokens).most_common(num_words)

# Example: Top words in 'sci.space' category
get_top_words_by_category('sci.space', num_words=10)

## Step 10: Document Similarity (Cosine Similarity)

We can analyze how similar documents are to each other by computing cosine similarity between the documents. This gives us insights into whether documents within a category tend to be more similar.

This heatmap shows the similarity between the first 100 documents in the dataset. It can help identify whether some documents are closely related to others.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorize the cleaned text using TF-IDF, fitted on every document
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])

# Only the first 100 documents are plotted, so compare just those rows.
# Passing the whole matrix would build a dense (n_documents x n_documents)
# array merely to slice its top-left corner; each entry depends only on its
# own pair of rows, so the plotted values are unchanged.
n_shown = 100
cosine_sim = cosine_similarity(tfidf_matrix[:n_shown])

# Visualize the cosine similarity matrix for the first 100 documents
plt.figure(figsize=(10,8))
sns.heatmap(cosine_sim, cmap='coolwarm')
plt.title('Cosine Similarity Between Documents (First 100)')
plt.show()


## Step 10.1: Text Similarity between Categories

The above heatmap is not very informative. Let's compare specific categories instead.

To measure the **cosine similarity** between two specific categories, we need to follow these steps:
- Filter the dataset for the two categories of interest.
- Vectorize the documents using TF-IDF.
- Compute cosine similarity between the averaged TF-IDF vectors for the two categories.

### Step 1: Filter the Data by Category
We will filter the dataset to extract the documents for two categories (e.g., ``sci.space`` and ``comp.graphics``).

In [None]:
# Select the cleaned text of the two categories being compared
is_space = df['target_names'] == 'sci.space'
is_graphics = df['target_names'] == 'comp.graphics'
category_1 = df.loc[is_space, 'cleaned_text']
category_2 = df.loc[is_graphics, 'cleaned_text']

### Step 2: Vectorize the Documents Using TF-IDF
We need to vectorize the documents from each category using the same TF-IDF vectorizer to make their features comparable.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One vectorizer for both categories so they share the same feature columns
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Stack category 1 on top of category 2 and vectorize them together
combined_text = pd.concat([category_1, category_2])
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_text)

# Rows come back in concatenation order: category 1 first, then category 2
n_category_1 = len(category_1)
tfidf_category_1 = tfidf_matrix[:n_category_1]
tfidf_category_2 = tfidf_matrix[n_category_1:]


### Step 3: Compute Average TF-IDF Vectors for Each Category
To compare categories, we can calculate the **average TF-IDF vector** for each category and then compute the cosine similarity between them.

In [None]:
import numpy as np

# Average the document rows of each category into a single vector per category
mean_tfidf_category_1 = tfidf_category_1.toarray().mean(axis=0)
mean_tfidf_category_2 = tfidf_category_2.toarray().mean(axis=0)

### Step 4: Compute Cosine Similarity Between the Two Categories
Finally, we compute the cosine similarity between the two average TF-IDF vectors.

The resulting cosine similarity value (between 0 and 1) gives us a measure of how similar the two categories are in terms of their TF-IDF vector representations. A value closer to 1 indicates that the two categories share more similar word distributions, while a value closer to 0 indicates less similarity.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# View each mean vector as a single-row 2-D array before comparing them
mean_tfidf_category_1 = np.reshape(mean_tfidf_category_1, (1, -1))
mean_tfidf_category_2 = np.reshape(mean_tfidf_category_2, (1, -1))

# One row against one row yields a 1x1 similarity matrix
cosine_sim_between_categories = cosine_similarity(
    mean_tfidf_category_1,
    mean_tfidf_category_2,
)

# Show the single similarity value
cosine_sim_between_categories[0, 0]


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Take the first 10 rows of each category
is_space = df['target_names'] == 'sci.space'
is_graphics = df['target_names'] == 'comp.graphics'
category_1 = df[is_space].head(10)
category_2 = df[is_graphics].head(10)

# Vectorize both samples together so they share the same feature columns
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
combined_text = pd.concat([category_1, category_2])
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_text['cleaned_text'])

# Rows come back in concatenation order: category 1 first, then category 2
n_category_1 = len(category_1)
tfidf_category_1 = tfidf_matrix[:n_category_1]
tfidf_category_2 = tfidf_matrix[n_category_1:]

# Similarity of every category-1 document (rows) to every category-2 document (columns)
cosine_sim_matrix = cosine_similarity(tfidf_category_1, tfidf_category_2)

# Annotated heatmap of the pairwise similarities, without tick labels
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cosine_sim_matrix,
    annot=True,
    cmap='coolwarm',
    xticklabels=False,
    yticklabels=False,
    ax=ax,
)
ax.set_title("Cosine Similarity between Documents from 'sci.space' and 'comp.graphics'")
ax.set_xlabel("Category 2: comp.graphics")
ax.set_ylabel("Category 1: sci.space")
plt.show()


# Topic Modeling

## Step 1: Install Required Libraries

In [None]:
pip install numpy pandas scikit-learn nltk pyLDAvis

## Step 2: Load the Dataset

We'll use the ``fetch_20newsgroups`` function from ``scikit-learn`` to load the dataset.

In [None]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

# Fetch the 'all' subset again, asking the loader to strip headers, footers
# and quotes, and keep only the raw texts for topic modeling
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
documents = newsgroups.data


## Step 3: Text Preprocessing

We need to preprocess the text by tokenizing, removing stop words, and stemming the words (the code below uses NLTK's Porter stemmer; no lemmatization is performed).

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

# Download NLTK resources
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# One stemmer instance shared by every call instead of being rebuilt per document
_stemmer = PorterStemmer()

def preprocess_text(text):
    """Return `text` as space-joined, stemmed, lowercase tokens without stop words.

    Args:
        text: Raw document string.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    # Replace (rather than delete) every non-letter with a space, so words
    # joined only by punctuation or digits (e.g. "end.Start") stay separate
    # tokens instead of fusing into one; this matches the earlier cleaners
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Tokenize the text
    tokens = text.lower().split()
    # Remove stop words and stem the words
    tokens = [_stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Preprocess the documents
preprocessed_documents = [preprocess_text(doc) for doc in documents]


## Step 4: Vectorization
Convert the preprocessed text into a numerical format using the TF-IDF vectorizer.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF Vectorizer
# max_features=1000 caps the vocabulary size, which also fixes the number of columns of X
vectorizer = TfidfVectorizer(max_features=1000)  # Limit to top 1000 features
# Learn the vocabulary from the preprocessed documents and transform them in one step
X = vectorizer.fit_transform(preprocessed_documents)

## Step 5: Topic Modeling with LDA
We'll use LDA for topic modeling on the TF-IDF matrix.

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of topics the model is asked to find
num_topics = 10

# random_state is fixed at 42 for the model's random number generation
lda = LatentDirichletAllocation(
    n_components=num_topics,
    random_state=42,
)
lda.fit(X)


## Step 6: Display Topics
Now let's display the topics along with the top words associated with each topic.

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print each topic of `model` followed by its highest-weighted feature names.

    Args:
        model: Object exposing `components_`, iterated as one weight vector
            per topic.
        feature_names: Sequence mapping a feature index to its name.
        no_top_words: Number of names to print per topic.

    For every topic a "Topic N:" header (N starting at 1) is printed, then one
    line with the names ordered from largest weight to smallest.
    """
    for number, weights in enumerate(model.components_, start=1):
        print(f"Topic {number}:")
        # argsort is ascending, so reverse it and keep the leading entries
        strongest_first = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in strongest_first]
        print(" ".join(top_terms))

# Look up the vectorizer's feature names, then print the top 10 per topic
feature_names = vectorizer.get_feature_names_out()
no_top_words = 10
display_topics(
    model=lda,
    feature_names=feature_names,
    no_top_words=no_top_words,
)


## Step 7: Visualizing the Topics

To visualize the topics, we can use ``pyLDAvis``.

In [None]:
import pyLDAvis
import pyLDAvis.lda_model

# Prepare LDA visualization
# NOTE(review): enable_notebook() presumably switches pyLDAvis to inline notebook rendering — confirm against the pyLDAvis docs
pyLDAvis.enable_notebook()
# Build the visualization data from the fitted model, the TF-IDF matrix and the vectorizer;
# mds='tsne' presumably selects t-SNE for the 2-D topic layout — confirm against the pyLDAvis docs
vis = pyLDAvis.lda_model.prepare(lda, X, vectorizer, mds='tsne')
# Last expression of the cell, so the notebook shows whatever display() returns
pyLDAvis.display(vis)
