In [11]:
import time
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.datasets import fetch_20newsgroups

import torch
from torch.utils.data import Dataset, DataLoader

# Make sure every NLTK resource used below is installed. nltk.download()
# reports packages that are already present as up to date.
# 'punkt_tab' is the tokenizer data that word_tokenize() loads on newer NLTK
# releases (3.9+), where it replaced the pickled 'punkt' models; 'punkt' is
# still fetched so the cell keeps working on older NLTK installations.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Custom print functions
def print_timer_info(message):
    """Print *message* to stdout prefixed with the ``[TIMER INFO]`` tag."""
    tagged_line = "[TIMER INFO] {}".format(message)
    print(tagged_line)

def print_output_data(message):
    """Print *message* to stdout prefixed with the ``[OUTPUT DATA]`` tag."""
    tagged_line = "[OUTPUT DATA] {}".format(message)
    print(tagged_line)

# Load the dataset
start_time = time.time()
print_timer_info("Loading the 20 Newsgroups dataset...")
# Request the 'all' subset with headers, footers and quoted text removed.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
# One row per post: the raw text, its integer label, and the label's name.
df = pd.DataFrame({'Text': newsgroups.data, 'Category': newsgroups.target})
category_names = newsgroups.target_names
df['Category Name'] = df['Category'].map(lambda label: category_names[label])
load_seconds = time.time() - start_time
print_timer_info(f"Time taken to load dataset: {load_seconds:.2f} seconds")

# Pre-processing
print_timer_info("Starting pre-processing...")
preprocess_start_time = time.time()

# Step 1 - normalise the text column in place: lowercase it, then delete every
# character that is neither an ASCII letter nor whitespace.
# NOTE(review): characters are deleted, not replaced by a space, so words
# joined only by punctuation are fused (e.g. "don't" -> "dont"); confirm this
# is intended before changing it.
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
df['Text'] = df['Text'].str.lower().map(
    lambda document: non_letter_pattern.sub('', document)
)

# Step 2 - split each cleaned document into word tokens.
df['Tokens'] = df['Text'].map(word_tokenize)

# Step 3 - drop English stop words, then reduce each surviving token to its
# Porter stem. The stop-word test runs on the unstemmed token.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
df['Tokens'] = df['Tokens'].map(
    lambda words: [stemmer.stem(word) for word in words if word not in stop_words]
)

preprocess_seconds = time.time() - preprocess_start_time
print_timer_info(f"Total time for pre-processing: {preprocess_seconds:.2f} seconds")

# Report the token count of the longest document in the corpus
max_token_length = max(map(len, df['Tokens']))
print_output_data(f"Maximum length of tokenized text: {max_token_length}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shalo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shalo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[TIMER INFO] Loading the 20 Newsgroups dataset...
[TIMER INFO] Time taken to load dataset: 1.62 seconds
[TIMER INFO] Starting pre-processing...
[TIMER INFO] Total time for pre-processing: 27.92 seconds
[OUTPUT DATA] Maximum length of tokenized text: 6620
