In [7]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
# Suppresses every Python warning for the rest of the kernel session, not only
# those raised by this notebook's own code. NOTE(review): consider narrowing
# this to specific categories so real problems are not hidden.
warnings.filterwarnings('ignore')

In [15]:

# Fetch every NLTK resource used by the preprocessing cells.
# Each entry is (status message or None, package id, quiet flag). Order and
# flags mirror the original cell, so punkt_tab still logs its own progress.
nltk_resources = [
    (None, 'punkt_tab', False),
    ("        Downloading stopwords...", 'stopwords', True),
    ("        Downloading tokenizer...", 'punkt', True),
    ("        Downloading lemmatizer...", 'wordnet', True),
    (None, 'omw-1.4', True),
]

failed_downloads = []
for status_message, package_id, quiet in nltk_resources:
    if status_message is not None:
        print(status_message)
    # nltk.download() reports failure through its return value instead of
    # raising, so the result has to be checked explicitly.
    if not nltk.download(package_id, quiet=quiet):
        failed_downloads.append(package_id)

if failed_downloads:
    # Do not claim success when something could not be fetched (e.g. offline).
    print(f" NLTK download failed for: {', '.join(failed_downloads)} "
          "(previously installed copies, if any, are still usable)")
else:
    print(" NLTK data ready!")

[nltk_data] Downloading package punkt_tab to C:\Users\Mr.
[nltk_data]     Pandey\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


        Downloading stopwords...
        Downloading tokenizer...
        Downloading lemmatizer...
 NLTK data ready!


In [9]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [10]:
# Training split of 20 Newsgroups with headers, footers and quoted replies
# stripped, leaving only the message bodies.
newsgroups = fetch_20newsgroups(
    subset='train', remove=('headers', 'footers', 'quotes'), random_state=42
)

# Start from the raw text and numeric label, then slot the readable category
# name in between so the column order is text, category, label.
df = pd.DataFrame({'text': newsgroups.data, 'label': newsgroups.target})
df.insert(
    1,
    'category',
    [newsgroups.target_names[target_id] for target_id in newsgroups.target],
)

print(f" Loaded {len(df):,} documents")

 Loaded 11,314 documents


In [11]:
# Pick one raw document (by position) to inspect before any cleaning.
sample_idx = 100
sample_text = df['text'].iloc[sample_idx]
sample_category = df['category'].iloc[sample_idx]

# Category, a separator rule, the first 400 characters, and a continuation marker.
print(
    f"        Category: {sample_category}",
    f"        {'='*65}",
    f"        {sample_text[:400]}",
    "        [...continues...]",
    sep="\n",
)

        Category: misc.forsale
        1.  Software publishing SuperBase 4 windows v.1.3           --->$80

2.  OCR System ReadRight v.3.1 for Windows                  --->$65

3.  OCR System ReadRight  v.2.01 for DOS                    --->$65

4.  Unregistered Zortech 32 bit C++ Compiler v.3.1          --->$ 250
     with Multiscope windows Debugger,
     WhiteWater Resource Toolkit, Library Source Code

5.  Glockenspiel/ImageSoft Co
        [...continues...]


In [13]:

# sample_idx / sample_text / sample_category are defined once, in the
# sample-inspection cell above. They are deliberately not redefined here so a
# change to the sample there is not silently reset to 100 by this cell.

print("\n Building preprocessing pipeline...")

# Shared resources read by preprocess_text; built once at notebook level
# rather than on every call. A set gives fast membership tests when filtering.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize one raw document into a space-separated string of lemmas.

    Steps, in order: lowercase; strip URLs and e-mail addresses; replace every
    character that is not a-z or whitespace with a space; tokenize; drop
    stopwords and tokens of two characters or fewer; lemmatize.

    Relies on the notebook-level ``stop_words`` and ``lemmatizer`` objects and
    on ``word_tokenize``.

    Args:
        text: Raw document text. Non-string values (e.g. NaN) yield ''.

    Returns:
        str: The surviving lemmas joined by single spaces; '' if none survive.
    """
    # Missing values reach .apply() as float NaN; treat them as empty documents
    # instead of failing on .lower().
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # Remove URLs and e-mail addresses first, while the punctuation that
    # identifies them is still present.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)

    # Replace digits and punctuation with a space rather than deleting them.
    # Deleting fuses neighbouring words ("windows/dos" -> "windowsdos") and
    # turns contractions into non-stopwords ("don't" -> "dont"); with a space,
    # "don't" becomes "don" + "t", both of which are filtered out below.
    # Digits are covered by this class, so no separate \d pass is needed.
    text = re.sub(r'[^a-z\s]+', ' ', text)

    tokens = word_tokenize(text)

    # Filter on the surface form (stopword / length), then lemmatize what remains.
    lemmas = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 2
    ]

    return ' '.join(lemmas)

# Status message: reached only after the def above has executed, i.e. once
# preprocess_text exists in the kernel.
print(" Preprocessing function ready!")





 Building preprocessing pipeline...
 Preprocessing function ready!


In [16]:
# Run the pipeline on the single sample document as a quick check.
# The result is stored but not displayed by this cell.
cleaned_sample = preprocess_text(sample_text)

In [17]:
from tqdm import tqdm

# Register progress_apply on pandas objects, then clean every document and
# attach the result as the 'cleaned_text' column.
tqdm.pandas(desc="        Progress")
df = df.assign(cleaned_text=df['text'].progress_apply(preprocess_text))

        Progress: 100%|██████████| 11314/11314 [00:29<00:00, 377.24it/s]


In [18]:
# Flag documents whose text was removed entirely by cleaning. The mask is
# computed once and reused for both the report and the filter.
is_empty = df['cleaned_text'].str.strip() == ''
empty_docs = df[is_empty]
print(f" Empty documents: {len(empty_docs)}")

if len(empty_docs) > 0:
    # .copy() makes the filtered frame independent of the original, so columns
    # added to df in later cells do not hit view-vs-copy ambiguity.
    # Row labels are left as-is (no reset_index): label-based lookups keep
    # working, but positional lookups such as df.iloc[100] may now refer to a
    # different document than before.
    df = df[~is_empty].copy()
else:
    print(" No empty documents!")

 Empty documents: 320
