In [2]:
pip install matplotlib

Collecting matplotlib
  Downloading matplotlib-3.10.0-cp312-cp312-macosx_10_13_x86_64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.1-cp312-cp312-macosx_10_13_x86_64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.55.3-cp312-cp312-macosx_10_13_x86_64.whl.metadata (165 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.8-cp312-cp312-macosx_10_13_x86_64.whl.metadata (6.2 kB)
Collecting pillow>=8 (from matplotlib)
  Downloading pillow-11.1.0-cp312-cp312-macosx_10_13_x86_64.whl.metadata (9.1 kB)
Downloading matplotlib-3.10.0-cp312-cp312-macosx_10_13_x86_64.whl (8.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.2/8.2 MB 46.9 MB/s eta 0:00:00
Downloading contourpy-1.3.1-cp312-cp312-macosx_10_13_x86_64.whl (271 kB)
U

In [3]:
# Import necessary libraries
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Step 1: Load the dataset
# subset='all' requests every available post; the `remove` tuple asks the loader
# to strip headers, footers and quoted replies from each post.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Step 2: Create a DataFrame for better handling
# Each numeric target is translated to its newsgroup name via target_names.
data = pd.DataFrame({
    'Text': newsgroups.data,
    'Category': [newsgroups.target_names[label] for label in newsgroups.target]
})
print(f"Initial size of the dataset: {data.shape[0]} rows")
# NOTE: dropna() only removes NaN/None entries. In the recorded run it dropped
# no rows, and posts with zero words (empty text) are still kept.
data = data.dropna()  # Remove rows with null values
print(f"Size after removing null values: {data.shape[0]} rows")

# Step 3: Save the full dataset as CSV (row index omitted).
# Create the target directory first so the export does not fail on a fresh checkout.
Path('../Data').mkdir(parents=True, exist_ok=True)
data.to_csv('../Data/20NewsGroup.csv', index=False)


Initial size of the dataset: 18846 rows
Size after removing null values: 18846 rows


In [1]:
# Draw 500 rows at random; fixing the seed keeps the selection identical on every run
df_sample = data.sample(random_state=42, n=500)

# Write the subset to its own CSV under ../Data, leaving out the row index
df_sample.to_csv(path_or_buf='../Data/20NewsGroup_500.csv', index=False)


In [2]:
# Preview five rows of the sample; the fixed seed shows the same rows on every run
df_sample.sample(5, random_state=42)

Unnamed: 0,Text,Category
3776,\n\nI had this problem when I first loaded win...,comp.os.ms-windows.misc
15877,\n\nWhich listsev was this and is the discussi...,soc.religion.christian
14081,# \n# \n# Is there an Xt call to give me my ap...,comp.windows.x
6746,"\nPlease, PAY ATTENTION.\nI, and others, were ...",talk.politics.guns
17115,"\n]The ""corrupted over and over"" theory is pr...",alt.atheism


In [4]:
# Add a per-post word count: the number of whitespace-separated tokens in 'Text'
data["Word Count"] = data["Text"].apply(lambda x: len(str(x).split()))

# Calculate minimum, maximum, and median word counts
min_words = data["Word Count"].min()
max_words = data["Word Count"].max()
median_words = data["Word Count"].median()

# The heading names 'Text', the column the counts above are computed from
print("Word Count Statistics for 'Text':")
print(f"DF shape: {data.shape}")
print(f"Lowest number of words: {min_words}")
print(f"Highest number of words: {max_words}")
print(f"Median number of words: {median_words}\n")


Word Count Statistics for 'Description':
DF shape: (18846, 3)
Lowest number of words: 0
Highest number of words: 11765
Median number of words: 83.0



In [6]:
# Count the number of words in the 'Text' column
# (recomputed here so this cell does not depend on an earlier cell having added it)
data["Word Count"] = data["Text"].apply(lambda x: len(str(x).split()))

# Calculate the 10th percentile and 90th percentile
p10 = data["Word Count"].quantile(0.10)
p90 = data["Word Count"].quantile(0.90)

print(f"10th Percentile (P10): {p10}")
print(f"90th Percentile (P90): {p90}\n")

# Keep only rows strictly between the two percentiles: posts whose word count
# equals p10 or p90 are excluded as well. The explicit copy makes df_filtered
# an independent frame rather than a slice derived from `data`.
df_filtered = data[data["Word Count"].between(p10, p90, inclusive="neither")].copy()

# Calculate statistics after filtering
# NOTE: these names are reassigned here from the filtered frame.
min_words = df_filtered["Word Count"].min()
max_words = df_filtered["Word Count"].max()
median_words = df_filtered["Word Count"].median()

# The heading names 'Text', the column the counts are computed from
print("Word Count Statistics for 'Text' (After Removing Lowest and Highest 10%):")
print(f"DF shape: {df_filtered.shape}")
print(f"Lowest number of words: {min_words}")
print(f"Highest number of words: {max_words}")
print(f"Median number of words: {median_words}\n")


10th Percentile (P10): 18.0
90th Percentile (P90): 331.0

Word Count Statistics for 'Description' (After Removing Lowest and Highest 10%):
DF shape: (15021, 3)
Lowest number of words: 19
Highest number of words: 330
Median number of words: 83.0

