In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [2]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords

In [12]:
import matplotlib.pyplot as plt
import seaborn as sns

# Six newsgroups to load: two computing, two sports and two religion-related groups.
categories = [
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'alt.atheism',
 'soc.religion.christian',
]
# Load the training split restricted to the groups above, in shuffled order.
# NOTE(review): the posts are loaded with their headers (every document starts
# with "From: ..."), so header tokens take part in the later clustering —
# consider whether `remove=('headers', 'footers', 'quotes')` is wanted.
dataset = fetch_20newsgroups(subset='train', categories=categories, shuffle=True)

In [13]:
# One row per post, raw text in the "corpus" column; later cells add derived columns.
df = pd.DataFrame(dataset.data, columns=["corpus"])
print(df)

                                                 corpus
0     From: ferguson@cs.rochester.edu (George Fergus...
1     From: roger@crux.Princeton.EDU (Roger Lustig)\...
2     From: johnsd2@rpi.edu (Dan Johnson)\nSubject: ...
3     From: tdawson@engin.umich.edu (Chris Herringsh...
4     From: SITUNAYA@IBM3090.BHAM.AC.UK\nSubject: te...
...                                                 ...
3446  From: bobbe@vice.ICO.TEK.COM (Robert Beauchain...
3447  From: galvint@cs.nps.navy.mil (thomas galvin)\...
3448  From: tmc@spartan.ac.BrockU.CA (Tim Ciceran)\n...
3449  From: khettry@r1w2.pub.utk.edu (23064RFL)\nSub...
3450  From: cs902043@ariel.yorku.ca (SHAWN LUDDINGTO...

[3451 rows x 1 columns]


In [14]:
# Summary of the text column: row count, number of distinct posts, most frequent value.
df.describe()

Unnamed: 0,corpus
count,3451
unique,3451
top,From: ferguson@cs.rochester.edu (George Fergus...
freq,1


In [15]:
# Fetch the NLTK stopword corpus read by `stopwords.words(...)`.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
import nltk
from nltk.corpus import stopwords
# Display the English stopword list for inspection.
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [8]:
# Stopword sets keyed by language, filled on first use so the corpus file is
# read once per kernel session instead of once per token.
_STOPWORD_CACHE: dict = {}


def _stopword_set(language: str = "english") -> frozenset:
    """Return the NLTK stopword list for `language` as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def preprocess_text(text: str, remove_stopwords: bool) -> str:
    """Normalise a raw document into lowercase, letters-only text.

    Steps, in order:
      1. drop every token that starts with "http" (URLs);
      2. replace each run of non-ASCII-letter characters with one space;
      3. optionally tokenise with NLTK and drop English stopwords
         (compared case-insensitively);
      4. lowercase and strip surrounding whitespace.

    Args:
        text: Raw document text.
        remove_stopwords: When True, English stopwords are removed. This path
            needs the NLTK "stopwords" corpus and the "punkt" tokenizer data.

    Returns:
        The cleaned text, with words separated by single spaces.
    """
    text = re.sub(r"http\S+", "", text)
    text = re.sub("[^A-Za-z]+", " ", text)
    if remove_stopwords:
        # Set membership is O(1); the previous code rebuilt and linearly
        # scanned the stopword list for every single token.
        stop_words = _stopword_set("english")
        tokens = nltk.word_tokenize(text)
        text = " ".join(w for w in tokens if w.lower() not in stop_words)
    return text.lower().strip()

In [22]:
# Fetch the "punkt" tokenizer data; `nltk.word_tokenize` (used by
# preprocess_text when remove_stopwords=True) needs it to be available.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [23]:
# Clean every raw post (stopwords removed) and store it beside the original text.
df['cleaned'] = df['corpus'].map(
    lambda doc: preprocess_text(doc, remove_stopwords=True)
)
print(df)

                                                 corpus  \
0     From: ferguson@cs.rochester.edu (George Fergus...   
1     From: roger@crux.Princeton.EDU (Roger Lustig)\...   
2     From: johnsd2@rpi.edu (Dan Johnson)\nSubject: ...   
3     From: tdawson@engin.umich.edu (Chris Herringsh...   
4     From: SITUNAYA@IBM3090.BHAM.AC.UK\nSubject: te...   
...                                                 ...   
3446  From: bobbe@vice.ICO.TEK.COM (Robert Beauchain...   
3447  From: galvint@cs.nps.navy.mil (thomas galvin)\...   
3448  From: tmc@spartan.ac.BrockU.CA (Tim Ciceran)\n...   
3449  From: khettry@r1w2.pub.utk.edu (23064RFL)\nSub...   
3450  From: cs902043@ariel.yorku.ca (SHAWN LUDDINGTO...   

                                                cleaned  
0     ferguson cs rochester edu george ferguson subj...  
1     roger crux princeton edu roger lustig subject ...  
2     johnsd rpi edu dan johnson subject accepting j...  
3     tdawson engin umich edu chris herringshaw subj...  
4

In [29]:
# TF-IDF features over the cleaned text:
#   sublinear_tf=True -> term frequency is scaled as 1 + log(tf)
#   min_df=5          -> drop terms that occur in fewer than 5 documents
#   max_df=0.95       -> drop terms that occur in more than 95% of documents
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.95)
# X is a sparse document-term matrix (it is densified with .toarray()/.todense() later).
X = vectorizer.fit_transform(df['cleaned'])

In [30]:
from sklearn.cluster import KMeans
# Fit k-means with three clusters on the TF-IDF matrix (seeded for repeatable runs)
# and keep the per-document cluster labels.
kmeans = KMeans(n_clusters=3, random_state=42).fit(X)
clusters = kmeans.labels_

In [31]:
from sklearn.decomposition import PCA

# Reduce the TF-IDF vectors to two principal components, giving one (x0, x1)
# coordinate pair per document.
pca = PCA(n_components=2, random_state=42)
dense_X = X.toarray()  # densify the sparse matrix before handing it to PCA
pca_vecs = pca.fit_transform(dense_X)
x0, x1 = pca_vecs[:, 0], pca_vecs[:, 1]

In [34]:
# Attach the k-means label and the two PCA coordinates to each document row.
# The arrays are assigned positionally, so they must have one entry per row of df.
df['cluster'] = clusters
df['x0'] = x0
df['x1'] = x1
print(df)

                                                 corpus  \
0     From: ferguson@cs.rochester.edu (George Fergus...   
1     From: roger@crux.Princeton.EDU (Roger Lustig)\...   
2     From: johnsd2@rpi.edu (Dan Johnson)\nSubject: ...   
3     From: tdawson@engin.umich.edu (Chris Herringsh...   
4     From: SITUNAYA@IBM3090.BHAM.AC.UK\nSubject: te...   
...                                                 ...   
3446  From: bobbe@vice.ICO.TEK.COM (Robert Beauchain...   
3447  From: galvint@cs.nps.navy.mil (thomas galvin)\...   
3448  From: tmc@spartan.ac.BrockU.CA (Tim Ciceran)\n...   
3449  From: khettry@r1w2.pub.utk.edu (23064RFL)\nSub...   
3450  From: cs902043@ariel.yorku.ca (SHAWN LUDDINGTO...   

                                                cleaned  cluster        x0  \
0     ferguson cs rochester edu george ferguson subj...        2 -0.040863   
1     roger crux princeton edu roger lustig subject ...        2 -0.006483   
2     johnsd rpi edu dan johnson subject accepting j...  

In [35]:
def get_top_keywords(n_terms):
    """Print the `n_terms` highest-weighted terms of every cluster.

    Reads the notebook-level `X` (TF-IDF matrix), `clusters` (one label per
    row of `X`) and `vectorizer` (for the term vocabulary). For each cluster
    the TF-IDF weights are averaged over its documents and the terms with the
    largest mean weight are printed, comma-separated, in ascending order of
    weight (the strongest term comes last).

    Args:
        n_terms: Number of terms to print per cluster; must be >= 1.

    Raises:
        ValueError: If `n_terms` is smaller than 1.
    """
    if n_terms < 1:
        # Without this guard, n_terms=0 slices [-0:] and prints every term.
        raise ValueError("n_terms must be a positive integer")

    # Mean TF-IDF weight per term within each cluster. toarray() yields a
    # plain ndarray rather than the np.matrix returned by todense().
    cluster_means = pd.DataFrame(X.toarray()).groupby(clusters).mean()
    terms = vectorizer.get_feature_names_out()
    for cluster_id, mean_weights in cluster_means.iterrows():
        top_idx = np.argsort(mean_weights.to_numpy())[-n_terms:]
        print('\nCluster {}'.format(cluster_id))
        print(','.join([terms[t] for t in top_idx]))


get_top_keywords(10)


Cluster 0
say,think,jesus,writes,com,would,one,people,edu,god

Cluster 1
nntp,host,posting,file,graphics,university,thanks,com,edu,windows

Cluster 2
baseball,players,university,writes,article,year,ca,game,team,edu


In [36]:
# Relabel the k-means cluster ids as the strings "1".."3".
cluster_map = {0: "1", 1: "2", 2: "3"}
# Build the column from the original integer labels in `clusters`, not from
# df['cluster'] itself. Re-running this cell then gives the same result,
# instead of mapping the already-converted strings to NaN.
df['cluster'] = pd.Series(clusters, index=df.index).map(cluster_map)