In [1]:
import nltk
import re
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics import normalized_mutual_info_score
from matplotlib import pyplot as plt

In [2]:
# # needed for running the notebook, uncomment if running for the first time
# nltk.download('punkt')
# nltk.download('stopwords')
# !pip install wordcloud

In [3]:
# load data
data = pd.read_csv('abstractdata5.csv', sep = '#', header = None)
data.set_axis(['id', 'class', 'title', 'abstract'], axis = 1, inplace = True)

### 1. Baseline approach

In [4]:
# combine title and abstract and tokenize
df = pd.DataFrame()
df['text'] = data['title'] + " " + data['abstract']
df['tokenized'] = df.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)

In [5]:
# remove stopwords
stop_words = set(stopwords.words('english'))
df['preprocessed'] = df['tokenized'].apply(lambda x: [item.lower() for item in x if item not in stop_words])

In [6]:
# stem
porter = PorterStemmer()
df['preprocessed'] = df['preprocessed'].apply(lambda x: [porter.stem(word) for word in x])
df['preprocessed'] = df['preprocessed'].str.join(" ")

In [7]:
# tf-idf representation
vectorizer = TfidfVectorizer()
df_tfidf = vectorizer.fit_transform(df['preprocessed'])

In [8]:
# apply k-means
kmeans = KMeans(n_clusters = 5, random_state = 42)
kpreds = kmeans.fit_predict(df_tfidf)

In [9]:
# NMI score
baseline_score = normalized_mutual_info_score(data['class'], kpreds, average_method = 'geometric')
print("Baseline NMI:", np.round(baseline_score, 3))

Baseline NMI: 0.691


### 2.  Spectral clustering

In [10]:
df1 = pd.DataFrame()
df1['text'] = data['title'] + " " + data['abstract']

# load custom stopword file
# taken from https://github.com/stopwords-iso/stopwords-en/blob/master/stopwords-en.txt
file = open("stopwords-en.txt", "r", encoding = 'utf-8')
content = file.readlines()
stop_words = [c.rstrip('\n') for c in content]

In [11]:
# initial character substitution
df1['text'] = df1['text'].apply(lambda x: re.sub(r"http\S+", "", x))
df1['text'] = df1['text'].apply(lambda x: x.replace("/", " "))
df1['text'] = df1['text'].apply(lambda x: x.replace("\\", " "))
df1['text'] = df1['text'].apply(lambda x: x.replace("-", " "))

# tokenize
df1['tokenized'] = df1.apply(lambda row: nltk.word_tokenize(row['text']), axis = 1)

In [12]:
# remove words with non-ascii chars
def is_ascii(w):
    try:
        w.encode().decode("us-ascii")
        return True
    except UnicodeDecodeError:
        return False

df1['preprocessed'] = df1['tokenized'].apply(lambda x: [item.lower() for item in x if is_ascii(item)])

# remove stopwords, punctuation, digits
df1['preprocessed'] = df1['preprocessed'].apply(lambda x: [item for item in x if item not in stop_words])
df1['preprocessed'] = df1['preprocessed'].apply(lambda x: [item for item in x if item not in string.punctuation])
df1['preprocessed'] = df1['preprocessed'].apply(lambda x: [item for item in x if not any(c.isdigit() for c in item)])

In [13]:
# stemming
snowball = SnowballStemmer(language='english')
df1['preprocessed'] = df1['preprocessed'].apply(lambda x: [snowball.stem(word) for word in x])
df1['preprocessed'] = df1['preprocessed'].str.join(" ")

In [14]:
# tf-idf representation
cv = CountVectorizer(ngram_range = (1, 3), min_df = 0.003)
tfidf = TfidfTransformer()

df1_cv = cv.fit_transform(df1['preprocessed'])
df1_tfidf = tfidf.fit_transform(df1_cv)

In [15]:
# perform SVD and normalization
svd = TruncatedSVD(n_components = 5, random_state = 42)
normalizer = Normalizer()
lsa = make_pipeline(svd, normalizer)

df1_tfidf_transformed = lsa.fit_transform(df1_tfidf)

In [16]:
# apply KMeans and spectral clustering
spectral = SpectralClustering(n_clusters = 5, random_state = 42, affinity = 'rbf', gamma = 0.05)

kpreds1 = kmeans.fit_predict(df1_tfidf_transformed)
spreds = spectral.fit_predict(df1_tfidf_transformed)

In [17]:
# NMI scores
kscore = normalized_mutual_info_score(data['class'], kpreds1, average_method = 'geometric')
sscore = normalized_mutual_info_score(data['class'], spreds, average_method = 'geometric')

print("Kmeans NMI:", np.round(kscore, 3))
print("Spectral clustering NMI:", np.round(sscore, 3))

Kmeans NMI: 0.813
Spectral clustering NMI: 0.815


### 3. Topics

In [18]:
tf = pd.DataFrame(df1_cv.toarray(), columns = cv.get_feature_names_out())

for i in range(5):
    print("Topic", i + 1)
    
    words = tf[spreds == i].sum().sort_values(ascending=False)[:15]
    print(words, "\n\n")

Topic 1
robot       862
control     339
base        197
perform     190
model       181
task        158
propos      151
develop     144
design      142
learn       135
approach    129
method      121
environ     119
soft        116
studi       115
dtype: int64 


Topic 2
databas          694
data             607
relat            389
queri            279
relat databas    257
model            211
base             195
approach         162
propos           141
process          138
paper            116
system           116
sql              111
manag            109
develop          108
dtype: int64 


Topic 3
secur           697
base            401
propos          368
scheme          335
data            324
encrypt         309
key             304
quantum         257
cryptographi    251
protocol        240
comput          235
attack          231
algorithm       214
implement       188
effici          159
dtype: int64 


Topic 4
compil       631
program      372
code         264
comput       2