In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import TruncatedSVD

In [4]:
df = pd.read_csv('abstractdata5.csv', sep='#', names=['id', 'class', 'title', 'abstract'])
df

Unnamed: 0,id,class,title,abstract
0,id1,1,Anomaly detection in wide area imagery [Geniş ...,This study is about detecting anomalies in wid...
1,id2,1,Person re-identification with deep kronecker-p...,Person re-identification (re-ID) aims to robus...
2,id3,1,Crack detection in images of masonry using cnns,While there is a significant body of research ...
3,id4,5,Towards an energy efficient code generator for...,Using a smartphone become the part of our ever...
4,id5,5,Sub-polyhedral scheduling using (Unit-)two-var...,Polyhedral compilation has been successful in ...
...,...,...,...,...
1327,id1328,1,Colorimetric point-of-care paper-based sensors...,Creatinine is a clinically significant analyte...
1328,id1329,1,Calcium identification and scoring based on ec...,"Currently, an echocardiography expert is neede..."
1329,id1330,1,Considering filter importance and irreplaceabi...,Deep convolutional neural network (CNNs) have ...
1330,id1331,4,Low-complexity bit-serial sequential polynomia...,GF(2m) multiplication is a complex and perform...


source for punctuation removal: https://stackoverflow.com/questions/39782418/remove-punctuations-in-pandas
source for stopword removal:
https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe

In [43]:
def countAndSort(cluster):
  v = CountVectorizer()
  x = v.fit_transform(cluster)

  counts = x.toarray().sum(axis=0)
  features = v.get_feature_names_out()

  d = dict(zip(features, counts))
  sortedByCount = sorted(d.items(), key=lambda x: x[1], reverse=True)
  
  return sortedByCount


# Remove punctuation
df['joined'] = df['title'] + df['abstract']
df['noPunct'] = df['joined'].str.replace('[^\w\s]','')

# Remove digits
df['noPunct'] = df['noPunct'].str.replace('\d+', '')

# Remove unicode
df['noPunct'].str.encode('ascii', 'ignore').str.decode('ascii')

# Lowercase
df['noPunct'] = df['noPunct'].str.lower()

# Remove stopwords
stop = stopwords.words('english')
df['noStopwords'] = df['noPunct'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

  df['noPunct'] = df['joined'].str.replace('[^\w\s]','')
  df['noPunct'] = df['noPunct'].str.replace('\d+', '')


In [44]:
a = countAndSort(df['noStopwords'])
a

[('data', 1385),
 ('using', 939),
 ('proposed', 840),
 ('system', 834),
 ('based', 794),
 ('model', 687),
 ('paper', 683),
 ('results', 594),
 ('method', 590),
 ('used', 563),
 ('performance', 562),
 ('information', 537),
 ('systems', 532),
 ('approach', 527),
 ('learning', 483),
 ('security', 474),
 ('algorithm', 471),
 ('database', 469),
 ('different', 440),
 ('also', 435),
 ('image', 423),
 ('analysis', 420),
 ('design', 409),
 ('time', 406),
 ('detection', 399),
 ('methods', 393),
 ('new', 391),
 ('compiler', 388),
 ('use', 379),
 ('two', 372),
 ('applications', 371),
 ('computer', 367),
 ('network', 360),
 ('show', 351),
 ('however', 341),
 ('key', 341),
 ('work', 331),
 ('quantum', 330),
 ('images', 329),
 ('vision', 329),
 ('research', 328),
 ('study', 320),
 ('control', 318),
 ('relational', 315),
 ('algorithms', 312),
 ('process', 304),
 ('framework', 302),
 ('one', 293),
 ('problem', 285),
 ('present', 279),
 ('techniques', 278),
 ('scheme', 276),
 ('accuracy', 272),
 ('code'

In [6]:
# Stem
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

def stem_text(text):
    ps = SnowballStemmer('english')
    return " ".join([ps.stem(w) for w in w_tokenizer.tokenize(text)])

df['preprocessed'] = df['noStopwords'].apply(stem_text)

In [109]:
a = countAndSort(df['preprocessed'])
a

[('use', 2024),
 ('data', 1385),
 ('system', 1368),
 ('propos', 1238),
 ('model', 1061),
 ('method', 983),
 ('comput', 979),
 ('perform', 891),
 ('base', 826),
 ('imag', 805),
 ('algorithm', 798),
 ('robot', 789),
 ('result', 771),
 ('secur', 737),
 ('paper', 711),
 ('approach', 699),
 ('databas', 687),
 ('applic', 673),
 ('design', 641),
 ('develop', 597),
 ('compil', 589),
 ('present', 582),
 ('inform', 560),
 ('process', 558),
 ('detect', 555),
 ('learn', 553),
 ('implement', 547),
 ('network', 540),
 ('differ', 523),
 ('provid', 520),
 ('relat', 517),
 ('show', 514),
 ('improv', 505),
 ('time', 498),
 ('techniqu', 484),
 ('control', 480),
 ('optim', 472),
 ('studi', 461),
 ('program', 460),
 ('also', 435),
 ('effici', 433),
 ('work', 427),
 ('evalu', 424),
 ('generat', 423),
 ('analysi', 420),
 ('problem', 411),
 ('scheme', 410),
 ('research', 395),
 ('new', 391),
 ('requir', 385),
 ('object', 382),
 ('compar', 378),
 ('featur', 378),
 ('key', 374),
 ('two', 372),
 ('effect', 364),

In [129]:
# tf-idf
v = TfidfVectorizer(max_df=0.25, min_df=1, ngram_range=(1,2), stop_words=['data','use', 'propos'])
x = v.fit_transform(df['preprocessed'])

In [111]:
v.stop_words_

{'polynomialtim interv',
 'develop empir',
 'semwebnet userfriend',
 'floor human',
 'respons extern',
 'account weight',
 'process feasibl',
 'organ databas',
 'search ns',
 'inform processor',
 'featur raw',
 'evidenc number',
 'also morpholog',
 'process simplifi',
 'weight recurr',
 'client end',
 'convent displac',
 'prefer store',
 'process way',
 'cryptograph role',
 'one qubit',
 'flight number',
 'subsampl pixel',
 'kelvin voigtgener',
 'newgener relat',
 'spoof prevent',
 'elbow',
 'select flexibl',
 'secret lwe',
 'therebi construct',
 'compil static',
 'cyber health',
 'vesselspe',
 'featur extractionth',
 'imag msi',
 'support user',
 'import artifici',
 'reconfigur shapemorph',
 'upper lower',
 'monitor scenario',
 'assist advanc',
 'moreov metric',
 'collabor compil',
 'specif detect',
 'current major',
 'tls method',
 'support opportun',
 'understand reviewsinc',
 'almost half',
 'forget problem',
 'yield poor',
 'sensor fulfil',
 'modul ntt',
 'captur environment',
 'l

In [112]:
# Calculate NMI
clustering = KMeans(n_clusters=5, n_init=10, random_state=2).fit(x)

labels = clustering.labels_

normalized_mutual_info_score(labels, df['class'])

0.7721250155982129

In [135]:
# LSA
lsaApplied = TruncatedSVD(n_components=100, random_state=5).fit_transform(x)

In [136]:
# Spectral clustering with cosine distance. Laplacian ?
clustering = SpectralClustering(n_clusters=5, affinity='cosine', random_state=2, assign_labels='discretize', ).fit(lsaApplied)

labels = clustering.labels_

normalized_mutual_info_score(labels, df['class'], )

0.8113642267466998

In [11]:
df["clusteringLabel"] = labels


In [137]:
cluster0 = df[df['clusteringLabel'] == 0]
cluster1 = df[df['clusteringLabel'] == 1]
cluster2 = df[df['clusteringLabel'] == 2]
cluster3 = df[df['clusteringLabel'] == 3]
cluster4 = df[df['clusteringLabel'] == 4]

c0 = countAndSort(cluster0['preprocessed'])[0:30]
c1 = countAndSort(cluster1['preprocessed'])[0:30]
c2 = countAndSort(cluster2['preprocessed'])[0:30]
c3 = countAndSort(cluster3['preprocessed'])[0:30]
c4 = countAndSort(cluster4['preprocessed'])[0:30]

print(
  "cluster0:\n", c0, "\n------------------------------------------\n",
  "cluster1:\n", c1, "\n------------------------------------------\n",
  "cluster2:\n", c2, "\n------------------------------------------\n",
  "cluster3:\n", c3, "\n------------------------------------------\n",
  "cluster4:\n", c4, "\n------------------------------------------\n")

# cluster0 databases
# cluster1 datascience(datamining, machinelearning)???
# cluster2 information security
# cluster3 robotics
# cluster4 programming


cluster0:
 [('use', 757), ('imag', 644), ('method', 608), ('propos', 505), ('detect', 481), ('model', 480), ('system', 430), ('comput', 394), ('perform', 353), ('result', 324), ('network', 323), ('learn', 317), ('data', 308), ('vision', 307), ('base', 297), ('algorithm', 296), ('approach', 267), ('object', 264), ('featur', 245), ('dataset', 243), ('deep', 233), ('accuraci', 227), ('paper', 227), ('differ', 216), ('studi', 213), ('applic', 211), ('improv', 202), ('develop', 195), ('research', 195), ('process', 192)] 
------------------------------------------
 cluster1:
 [('compil', 565), ('program', 369), ('use', 348), ('comput', 275), ('code', 253), ('languag', 224), ('optim', 206), ('system', 205), ('implement', 179), ('applic', 177), ('paper', 170), ('algorithm', 161), ('design', 154), ('perform', 151), ('memori', 149), ('graph', 145), ('parallel', 145), ('present', 137), ('time', 135), ('approach', 133), ('base', 131), ('oper', 129), ('data', 127), ('techniqu', 124), ('model', 123)