In [1]:
import pandas as pd

abstracts = pd.read_csv(
    r'C:\Users\mysmu\Desktop\Natural Language Processing\nlp-cse-uoi-2025\data_new\abstracts.txt',
    sep=r'\|\-\-\|',      # regex για το '|--|'
    engine='python',      # απαραίτητο για multi-char regex sep
    header=None,          # αν δεν έχει header
    names=['paper','abstract']  # όρισε ονόματα στη λίστα αν θέλεις
)

print(abstracts.head())


   paper                                           abstract
0      0  The development of an automated system for the...
1      1  This paper proposes a novel hybrid forward alg...
2      2  Modern CCD cameras are usually capable of a sp...
3      3  This paper deals with the problem of fuzzy non...
4      4  A number of neural networks can be formulated ...


In [2]:
num_nan = abstracts['abstract'].isna().sum()
print(f"Αριθμός NaN abstracts: {num_nan}")


Αριθμός NaN abstracts: 7249


In [3]:
# Δημιουργούμε μάσκα για όλες τις περιπτώσεις
mask_blank = abstracts['abstract'].isna() | (abstracts['abstract'].str.strip() == '')
total_blank = mask_blank.sum()
print(f"Συνολικός αριθμός papers με κενό abstract: {total_blank}")

blanks = abstracts[mask_blank]
print(blanks.head())


Συνολικός αριθμός papers με κενό abstract: 7249
     paper abstract
76      76      NaN
242    242      NaN
245    245      NaN
249    249      NaN
254    254      NaN


In [4]:
# παράδειγμα υπολογισμού στατιστικών
df = abstracts.copy()
# καθαρισμός: κρατάμε μόνο non-null
docs = df['abstract'].dropna().astype(str)

# 1. μήκη
length_chars = docs.str.len()
length_tokens = docs.str.split().apply(len)

# 2. βασικά στατιστικά
print(length_tokens.describe())
print(length_chars.describe())

# 3. λεξιλόγιο
all_tokens = docs.str.lower().str.split().explode()
vocab = all_tokens.value_counts()
print(f"Vocab size: {vocab.shape[0]}")
print("Top 20 words:\n", vocab.head(20))


count    131250.000000
mean        150.491749
std          54.413726
min          10.000000
25%         113.000000
50%         145.000000
75%         182.000000
max        1463.000000
Name: abstract, dtype: float64
count    131250.000000
mean       1024.288571
std         366.203104
min          76.000000
25%         772.000000
50%         989.000000
75%        1240.000000
max        9678.000000
Name: abstract, dtype: float64
Vocab size: 349687
Top 20 words:
 the     1199390
of       703989
a        527566
and      521693
to       476712
in       405335
is       291181
we       280084
for      241186
that     206492
on       187880
this     177000
with     149538
are      144287
by       127485
as       123955
an       115373
our       97439
from      94102
can       88650
Name: abstract, dtype: int64


In [None]:
import os
import shutil
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# 1. ΟΡΙΣΜΟΣ ΚΑΙ ΚΑΘΑΡΙΣΜΟΣ root folder για nltk_data
NLTK_ROOT = r'D:\NLP\nltk\nltk_data'
# Αν χρειαστεί, ξεσχολιάζεις για καθαρό download
# if os.path.isdir(NLTK_ROOT):
#     shutil.rmtree(NLTK_ROOT)
# os.makedirs(NLTK_ROOT, exist_ok=True)

# 2. Force to look only there
nltk.data.path.clear()
nltk.data.path.append(NLTK_ROOT)
print("NLTK will look in:", nltk.data.path)

# 3. (Όπως πριν) verify that punkt, wordnet, stopwords κ.λπ. έχουν κατέβει
assert os.path.isdir(os.path.join(NLTK_ROOT, 'tokenizers', 'punkt'))
assert os.path.isdir(os.path.join(NLTK_ROOT, 'corpora', 'wordnet'))
assert os.path.isdir(os.path.join(NLTK_ROOT, 'corpora', 'stopwords'))

# 4. Lemmatizer & stopwords
lemmatizer = WordNetLemmatizer()
stopwords = set(nltk.corpus.stopwords.words('english'))

def tokenize_lemmatize_nltk(text: str) -> list[str]:
    tokens = word_tokenize(text.lower())
    return [
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok.isalpha() and tok not in stopwords
    ]

# 5. Δοκιμή tokenizer
print(word_tokenize("This is a quick test."))
print(tokenize_lemmatize_nltk("This is a quick test."))


docs = abstracts['abstract'].dropna().astype(str)

# 7. CountVectorizer μόνο για unigrams
vectorizer = CountVectorizer(
    tokenizer=tokenize_lemmatize_nltk,
    ngram_range=(1,1),
    min_df=2,
    max_df=0.4
)
X_counts = vectorizer.fit_transform(docs)
terms = vectorizer.get_feature_names_out()

# 8. Επισκόπηση vocabulary
print("Unigram vocab size:", len(terms))

# 9. Top-20 πιο συχνά unigrams (raw counts)
freq = np.asarray(X_counts.sum(axis=0)).ravel()
idx = np.argsort(freq)[::-1][:20]
top20 = terms[idx]
top20_counts = freq[idx]

print("Top 20 unigrams by count:")
for term, cnt in zip(top20, top20_counts):
    print(f"  {term:<20} {cnt}")


In [12]:
import os
import shutil
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. ΟΡΙΣΜΟΣ ΚΑΙ ΚΑΘΑΡΙΣΜΟΣ root folder για nltk_data
NLTK_ROOT = r'D:\NLP\nltk\nltk_data'
# Αν υπάρχει ήδη από παλιά, το διαγράφουμε για να βεβαιωθούμε σε καθαρό download
# if os.path.isdir(NLTK_ROOT):
#     shutil.rmtree(NLTK_ROOT)
# os.makedirs(NLTK_ROOT, exist_ok=True)

# 2. Force να ψάχνει μόνο εκεί
nltk.data.path.clear()
nltk.data.path.append(NLTK_ROOT)
print("NLTK will look in:", nltk.data.path)

# 3. Κατέβασμα όλων των απαραίτητων resources
# for pkg in ['punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'stopwords']:
#     nltk.download(pkg, download_dir=NLTK_ROOT)

# 4. Επαλήθευση ότι κατέβηκαν
assert os.path.isdir(os.path.join(NLTK_ROOT, 'tokenizers', 'punkt'))
assert os.path.isdir(os.path.join(NLTK_ROOT, 'tokenizers', 'punkt_tab', 'english'))
assert os.path.isdir(os.path.join(NLTK_ROOT, 'corpora', 'wordnet'))
assert os.path.isdir(os.path.join(NLTK_ROOT, 'corpora', 'omw-1.4'))
assert os.path.isdir(os.path.join(NLTK_ROOT, 'corpora', 'stopwords'))

# 5. Ρύθμιση lemmatizer & stopwords
lemmatizer = WordNetLemmatizer()
stopwords = set(nltk.corpus.stopwords.words('english'))

def tokenize_lemmatize_nltk(text: str) -> list[str]:
    tokens = word_tokenize(text.lower())
    return [
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok.isalpha() and tok not in stopwords
    ]

# 6. Δοκιμή tokenizer
print(word_tokenize("This is a quick test."))
print(tokenize_lemmatize_nltk("This is a quick test."))

# 7. Φόρτωση abstracts
# abstracts = pd.read_csv(
#     r'C:\Users\mysmu\Desktop\Natural Language Processing\nlp-cse-uoi-2025\data_new\abstracts.txt',
#     sep=r'\|\-\-\|', engine='python',
#     header=None, names=['paper_id','abstract']
# )
docs = abstracts['abstract'].dropna().astype(str)

# 8. TF–IDF Vectorizer + fit
vectorizer = TfidfVectorizer(
    tokenizer=tokenize_lemmatize_nltk,
    min_df=2,
    max_df=0.4,
    ngram_range=(1,2),
    sublinear_tf=True
)
X_tfidf = vectorizer.fit_transform(docs)
print("Vocabulary size:", len(vectorizer.vocabulary_))

# 9. Top-20 συχνές λέξεις (χαμηλό IDF)
terms = vectorizer.get_feature_names_out()
idf   = vectorizer.idf_
idx   = np.argsort(idf)
top20 = terms[idx][:20]
print("Top 20 tokens:", top20)


NLTK will look in: ['D:\\NLP\\nltk\\nltk_data']
['This', 'is', 'a', 'quick', 'test', '.']
['quick', 'test']
Vocabulary size: 1100565
Top 20 tokens: ['show' 'algorithm' 'model' 'approach' 'data' 'proposed' 'problem' 'based'
 'using' 'propose' 'present' 'image' 'performance' 'two' 'also' 'new'
 'set' 'learning' 'used' 'information']


In [13]:
top20 = terms[idx][:1000]
print("Top 20 tokens:", top20)

Top 20 tokens: ['show' 'algorithm' 'model' 'approach' 'data' 'proposed' 'problem' 'based'
 'using' 'propose' 'present' 'image' 'performance' 'two' 'also' 'new'
 'set' 'learning' 'used' 'information' 'experiment' 'feature' 'system'
 'novel' 'different' 'one' 'network' 'demonstrate' 'experimental' 'task'
 'technique' 'use' 'application' 'however' 'first' 'analysis' 'work'
 'framework' 'time' 'number' 'experimental result' 'many' 'large'
 'function' 'existing' 'well' 'structure' 'study' 'several'
 'classification' 'accuracy' 'process' 'efficient' 'given'
 'representation' 'datasets' 'training' 'object' 'space' 'order'
 'multiple' 'real' 'important' 'compared' 'effectiveness' 'better'
 'paper present' 'provide' 'neural' 'paper propose' 'term' 'high'
 'parameter' 'solution' 'proposed method' 'local' 'machine' 'case' 'class'
 'point' 'recognition' 'detection' 'neural network' 'result show'
 'improve' 'example' 'linear' 'evaluation' 'address' 'three' 'domain'
 'effective' 'user' 'make' 'error

In [28]:
# Uni-grams: Check Single  Words dist
import os
import shutil
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# 1. ΟΡΙΣΜΟΣ ΚΑΙ ΚΑΘΑΡΙΣΜΟΣ root folder για nltk_data
NLTK_ROOT = r'D:\NLP\nltk\nltk_data'
# Αν χρειαστεί, ξεσχολιάζεις για καθαρό download
# if os.path.isdir(NLTK_ROOT):
#     shutil.rmtree(NLTK_ROOT)
# os.makedirs(NLTK_ROOT, exist_ok=True)

# 2. Force to look only there
nltk.data.path.clear()
nltk.data.path.append(NLTK_ROOT)
print("NLTK will look in:", nltk.data.path)

# 3. (Όπως πριν) verify that punkt, wordnet, stopwords κ.λπ. έχουν κατέβει
assert os.path.isdir(os.path.join(NLTK_ROOT, 'tokenizers', 'punkt'))
assert os.path.isdir(os.path.join(NLTK_ROOT, 'corpora', 'wordnet'))
assert os.path.isdir(os.path.join(NLTK_ROOT, 'corpora', 'stopwords'))

# 4. Lemmatizer & stopwords
lemmatizer = WordNetLemmatizer()
stopwords = set(nltk.corpus.stopwords.words('english'))

def tokenize_lemmatize_nltk(text: str) -> list[str]:
    tokens = word_tokenize(text.lower())
    return [
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok.isalpha() and tok not in stopwords
    ]

# 5. Δοκιμή tokenizer
print(word_tokenize("This is a quick test."))
print(tokenize_lemmatize_nltk("This is a quick test."))

# 6. Φόρτωση abstracts
# abstracts = pd.read_csv(
#     r'C:\Users\mysmu\Desktop\…\abstracts.txt',
#     sep=r'\|\-\-\|', engine='python',
#     header=None, names=['paper_id','abstract']
# )
docs = abstracts['abstract'].dropna().astype(str)

# 7. CountVectorizer μόνο για unigrams
vectorizer = CountVectorizer(
    tokenizer=tokenize_lemmatize_nltk,
    ngram_range=(1,1),
    min_df=2,
    max_df=0.9
)
X_counts = vectorizer.fit_transform(docs)
terms = vectorizer.get_feature_names_out()

# 8. Επισκόπηση vocabulary
print("Unigram vocab size:", len(terms))

# 9. Top-20 πιο συχνά unigrams (raw counts)
freq = np.asarray(X_counts.sum(axis=0)).ravel()
idx = np.argsort(freq)[::-1][:20]
top20 = terms[idx]
top20_counts = freq[idx]

print("Top 20 unigrams by count:")
for term, cnt in zip(top20, top20_counts):
    print(f"  {term:<20} {cnt}")


NLTK will look in: ['D:\\NLP\\nltk\\nltk_data']
['This', 'is', 'a', 'quick', 'test', '.']
['quick', 'test']
Unigram vocab size: 38891
Top 20 unigrams by count:
  method               131214
  model                116284
  data                 108067
  algorithm            104135
  image                104101
  paper                82270
  approach             79351
  problem              78550
  result               76744
  proposed             73797
  feature              66931
  based                62904
  learning             61874
  using                61691
  show                 61262
  network              59537
  system               53916
  set                  48399
  propose              47718
  information          47252


In [25]:
# 8. Επισκόπηση vocabulary
print("Unigram vocab size:", len(terms))

# 9. Top-20 πιο συχνά unigrams (raw counts)
freq = np.asarray(X_counts.sum(axis=0)).ravel()
idx = np.argsort(freq)[::-1]
top20 = terms[idx]
top20_counts = freq[idx]

print("Top 20 unigrams by count:")
for term, cnt in zip(top20, top20_counts):
    print(f"  {term:<20} {cnt}")


Unigram vocab size: 38863
Top 20 unigrams by count:
  network              59537
  object               36136
  task                 35575
  time                 33490
  technique            33170
  one                  31541
  function             31533
  framework            31367
  classification       31213
  analysis             30128
  application          28772
  use                  27969
  structure            27093
  number               26557
  representation       26144
  demonstrate          26120
  work                 25833
  user                 25495
  experimental         24916
  video                24888
  first                24862
  large                24304
  training             24272
  however              24162
  detection            23372
  graph                23316
  space                23152
  process              22806
  recognition          22664
  query                22558
  many                 22475
  neural               22312
  existing          

In [41]:
# Bi-grams: Check two sequencial words/phrase dist
import os
import shutil
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# 1. ΟΡΙΣΜΟΣ root folder για nltk_data (όπως πριν)
NLTK_ROOT = r'D:\NLP\nltk\nltk_data'
nltk.data.path.clear()
nltk.data.path.append(NLTK_ROOT)

# 2. Lemmatizer & stopwords
lemmatizer = WordNetLemmatizer()
stopwords = set(nltk.corpus.stopwords.words('english'))

def tokenize_lemmatize_nltk(text: str) -> list[str]:
    tokens = word_tokenize(text.lower())
    return [
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok.isalpha() and tok not in stopwords
    ]

# 3. Φόρτωση abstracts
# abstracts = pd.read_csv(
#     r'C:\path\to\abstracts.txt',
#     sep=r'\|\-\-\|', engine='python',
#     header=None, names=['paper_id','abstract']
# )
# docs = abstracts['abstract'].dropna().astype(str)
# Εδώ υποθέτουμε `docs` έτοιμο ως pd.Series ή list[str].

# 4. Fit CountVectorizer μόνο για bigrams
min_df_bi = 20    # π.χ. κρατάμε bigrams που εμφανίζονται σε ≥5 docs
max_df_bi = 0.7  # αφαιρούμε bigrams που εμφανίζονται σε >40% των docs

cv_bi = CountVectorizer(
    tokenizer=tokenize_lemmatize_nltk,
    ngram_range=(2,2),
    min_df=min_df_bi,
    max_df=max_df_bi
)
X_bi = cv_bi.fit_transform(docs)

# 5. Vocabulary & μέγεθος
bi_terms = cv_bi.get_feature_names_out()
print(f"Bigram vocab size (min_df={min_df_bi}, max_df={max_df_bi}): {len(bi_terms)}")

# 6. Top-20 bigrams by raw count
freq_bi = np.asarray(X_bi.sum(axis=0)).ravel()
idx_top_bi = np.argsort(freq_bi)[::-1][:1000]
top20_bigrams = bi_terms[idx_top_bi]
top20_counts  = freq_bi[idx_top_bi]

print("\nTop-20 bigrams by count:")
for phrase, cnt in zip(top20_bigrams, top20_counts):
    print(f"  {phrase:<30} {cnt}")


Bigram vocab size (min_df=20, max_df=0.7): 65940

Top-20 bigrams by count:
  experimental result            19838
  neural network                 18374
  proposed method                15552
  paper present                  13765
  paper propose                  13451
  data set                       12945
  result show                    12028
  propose novel                  7731
  machine learning               6599
  proposed approach              6471
  show proposed                  6281
  proposed algorithm             6258
  learning algorithm             5785
  propose new                    5108
  social network                 4869
  training data                  4727
  support vector                 4719
  paper proposes                 4677
  computer vision                4671
  extensive experiment           4471
  effectiveness proposed         4440
  present novel                  4414
  demonstrate effectiveness      4306
  result demonstrate             4236
  expe

In [42]:
# 5. Vocabulary & μέγεθος
bi_terms = cv_bi.get_feature_names_out()
print(f"Bigram vocab size (min_df={min_df_bi}, max_df={max_df_bi}): {len(bi_terms)}")

# 6. Top-20 bigrams by raw count
freq_bi = np.asarray(X_bi.sum(axis=0)).ravel()
idx_top_bi = np.argsort(freq_bi)[::-1][:1000]
top20_bigrams = bi_terms[idx_top_bi]
top20_counts  = freq_bi[idx_top_bi]

print("\nTop-20 bigrams by count:")
for phrase, cnt in zip(top20_bigrams, top20_counts):
    print(f"  {phrase:<30} {cnt}")


Bigram vocab size (min_df=20, max_df=0.7): 65940

Top-20 bigrams by count:
  experimental result            19838
  neural network                 18374
  proposed method                15552
  paper present                  13765
  paper propose                  13451
  data set                       12945
  result show                    12028
  propose novel                  7731
  machine learning               6599
  proposed approach              6471
  show proposed                  6281
  proposed algorithm             6258
  learning algorithm             5785
  propose new                    5108
  social network                 4869
  training data                  4727
  support vector                 4719
  paper proposes                 4677
  computer vision                4671
  extensive experiment           4471
  effectiveness proposed         4440
  present novel                  4414
  demonstrate effectiveness      4306
  result demonstrate             4236
  expe

In [43]:
# … μετά το μπλοκ που έχεις ήδη υπολογίσει `X_bi`, `bi_terms` και `freq_bi`:

# 1. Βρίσκουμε τους 5 000 πιο σπάνιους δείκτες (χαμηλότερη count)
idx_rare_bi = np.argsort(freq_bi)
rare5000_bigrams = bi_terms[idx_rare_bi]
rare5000_counts  = freq_bi[idx_rare_bi]

# 2. Εκτύπωση σε μορφή “phrase   count”
print("\n5000 rarest bigrams by count:")
for phrase, cnt in zip(rare5000_bigrams, rare5000_counts):
    print(f"  {phrase:<30} {cnt}")




5000 rarest bigrams by count:
  since information              20
  engineering paper              20
  kind constraint                20
  via image                      20
  available application          20
  class impulsive                20
  function considers             20
  tracking still                 20
  via latent                     20
  enhance effectiveness          20
  relationship one               20
  enhance prediction             20
  svm also                       20
  class far                      20
  svm experiment                 20
  class even                     20
  enhancement proposed           20
  via em                         20
  enhancement result             20
  implemented framework          20
  class markovian                20
  employed identify              20
  function energy                20
  function empirical             20
  class respectively             20
  enables better                 20
  enables joint                  

In [44]:
## Save total corpus for unigram/bigram

In [45]:
# unigrams
min_df_uni = 2 
max_df_uni = 0.8 

cv_uni = CountVectorizer(tokenizer=tokenize_lemmatize_nltk,
                         ngram_range=(1,1),
                         min_df=min_df_uni,
                         max_df=max_df_uni)
cv_uni.fit(docs)
uni_vocab = cv_uni.get_feature_names_out()

# bigrams
min_df_bi = 20
max_df_bi = 0.7

cv_bi = CountVectorizer(tokenizer=tokenize_lemmatize_nltk,
                        ngram_range=(2,2),
                        min_df=min_df_bi,
                        max_df=max_df_bi)
cv_bi.fit(docs)
bi_vocab = cv_bi.get_feature_names_out()


In [47]:
# combine and save the vocabulary 
import json

combined_vocab = {t:i for i,t in enumerate(uni_vocab)}
offset = len(uni_vocab)
for j,t in enumerate(bi_vocab):
    combined_vocab[t] = offset + j

    import json
with open('D:/NLP/citation_link_prediction/corpus_uni_bi_grams_filtered.json','w',encoding='utf-8') as f:
    json.dump(combined_vocab, f, ensure_ascii=False, indent=2)



In [48]:
# compute tfidf scores based on the new corpus vocabulary 

import json

with open('D:/NLP/citation_link_prediction/corpus_uni_bi_grams_filtered.json','r',encoding='utf-8') as f:
    combined_vocab = json.load(f)

# Fit tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    tokenizer=tokenize_lemmatize_nltk,
    vocabulary=combined_vocab,
    sublinear_tf=True,
    norm='l2'
)


In [51]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Φόρτωση abstracts
abstracts = pd.read_csv(
    r'C:\Users\mysmu\Desktop\Natural Language Processing\nlp-cse-uoi-2025\data_new\abstracts.txt',
    sep=r'\|\-\-\|',
    engine='python',
    header=None,
    names=['paper','abstract']
)

# 2. Χειρισμός κενών abstracts: αντικατάσταση NaN με κενές συμβολοσειρές
abstracts['abstract'] = abstracts['abstract'].fillna('')

# 3. Προετοιμασία custom tokenizer (όπως πριν)
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()
stopwords = set(nltk.corpus.stopwords.words('english'))

def tokenize_lemmatize_nltk(text: str) -> list[str]:
    tokens = word_tokenize(text.lower())
    return [
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok.isalpha() and tok not in stopwords
    ]

# 4. Φτιάχνουμε και τρέχουμε τον TfidfVectorizer
tfidf = TfidfVectorizer(
    tokenizer=tokenize_lemmatize_nltk,
    vocabulary=combined_vocab,
    sublinear_tf=True,
    norm='l2'
)

# fit_transform στο σύνολο abstracts
X_tfidf = tfidf.fit_transform( abstracts['abstract'] )

# 5. Έξοδος: X_tfidf είναι μια sparse μήτρα με ένα row ανά abstract
print("TF–IDF matrix shape:", X_tfidf.shape)

# 6. Παράδειγμα: vector για το πρώτο abstract
first_vec = X_tfidf[0]         # sparse row
print(first_vec)

# 7. (Προαιρετικό) Αν θέλεις pandas DataFrame για λίγες πρώτες εγγραφές:
#   df_vec = pd.DataFrame(X_tfidf[:5].toarray(),
#                         columns=vectorizer.get_feature_names_out())
#   print(df_vec)






TF–IDF matrix shape: (138499, 104831)
  (0, 184)	0.1302125706481994
  (0, 265)	0.09409089286143174
  (0, 305)	0.09485930007696194
  (0, 695)	0.476775251404947
  (0, 768)	0.14249103358814627
  (0, 948)	0.0790359251380271
  (0, 1508)	0.04741254257916617
  (0, 1816)	0.1428868303331532
  (0, 1911)	0.09394354568159355
  (0, 1935)	0.07542455075337563
  (0, 2221)	0.08868474895485955
  (0, 3692)	0.11929418950838488
  (0, 4010)	0.10206007464476624
  (0, 4051)	0.11923890643788443
  (0, 5430)	0.1997973339639439
  (0, 5795)	0.055930769137243666
  (0, 5889)	0.09387596719586801
  (0, 7514)	0.0736628762774124
  (0, 8131)	0.0460990085348702
  (0, 8330)	0.09423952693249212
  (0, 8437)	0.08907869373942813
  (0, 8475)	0.07991850969389314
  (0, 10142)	0.05610455894992136
  (0, 10808)	0.09271380685546378
  (0, 11163)	0.06406719318216832
  :	:
  (0, 19757)	0.19110385168225627
  (0, 20222)	0.0820347430820776
  (0, 20871)	0.08301540872386294
  (0, 20879)	0.07182350172549834
  (0, 23583)	0.09066853747107027
  

In [52]:
# SVD for Dim reduction (104k -> 300 d)

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

# Θέλουμε π.χ. 300 διαστάσεις
n_components = 300
svd = TruncatedSVD(n_components=n_components, random_state=42)
normalizer = Normalizer(copy=False)           # l2-normalize μετά το SVD
lsa_pipeline = make_pipeline(svd, normalizer)

# Χτίζουμε X_lsa: shape = (n_docs, 300)
X_lsa = lsa_pipeline.fit_transform(X_tfidf)
print("LSA matrix shape:", X_lsa.shape)


LSA matrix shape: (138499, 300)


In [53]:
import numpy as np

# Αποθήκευση σε .npy
np.save('D:/NLP/citation_link_prediction/abstracts_embeds.npy', X_lsa)