In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [45]:
df = pd.read_csv('papers.csv')

In [5]:
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [6]:
df.shape

(7241, 7)

In [46]:
df = df.iloc[:5000,:]

In [47]:
df.shape

(5000, 7)

## PREPROCESSING

#### Steps to do
1. Convert to lower case
2. Remove HTML tags
3. Remove special characters and digits
4. Tokenize (convert the string to a list of words)
5. Remove stopwords
6. Remove words shorter than three letters
7. Lemmatize

In [48]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [49]:
stop_words = set(stopwords.words('english'))

In [50]:
#creating new stopwords
# Extra stop words added on top of NLTK's English list: figure/caption
# vocabulary, a few generic filler words, and the spelled-out numbers one
# through nine ("six" was missing from the original run of numbers).
new_stop_words = ["fig","figure","image","sample","using", 
             "show", "result", "large", 
             "also", "one", "two", "three", 
             "four", "five", "six", "seven","eight","nine"]

In [51]:
# Merge the NLTK stop words with the custom ones. Wrapping the left-hand side in
# set() keeps this cell re-runnable (a list has no .union), and leaving the
# result as a set keeps the `word not in stop_words` checks in preprocessing O(1).
stop_words = set(stop_words).union(new_stop_words)

In [16]:
#len(stop_words)

215

In [44]:
from nltk.stem.wordnet import WordNetLemmatizer

In [54]:
# Fetch the WordNet corpus that WordNetLemmatizer reads from.
# NOTE(review): stopwords.words('english') and nltk.word_tokenize() presumably
# need their own NLTK resources ('stopwords', 'punkt'); no download for them is
# visible in this notebook — confirm it still runs on a fresh environment.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...


True

In [55]:
def preprocessing(txt):
    """Normalise one raw document into a space-separated string of lemmas.

    Steps, in order: lower-case the text, replace HTML tags and every
    non-letter character with a space, tokenize, drop stop words and tokens
    shorter than three characters, lemmatize, then drop any lemma that is
    itself a stop word.

    Parameters
    ----------
    txt : str
        Raw document text. Any non-string value (for example the float NaN
        that pandas uses for a missing cell) is treated as an empty document.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; "" if nothing survives.
    """
    # A missing cell is not a str and has no .lower(); treat it as empty text.
    if not isinstance(txt, str):
        return ""

    # Membership tests run once per token, so make sure they hit a set even if
    # the module-level stop_words happens to be a list.
    stop_set = set(stop_words)

    # Lower case
    txt = txt.lower()
    # Remove HTML tags
    txt = re.sub(r"<.*?>", " ", txt)
    # Remove special characters and digits
    txt = re.sub(r"[^a-zA-Z]", " ", txt)
    # Tokenization
    tokens = nltk.word_tokenize(txt)
    # Remove stopwords and words shorter than three letters
    tokens = [word for word in tokens if word not in stop_set and len(word) >= 3]
    # Lemmatize
    lmtr = WordNetLemmatizer()
    lemmas = [lmtr.lemmatize(word) for word in tokens]
    # Filter stop words again: inflected forms such as "figures" or "results"
    # only match the stop list once they have been reduced to their lemma.
    lemmas = [word for word in lemmas if word not in stop_set]

    return " ".join(lemmas)

In [22]:
#preprocessing(df.loc[0,'paper_text'])

In [56]:
# Clean every paper's full text; the function is passed directly, no lambda needed.
docs = df['paper_text'].apply(preprocessing)

In [57]:
docs.shape

(5000,)

In [58]:
docs

0       self organization associative database applica...
1       mean field theory layer visual cortex applicat...
2       storing covariance associative long term poten...
3       bayesian query construction neural network mod...
4       neural network ensemble cross validation activ...
                              ...                        
4995    low rank time frequency synthesis matthieu kow...
4996    state space model decoding auditory attentiona...
4997    efficient structured matrix rank minimization ...
4998    cient minimax signal detection graph jing qian...
4999    signal aggregate constraint additive factorial...
Name: paper_text, Length: 5000, dtype: object

#### Count Vectorizer

In [59]:
# Bag-of-words counts over the cleaned documents.
#   max_df=0.95        -> ignore terms that occur in more than 95% of documents
#   max_features=5000  -> keep only the 5000 most frequent terms
#   ngram_range=(1,3)  -> count unigrams, bigrams and trigrams
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(
    max_df=0.95,
    max_features=5000,
    ngram_range=(1, 3),
)
word_count_vectors = cv.fit_transform(docs)

#### Tf-Idf Transformer

In [62]:
from sklearn.feature_extraction.text import TfidfTransformer
# Learn the IDF weights from the count matrix; fit() returns the fitted
# transformer itself, so construction and fitting can be chained.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True).fit(word_count_vectors)

In [63]:
tfidf_transformer

## EXTRACTING KEYWORDS

In [64]:
len(cv.get_feature_names_out())

5000

In [65]:
feature_names = cv.get_feature_names_out()

#The COO (Coordinate Format) sparse matrix stores non-zero elements as triplets: a list of row indices, a list of corresponding column indices, and a list of the actual data

#The .tocoo() method converts an existing sparse matrix (such as those in CSR or CSC format) into the COOrdinate (COO) format. 

In [66]:
## getting keywords out of any given text

def get_keywords(idx,docs,topN=10):
    """Return the highest-scoring TF-IDF terms of one document.

    The document ``docs[idx]`` is vectorised with the module-level ``cv`` and
    weighted with the module-level ``tfidf_transformer``; term names come from
    the module-level ``feature_names``.

    Parameters
    ----------
    idx : key used to look the document up in ``docs``.
    docs : indexable collection of preprocessed document strings.
    topN : maximum number of keywords to return (default 10).

    Returns
    -------
    dict
        Term -> TF-IDF score rounded to three decimals, inserted from the
        highest score to the lowest; equal scores are ordered by descending
        column index.
    """
    # Single-row TF-IDF matrix, converted to COO so that the non-zero column
    # indices and their values can be read as two parallel arrays.
    tfidf_row = tfidf_transformer.transform(cv.transform([docs[idx]])).tocoo()

    # Rank (column, score) pairs: primary key is the score, secondary the column.
    ranked = sorted(
        zip(tfidf_row.col, tfidf_row.data),
        key=lambda pair: (pair[1], pair[0]),
        reverse=True,
    )

    # Keep the best topN and translate column indices into term strings.
    return {feature_names[col]: round(weight, 3) for col, weight in ranked[:topN]}

In [67]:
def print_results(idx,keywords, df):
    """Print the title, abstract and extracted keywords for one paper.

    Parameters
    ----------
    idx : key of the paper within ``df['title']`` and ``df['abstract']``.
    keywords : mapping of keyword -> score; printed one pair per line.
    df : table-like object exposing ``'title'`` and ``'abstract'`` columns.
    """
    # Each section is a banner line followed by the matching column value.
    sections = (
        ("\n=====Title=====", 'title'),
        ("\n=====Abstract=====", 'abstract'),
    )
    for banner, column in sections:
        print(banner)
        print(df[column][idx])

    print("\n===Keywords===")
    for term, weight in keywords.items():
        print(term, weight)

In [68]:
idx=4995
keywords=get_keywords(idx, docs)
print_results(idx,keywords, df)


=====Title=====
Low-Rank Time-Frequency Synthesis

=====Abstract=====
Many single-channel signal decomposition techniques rely on a low-rank factorization of a time-frequency transform. In particular, nonnegative matrix factorization (NMF) of the spectrogram -- the (power) magnitude of the short-time Fourier transform (STFT) -- has been considered in many audio applications. In this setting, NMF with the Itakura-Saito divergence was shown to underly a generative Gaussian composite model (GCM) of the STFT, a step forward from more empirical approaches based on ad-hoc transform and divergence specifications. Still, the GCM is not yet a generative model of the raw signal itself, but only of its STFT. The work presented in this paper fills in this ultimate gap by proposing a novel signal synthesis model with low-rank time-frequency structure. In particular, our new approach opens doors to multi-resolution representations, that were not possible in the traditional NMF setting. We describe 

In [69]:
#DUMP NECESSARY FILES
import pickle

# Persist the fitted transformer, the fitted vectorizer and the vocabulary.
# Each file is opened in a `with` block so it is flushed and closed even if
# pickling raises; the original left all three handles open.
for artifact, path in (
    (tfidf_transformer, 'tfidf_transformer.pkl'),
    (cv, 'count_vectorizer.pkl'),
    (feature_names, 'feature_names.pkl'),
):
    with open(path, 'wb') as fh:
        pickle.dump(artifact, fh)