In [15]:
import pandas as pd

In [16]:
# Load the papers dataset from 'papers.csv' (path is relative to the working directory).
df = pd.read_csv('papers.csv')

In [17]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [18]:
# (rows, columns) of the full dataset as loaded.
df.shape

(7241, 7)

In [19]:
# Keep only the first 5000 papers (all columns); the row labels 0..4999 are preserved.
# NOTE(review): presumably done to bound preprocessing time — confirm.
df = df.head(5000)

In [20]:
# Confirm the truncation: the row count should now be 5000.
df.shape

(5000, 7)

In [21]:
# Inspect the raw text of the paper stored under row label 0.
df.loc[0, 'paper_text']

'767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABASE\nAND ITS APPLICATIONS\nHisashi Suzuki and Suguru Arimoto\nOsaka University, Toyonaka, Osaka 560, Japan\nABSTRACT\nAn efficient method of self-organizing associative databases is proposed together with\napplications to robot eyesight systems. The proposed databases can associate any input\nwith some output. In the first half part of discussion, an algorithm of self-organization is\nproposed. From an aspect of hardware, it produces a new style of neural network. In the\nlatter half part, an applicability to handwritten letter recognition and that to an autonomous\nmobile robot system are demonstrated.\n\nINTRODUCTION\nLet a mapping f : X -+ Y be given. Here, X is a finite or infinite set, and Y is another\nfinite or infinite set. A learning machine observes any set of pairs (x, y) sampled randomly\nfrom X x Y. (X x Y means the Cartesian product of X and Y.) And, it computes some\nestimate j : X -+ Y of f to make small, the estimation erro

In [None]:
# steps to do
# Lower case
# Remove HTML tags
# Remove special characters and digits
# tokenization
# remove stopwords
# remove words of three letters or fewer
# stem (Porter stemmer)

In [22]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [27]:
## creating a list of custom stopwords
# Start from NLTK's English stop words and add corpus-specific filler terms:
# figure references, generic verbs/adjectives and spelled-out numbers.
stop_words = set(stopwords.words('english'))
new_words = [
    "fig", "figure", "image", "sample", "using", "show", "result", "large", "also",
    # "six" was missing from the original one..nine sequence; added for consistency.
    "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
]
# Kept as a list so the name has the same type as before for any downstream users.
stop_words = list(stop_words.union(new_words))

In [44]:
def preprocessing_text(txt):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. replace HTML-like tags (``<...>`` on a single line) with a space;
      3. replace every non-letter character (digits, punctuation, newlines)
         with a space;
      4. tokenize with ``nltk.word_tokenize``;
      5. drop tokens found in the module-level ``stop_words`` and tokens of
         three characters or fewer;
      6. reduce each remaining token to its Porter stem.

    Parameters
    ----------
    txt : str
        Raw document text.

    Returns
    -------
    str
        The surviving stems joined by single spaces (empty string if none).
    """
    # `stop_words` is a list at module level; a set makes each membership test
    # O(1) instead of scanning the whole list for every token.
    stop_set = set(stop_words)

    txt = txt.lower()
    # remove HTML tags ('.' does not match newlines, so tags must sit on one line)
    txt = re.sub(r"<.*?>", " ", txt)
    # remove special characters and digits
    txt = re.sub(r'[^a-zA-Z]',' ',txt)

    # Stop words are filtered BEFORE stemming, so only exact surface forms match.
    stemmer = PorterStemmer()
    kept = [
        token
        for token in nltk.word_tokenize(txt)
        if token not in stop_set and len(token) > 3
    ]
    return ' '.join(stemmer.stem(token) for token in kept)

In [45]:
# Smoke-test the cleaner on a string mixing digits, symbols, HTML tags and stop words.
preprocessing_text('This is  78909 %#$^ Python <h1> <p>hello world</p> </h1>')

'python hello world'

In [47]:
# Clean every paper's full text; the result keeps the same index as `df`.
docs = df['paper_text'].apply(preprocessing_text)

In [48]:
# Inspect the cleaned text stored under label 0 to sanity-check the preprocessing.
docs[0]

'self organ associ databas applic hisashi suzuki suguru arimoto osaka univers toyonaka osaka japan abstract effici method self organ associ databas propos togeth applic robot eyesight system propos databas associ input output first half part discuss algorithm self organ propos aspect hardwar produc style neural network latter half part applic handwritten letter recognit autonom mobil robot system demonstr introduct map given finit infinit anoth finit infinit learn machin observ pair sampl randomli mean cartesian product comput estim make small estim error measur usual faster decreas estim error increas number sampl better learn machin howev express perform incomplet sinc lack consider candid assum preliminarili find good learn machin clarifi concept discuss type learn machin advanc understand self organ associ databas paramet type ordinari type learn machin assum equat relat paramet indefinit name structur equival defin implicitli candid subset map comput valu paramet base observ sampl

In [49]:
from sklearn.feature_extraction.text import CountVectorizer
# Count unigrams, bigrams and trigrams. Terms appearing in more than 95% of the
# documents are ignored, and the vocabulary is capped at 5000 terms.
cv = CountVectorizer(
    ngram_range=(1, 3),
    max_df=0.95,
    max_features=5000,
)
word_count_vectors = cv.fit_transform(docs)

In [None]:
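# The tf-idf transformation is a widely used NLP technique for representing how important a term is to a document within a collection. It combines the term frequency (tf), which measures how often a term appears in a document, with the inverse document frequency (idf), which down-weights terms that appear in many documents.
# smooth_idf=True: adds one to every document frequency (as if one extra document contained every term), which prevents division by zero.
# use_idf=True: enables inverse-document-frequency reweighting, so terms that occur in fewer documents (likely keywords) receive higher weights.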

In [50]:
from sklearn.feature_extraction.text import TfidfTransformer
# Learn the IDF weights from the corpus term counts.
# fit() returns the transformer itself, so the fitted object is bound directly to
# `tfidf_transformer` (the name used by get_keywords and the pickle export) instead
# of to the misspelled alias `tdidf_transformer` that the original created.
tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True).fit(word_count_vectors)

In [51]:
# Vocabulary terms of the fitted vectorizer; get_keywords indexes this array with
# the column indices of the TF-IDF matrix to recover the term for each score.
feature_names = cv.get_feature_names_out()

In [56]:
def get_keywords(idx,docs, topN=10):
    """Return the highest-scoring TF-IDF terms for one document.

    Uses the module-level fitted ``cv`` (CountVectorizer), ``tfidf_transformer``
    and ``feature_names``.

    Parameters
    ----------
    idx
        Key used to look the document up in ``docs`` (``docs[idx]``).
    docs
        Indexable collection of preprocessed document strings.
    topN : int, default 10
        Maximum number of keywords to return.

    Returns
    -------
    dict
        Maps each term to its TF-IDF score rounded to 3 decimals, inserted in
        descending score order (ties broken by descending vocabulary index).
    """
    # TF-IDF weights of this single document, as (column, value) coordinates.
    doc_tfidf = tfidf_transformer.transform(cv.transform([docs[idx]])).tocoo()

    # Rank the non-zero entries by score, then by column index, highest first.
    ranked = sorted(
        zip(doc_tfidf.col, doc_tfidf.data),
        key=lambda entry: (entry[1], entry[0]),
        reverse=True,
    )

    # Distinct loop names avoid shadowing the `idx` parameter, as the original did.
    return {
        feature_names[col]: round(score, 3)
        for col, score in ranked[:topN]
    }

    # keyword extraction
def print_keywords(idx,keywords,df):
    """Print the title, abstract and keyword scores of one paper.

    Parameters
    ----------
    idx
        Row label of the paper in ``df``.
    keywords : dict
        Mapping of keyword -> score, printed one pair per line in iteration order.
    df
        Frame with ``'title'`` and ``'abstract'`` columns.
    """
    # Each section is a banner line followed by the matching column value.
    for column, banner in (
        ('title', "\n=======title========"),
        ('abstract', '\n=======abstract====='),
    ):
        print(banner)
        print(df[column][idx])

    print('\n=======keywords=====')
    for term, score in keywords.items():
        print(term, score)
        
# Demo: extract and display the keywords for the paper at row label 1.
idx = 1
keywords = get_keywords(idx=idx, docs=docs)
print_keywords(idx=idx, keywords=keywords, df=df)


A Mean Field Theory of Layer IV of Visual Cortex and Its Application to Artificial Neural Networks

Abstract Missing

cell 0.492
cortic 0.391
synaps 0.366
mean field 0.253
network 0.214
mean field approxim 0.165
field approxim 0.162
activ 0.144
field 0.129
layer 0.117


In [58]:
import pickle
# Persist the fitted vectorizer, the TF-IDF transformer and the vocabulary.
# Context managers guarantee each file is flushed and closed; the original passed
# bare open(...) handles that were never closed.
with open('count_vectorizer.pkl','wb') as f:
    pickle.dump(cv, f)
with open('tfidf_transformer.pkl','wb') as f:
    pickle.dump(tfidf_transformer, f)
with open('feature_names.pkl','wb') as f:
    pickle.dump(feature_names, f)