In [None]:
import pandas as pd

In [None]:
# Load the papers dataset into a DataFrame.
# NOTE(review): hard-coded absolute path (looks like a Colab runtime location) — confirm, or make it configurable.
df = pd.read_csv('/content/sample_data/papers.csv')

In [None]:
# Only the last expression of a cell is rendered, so the result of df.head()
# is discarded here and the bare `df` below is what actually gets displayed.
df.head()
df

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."
...,...,...,...,...,...,...,...
7236,994,1994,Single Transistor Learning Synapses,,994-single-transistor-learning-synapses.pdf,Abstract Missing,Single Transistor Learning Synapses\n\nPaul Ha...
7237,996,1994,"Bias, Variance and the Combination of Least Sq...",,996-bias-variance-and-the-combination-of-least...,Abstract Missing,"Bias, Variance and the Combination of\nLeast S..."
7238,997,1994,A Real Time Clustering CMOS Neural Engine,,997-a-real-time-clustering-cmos-neural-engine.pdf,Abstract Missing,A Real Time Clustering CMOS\nNeural Engine\nT....
7239,998,1994,Learning direction in global motion: two class...,,998-learning-direction-in-global-motion-two-cl...,Abstract Missing,Learning direction in global motion: two\nclas...


In [None]:
# (rows, columns) of the frame as loaded.
df.shape

(7241, 7)

In [None]:
# Keep only the first 5000 rows (positional slice, all columns); every later
# cell works on this subset.
# NOTE(review): presumably done to bound preprocessing time — confirm.
df = df.iloc[:5000,:]

In [None]:
# Check the row count after the truncation.
df.shape

(5000, 7)

In [None]:
import re
import nltk
# Fetch the NLTK resources used below: the stop-word corpus (read via
# stopwords.words) and the 'punkt' tokenizer data (for nltk.word_tokenize).
# NOTE(review): recent NLTK releases may look up 'punkt_tab' instead of
# 'punkt' for word_tokenize — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# NLTK's English stop words, held as a set.
stop_words = set(stopwords.words('english'))

Creating a list of custom stop words to remove in addition to NLTK's English stop words

In [None]:
# Extra stop words on top of NLTK's English list: figure/caption vocabulary,
# generic filler terms, and the number words one..nine ("six" was missing
# from the original sequence).
new_words = [
  "fig","figure","image","sample","using","show","result","large","also",
  "one","two","three","four","five","six","seven","eight","nine",
]

In [None]:
# Merge the custom terms into the stop-word collection.
# stop_words is rebound from a set to a list here, so wrapping it in set()
# keeps the cell re-runnable: on a second run the original would call
# .union() on a list and raise AttributeError.
stop_words = list(set(stop_words).union(new_words))

In [None]:
def preprocess_text(text):
  """Normalise raw text into a space-joined string of stemmed tokens.

  Steps, in order: lowercase; replace ``<...>`` spans with a space; replace
  every non-letter character with a space; tokenize with NLTK; drop stop
  words and tokens shorter than three characters; Porter-stem what is left.

  Stop words are matched before stemming, so an inflected form of a stop
  word that is not itself listed passes through and is stemmed.

  Args:
    text: Raw document text. Non-string values (e.g. None or a NaN from a
      missing cell) are treated as empty input.

  Returns:
    A single string of stemmed tokens separated by spaces ('' when nothing
    survives filtering).
  """
  if not isinstance(text, str):
    return ''
  # stop_words may be a list at this point; build a set once per call so the
  # per-token membership test is O(1) instead of a linear scan.
  stop_set = set(stop_words)
  cleaned = re.sub(r'<.*?>',' ',text.lower())
  cleaned = re.sub(r'[^a-zA-Z]',' ',cleaned)
  stemmer = PorterStemmer()
  stems = [
    stemmer.stem(token)
    for token in nltk.word_tokenize(cleaned)
    if len(token) >= 3 and token not in stop_set
  ]
  return ' '.join(stems)

In [None]:
# Smoke check on a string with mixed case, digits, punctuation and HTML tags.
preprocess_text("tHiS iS to learning and understanding 4545 %$# PyThoN <h1><p> web language</p></h1>")

'learn understand python web languag'

In [None]:
# Clean every paper body; the result is a Series that keeps df's index.
docs = df['paper_text'].apply(preprocess_text)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the cleaned documents:
#   max_df       - ignore terms whose document frequency exceeds 95%
#   max_features - cap the vocabulary at 5000 terms
#   ngram_range  - unigrams, bigrams and trigrams
cv = CountVectorizer(
  max_df=0.95,
  max_features=5000,
  ngram_range=(1, 3),
)
word_count_vector = cv.fit_transform(docs)

In [24]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn IDF weights from the count matrix; the transformer is applied to
# individual documents later.
tfidfTransformer = TfidfTransformer(
  use_idf=True,
  smooth_idf=True,
)
tfidfTransformer.fit(word_count_vector)

In [28]:
# Vocabulary terms of the fitted vectorizer; get_keywords indexes this with
# the column ids of a transformed document to recover the term strings.
feature_names = cv.get_feature_names_out()

In [30]:
def get_keywords(idx,docs,top_n=10):
  """Return the highest-scoring TF-IDF terms for one document.

  The document ``docs[idx]`` is vectorized with the fitted ``cv`` and
  weighted with the fitted ``tfidfTransformer``; its non-zero terms are
  ranked by score (ties broken by column id, both descending).

  Args:
    idx: Key used to look the document up in ``docs`` (``docs[idx]``).
    docs: Indexable collection of preprocessed document strings.
    top_n: Maximum number of keywords to return (default 10, the value
      that was previously hard-coded). Must be non-negative.

  Returns:
    dict mapping term -> TF-IDF score rounded to 3 decimals, in descending
    score order. Holds fewer than ``top_n`` entries when the document has
    fewer non-zero terms.

  Raises:
    ValueError: if ``top_n`` is negative.
  """
  if top_n < 0:
    raise ValueError("top_n must be non-negative")
  # COO format exposes parallel .col / .data arrays for the non-zero entries.
  tfidf_row = tfidfTransformer.transform(cv.transform([docs[idx]])).tocoo()
  ranked = sorted(
    zip(tfidf_row.col,tfidf_row.data),
    key = lambda item: (item[1],item[0]),
    reverse=True,
  )
  # Distinct loop names: the original reused `idx` here and shadowed the
  # document key passed in by the caller.
  return {feature_names[col]: round(score,3) for col, score in ranked[:top_n]}

def print_keywords(idx,keywords,idf):
  """Print the title, abstract and keyword scores for one paper.

  Args:
    idx: Key used to look up the 'title' and 'abstract' columns of the
      module-level ``df``.
    keywords: Mapping of keyword -> score; printed one pair per line.
    idf: Not used by this function; kept so existing calls keep working.
  """
  headed_columns = (
    ("\n--------Title-------", 'title'),
    ("\n--------Abstract-------", 'abstract'),
  )
  for header, column in headed_columns:
    print(header)
    print(df[column][idx])
  print("\n--------Keywords-------")
  for term, score in keywords.items():
    print(term,score)

# Demo: extract and print the keywords for the paper at index label 200.
idx = 200
keywords = get_keywords(idx,docs)
# The third argument is not used by print_keywords.
print_keywords(idx,keywords,docs)



--------Title-------
Balancing Between Bagging and Bumping

--------Abstract-------
Abstract Missing

--------Keywords-------
gener error 0.402
bump 0.395
bootstrap 0.339
network 0.26
bag 0.25
error 0.241
balanc 0.201
valid 0.187
valid set 0.151
train 0.13


In [None]:
#creating the pickle files
import pickle

# The mode belongs to open(), not pickle.dump(): the original called
# open(path) (read mode) and passed 'wb' to pickle.dump as the protocol
# argument, so nothing could be written. `with` also closes each file so the
# data is flushed to disk.
for artifact, path in (
  (tfidfTransformer, 'tfidfTransformer.pkl'),
  (cv, 'count_vector.pkl'),
  (feature_names, 'feature_names.pkl'),
):
  with open(path, 'wb') as fh:
    pickle.dump(artifact, fh)