In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Fetch the NLTK resources this notebook relies on: the stop-word lists (read
# later through `stopwords.words`) and the 'punkt' tokenizer data
# (presumably needed by the imported `word_tokenize` -- confirm it is used).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rajkhera/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/rajkhera/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the Goodreads export, keeping only the columns this notebook uses.
books_df = pd.read_csv(
    "goodreads_books.csv",
    usecols=[
        'Id',
        'Name',
        'Authors',
        'ISBN',
        'PublishYear',
        'Publisher',
        'Language',
        'Description',
    ],
)

In [4]:
# (rows, columns) of the frame as loaded, before any cleaning.
books_df.shape

(39705, 8)

In [5]:
# Preview the first rows to eyeball the selected columns.
books_df.head()

Unnamed: 0,Id,Name,Authors,ISBN,PublishYear,Publisher,Language,Description
0,1000000,Flight from Eden,Kathryn A. Graham,0595199402,2001,Writer's Showcase Press,,"What could a computer expert, a mercenary with..."
1,1000001,Roommates Again,Kathryn O. Galbraith,0689505973,1994,Margaret K. McElderry Books,,"During their stay at Camp Sleep-Away, sisters ..."
2,1000003,The King At The Door,Brock Cole,0374440417,1992,Farrar Straus Giroux,,A poorly dressed old man appears at an inn and...
3,1000004,"Giotto: The Scrovegni Chapel, Padua",Bruce Cole,080761310X,1993,George Braziller,,This beautiful series lavishly illustrates the...
4,1000005,Larky Mavis,Brock Cole,0374343659,2001,"Farrar, Straus and Giroux (BYR)",,<b>Another orginal picture-book fairy tale</b>...


In [6]:
# Keep only the books that actually have a description (drop NaN descriptions).
books_cleaned_df = books_df[books_df['Description'].notna()]

In [7]:
# Sanity check: no missing descriptions may survive the cleaning step.
# (A bare `sum(...isna())` expression here would be silently discarded, because
# a cell only renders its last expression -- so assert the invariant instead.)
assert books_cleaned_df['Description'].notna().all(), \
    "books_cleaned_df still contains rows with a missing Description"

# (rows, columns) remaining after the rows without a description were removed.
books_cleaned_df.shape

(34559, 8)

In [8]:
import yake

In [9]:
doc1 = """
        This study presents a comparison of different deep learning methods used for sentiment analysis 
        in Twitter data. In this domain, deep learning (DL) techniques, which contribute at the same time 
        to the solution of a wide range of problems, gained popularity among researchers. Particularly, two 
        categories of neural networks are utilized, convolutional neural networks(CNN), which are especially 
        performant in the area of image processing and recurrent neural networks (RNN) which are applied with 
        success in natural language processing (NLP) tasks. In this work we evaluate and compare ensembles 
        and combinations of CNN and a category of RNN the long short-term memory (LSTM) networks. Additionally,
        we compare different word embedding systems such as the Word2Vec and the global vectors for word 
        representation (GloVe) models. For the evaluation of those methods we used data provided by the international 
        workshop on semantic evaluation (SemEval), which is one of the most popular international workshops 
        on the area. Various tests and combinations are applied and best scoring values for each model are 
        compared in terms of their performance. This study contributes to the field of sentiment analysis by 
        analyzing the performances, advantages and limitations of the above methods with an evaluation procedure 
        under a single testing framework with the same dataset and computing environment.
      """

In [10]:
doc2 = """
        The history of the development of statistical hypothesis testing in time series analysis is reviewed briefly 
        and it is pointed out that the hypothesis testing procedure is not adequately defined as the procedure for 
        statistical model identification. The classical maximum likelihood estimation procedure is reviewed and a 
        new estimate minimum information theoretical criterion (AIC) estimate (MAICE) which is designed for the purpose 
        of statistical identification is introduced. When there are several competing models the MAICE is defined 
        by the model and the maximum likelihood estimates of the parameters which give the minimum of AIC defined 
        by AIC = (-2)log-(maximum likelihood) + 2(number of independently adjusted parameters within the model). MAICE 
        provides a versatile procedure for statistical model identification which is free from the ambiguities inherent 
        in the application of conventional hypothesis testing procedure. The practical utility of MAICE in time 
        series analysis is demonstrated with some numerical examples.
       """

In [11]:
# English stop words from NLTK, stored as a set because
# `remove_stop_words_from_keywords` tests membership with `in stop_words`.
stop_words = set(stopwords.words('english'))

In [12]:
def remove_stop_words_from_keywords(keywords, stop_words):
    """Drop every keyword entry whose phrase contains a stop word.

    Args:
        keywords: Iterable of entries whose first element (``entry[0]``) is the
            keyword phrase, e.g. ``(phrase, score)`` tuples.
        stop_words: Container of lower-case stop words supporting ``in``.

    Returns:
        A new list with the surviving entries, in their original order.
    """
    def _is_stop_word_free(phrase):
        # Whitespace-split the phrase and compare each token case-insensitively.
        return all(token.lower() not in stop_words for token in phrase.split())

    return [entry for entry in keywords if _is_stop_word_free(entry[0])]

In [13]:
# Settings forwarded to `yake.KeywordExtractor` when the extractor is built.
language = "en"                   # passed as `lan`
max_ngram_size = 3                # passed as `n`
deduplication_threshold = 0.9     # passed as `dedupLim`
deduplication_algo = 'seqm'       # passed as `dedupFunc`
windowSize = 1                    # passed as `windowsSize`
numOfKeywords = 10                # passed as `top`

In [14]:
# Build one YAKE extractor, configured from the settings cell, and reuse it
# for every document below.
custom_kw_extractor = yake.KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_threshold,
    dedupFunc=deduplication_algo,
    windowsSize=windowSize,
    top=numOfKeywords,
    features=None,
)

In [15]:
# Extract keywords from doc1, drop every phrase that contains a stop word,
# and print the remaining entries one per line.
keywords = custom_kw_extractor.extract_keywords(doc1)
filtered_keywords = remove_stop_words_from_keywords(keywords, stop_words)
for kw in filtered_keywords:
	print(kw)

('deep learning', 0.022316030222952542)
('deep learning methods', 0.023033274943689574)
('Twitter data', 0.027097682190152975)
('neural networks', 0.036406670620403876)
('Twitter', 0.06453868188482255)
('convolutional neural networks', 0.06465373368643157)
('recurrent neural networks', 0.07055699871496693)
('study presents', 0.07282416883388629)


In [16]:
# Same pipeline for doc2. Note this rebinds `keywords` and
# `filtered_keywords`, replacing the doc1 results.
keywords = custom_kw_extractor.extract_keywords(doc2)
filtered_keywords = remove_stop_words_from_keywords(keywords, stop_words)
for kw in filtered_keywords:
	print(kw)

('statistical model identification', 0.010789442479580198)
('hypothesis testing procedure', 0.012530368145606251)
('statistical hypothesis testing', 0.02540919367201189)
('time series analysis', 0.032893935889892506)
('hypothesis testing', 0.03340942932098045)
('testing procedure', 0.034824999228042636)
('maximum likelihood', 0.03631424036749724)
('statistical model', 0.03662978375322814)
('likelihood estimation procedure', 0.04035102042239439)
('model identification', 0.042820350337371825)


In [17]:
import torch

In [None]:
from transformers import BertTokenizer, BertModel

In [18]:
# Load the pretrained BERT tokenizer and encoder under BERT-specific names.
# A later cell rebinds the notebook-global name `model` to a different model,
# so `get_embeddings` must not look up the generic name `model` at call time.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Backward-compatible aliases for any code that still uses the original names.
tokenizer = bert_tokenizer
model = bert_model


def get_embeddings(text):
    """Return a mean-pooled BERT embedding for ``text``.

    Args:
        text: A string (or a list of strings) to embed. Input longer than the
            tokenizer's maximum length is truncated.

    Returns:
        A NumPy array holding the average of the last-layer token vectors,
        with size-1 dimensions squeezed out (a 1-D vector for a single string).
    """
    inputs = bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = bert_model(**inputs)

    token_vectors = outputs.last_hidden_state
    # Average over real tokens only: with padding=True a batch of texts contains
    # pad positions, which a plain `.mean(dim=1)` would wrongly include. For a
    # single string the mask is all ones, so the result matches a plain mean.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_vectors.dtype)
    pooled = (token_vectors * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.squeeze().numpy()

# Example usage: embed each abstract.
# `doc1` is a single string, so iterating over it directly would yield one
# character at a time (one BERT call per character). Iterate over a list of
# abstracts instead.
embeddings = [get_embeddings(abstract) for abstract in [doc1, doc2]]

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
deep_learn_image_detect = "Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers - 8× deeper than VGG nets [40] but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions1, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation."
math_theory_of_comm = "The recent development of various methods of modulation such as PCM and PPM which exchange bandwidth for signal-to-noise ratio has intensified the interest in a general theory of communication. A basis for such a theory is contained in the important papers of Nyquist 1 and Hartley 2 on this subject. In the present paper we will extend the theory to include a number of new factors, in particular the effect of noise in the channel, and the savings possible due to the statistical structure of the original message and due to the nature of the final destination of the information."
auto_licenseplate_recog_img_process = "A vehicle license plate recognition system is an important proficiency that could be used for identification of engine vehicle all over the earth. It is valuable in numerous applications such as entrance admission, security, parking control, road traffic control, and speed control. However, the system only manages to identify the license number and needs an operator to control the collected data. Therefore, this paper proposes an automatic license plate recognition system by using the image processing and template matching approach. The current study aims to increase the efficiency of license plate recognition system for Universiti Malaysia Perlis (UniMAP) smart university. This venture comprises of simulation program to recognize license plate characters where a captured image of vehicles will be the input. Then, these images will be processed using several image processing techniques and optical character recognition method in order to recognize the segmented number plate. The image processing techniques consist of colour conversion, image segmentation using Otsu's thresholding, noise removal, image subtraction, image cropping and bounding box feature. The optical character recognition based on template matching approach is used to analyse the printed characters on the segmented license plate image and to produce an output data consisting of characters. Overall, the proposed automatic vehicle license plate recognition system is capable to perform the recognition process by successfully recognizing license plate of 13 cars, from a total of 14 cars."
stats_rand_signals = "The autocorrelation and power spectral density functions of a random process are two of the most commonly used concepts in signal processing and in its applications. The relations that define them involve the expected value of a double product of the process or of its Fourier transform. Hence, they are based on second-order statistics. The generalization of this idea leads to the so-called cumulant functions and cumulant spectra, therefore higher-order statistics. Theoretically, the higher-order statistics are null for Gaussian signals. Practically, these quantities are not vanishing. In this paper the third-order statistics for different types of random signals are analyzed."
matplotlib_2dgraphics = "Matplotlib is a 2D graphics package used for Python for application development, interactive scripting,and publication-quality image generation across user interfaces and operating systems"

In [26]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize the sentence-embedding model used by `compute_embeddings`.
# NOTE: this rebinds the notebook-global name `model`, which an earlier cell
# bound to the BERT model.
model = SentenceTransformer('all-MiniLM-L6-v2')

def compute_embeddings(texts):
    """Encode a list of texts with the global sentence model.

    Returns the embeddings as a NumPy array (one row per input text).
    """
    encoded = model.encode(texts, convert_to_tensor=True)
    # Move to host memory first so the conversion works regardless of device.
    on_cpu = encoded.cpu()
    return on_cpu.numpy()

def get_similarities(embedding, embeddings):
    """Return the cosine similarity of ``embedding`` to each row of ``embeddings``."""
    # cosine_similarity works on 2-D inputs, so wrap the single query vector in
    # a one-row batch and take that row of the resulting matrix.
    query_batch = [embedding]
    similarity_matrix = cosine_similarity(query_batch, embeddings)
    return similarity_matrix[0]

def rank_abstracts(query_abstract, abstracts):
    """Rank abstracts by cosine similarity to the query abstract.

    Args:
        query_abstract: The text every candidate is compared against.
        abstracts: Sequence of candidate texts.

    Returns:
        A list of ``(abstract, similarity)`` tuples, most similar first.
        An empty list when ``abstracts`` is empty.
    """
    # Nothing to rank: return early rather than encoding and comparing against
    # an empty candidate set.
    if len(abstracts) == 0:
        return []

    # Compute embeddings
    all_embeddings = compute_embeddings(abstracts)
    query_embedding = compute_embeddings([query_abstract])[0]

    # Compute similarities
    similarities = get_similarities(query_embedding, all_embeddings)

    # argsort is ascending, so reverse it to get the most similar first
    ranked_indices = np.argsort(similarities)[::-1]

    return [(abstracts[i], similarities[i]) for i in ranked_indices]

# Example usage: rank the sample abstracts against one of them.
# The query is itself part of the candidate list, so it is compared with itself.
if __name__ == "__main__":
    # Candidate research-paper abstracts (defined in the previous cell)
    abstracts = [
        deep_learn_image_detect, math_theory_of_comm, auto_licenseplate_recog_img_process, stats_rand_signals, matplotlib_2dgraphics
    ]

    # Query abstract
    query_abstract = deep_learn_image_detect

    # Candidates paired with their similarity, most similar first
    ranked_abstracts = rank_abstracts(query_abstract, abstracts)

    # Print each score followed by the full abstract text
    for abstract, score in ranked_abstracts:
        print(f"Similarity Score: {score:.4f}\nAbstract: {abstract}\n")


Similarity Score: 1.0000
Abstract: Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers - 8× deeper than VGG nets [40] but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. The depth of representations is of central importance for many visual recognition tasks. Solely due to our 

**TODO: preprocessing and modelling steps to try next**

- Remove stop words
- Apply stemming
- Tokenize the text
- Try different vectorization techniques