In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import torch
from transformers import BertTokenizer, BertModel
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
import yake
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize, MinMaxScaler

In [None]:
# Fetch the NLTK data packages used below: tokenizer models ('punkt',
# 'punkt_tab'), the stop-word lists and the WordNet corpus for lemmatizing.
for nltk_package in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_package)

# spaCy is imported but its pipeline is currently disabled:
# nlp = spacy.load('en_core_web_sm')

# Shared text-processing objects.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # unigrams and bigrams

[nltk_data] Downloading package punkt to /Users/rajkhera/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rajkhera/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rajkhera/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/rajkhera/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# NOTE(review): this rebuilds the same English stop-word set that the setup
# cell earlier in the file already assigns to `stop_words`.
stop_words = set(stopwords.words('english'))

In [None]:
# Sample research-paper abstracts; they form the demo corpus that the
# similarity-ranking example further down passes to rank_abstracts().
deep_learn_image_detect = "Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers - 8× deeper than VGG nets [40] but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions1, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation."
math_theory_of_comm = "The recent development of various methods of modulation such as PCM and PPM which exchange bandwidth for signal-to-noise ratio has intensified the interest in a general theory of communication. A basis for such a theory is contained in the important papers of Nyquist 1 and Hartley 2 on this subject. In the present paper we will extend the theory to include a number of new factors, in particular the effect of noise in the channel, and the savings possible due to the statistical structure of the original message and due to the nature of the final destination of the information."
auto_licenseplate_recog_img_process = "A vehicle license plate recognition system is an important proficiency that could be used for identification of engine vehicle all over the earth. It is valuable in numerous applications such as entrance admission, security, parking control, road traffic control, and speed control. However, the system only manages to identify the license number and needs an operator to control the collected data. Therefore, this paper proposes an automatic license plate recognition system by using the image processing and template matching approach. The current study aims to increase the efficiency of license plate recognition system for Universiti Malaysia Perlis (UniMAP) smart university. This venture comprises of simulation program to recognize license plate characters where a captured image of vehicles will be the input. Then, these images will be processed using several image processing techniques and optical character recognition method in order to recognize the segmented number plate. The image processing techniques consist of colour conversion, image segmentation using Otsu's thresholding, noise removal, image subtraction, image cropping and bounding box feature. The optical character recognition based on template matching approach is used to analyse the printed characters on the segmented license plate image and to produce an output data consisting of characters. Overall, the proposed automatic vehicle license plate recognition system is capable to perform the recognition process by successfully recognizing license plate of 13 cars, from a total of 14 cars."
stats_rand_signals = "The autocorrelation and power spectral density functions of a random process are two of the most commonly used concepts in signal processing and in its applications. The relations that define them involve the expected value of a double product of the process or of its Fourier transform. Hence, they are based on second-order statistics. The generalization of this idea leads to the so-called cumulant functions and cumulant spectra, therefore higher-order statistics. Theoretically, the higher-order statistics are null for Gaussian signals. Practically, these quantities are not vanishing. In this paper the third-order statistics for different types of random signals are analyzed."
matplotlib_2dgraphics = "Matplotlib is a 2D graphics package used for Python for application development, interactive scripting,and publication-quality image generation across user interfaces and operating systems"
neural_networks_quantization_refine = "Deploying neural networks (NNs) in low-resource domains is challenging because of their high computing, memory, and power requirements. For this reason, NNs are often quantized before deployment, but such an approach degrades their accuracy. Thus, we propose the counterexample-guided neural network quantization refinement (CEG4N) framework, which combines search-based quantization and equivalence checking. The former minimizes computational requirements, while the latter guarantees that the behavior of an NN does not change after quantization. We evaluate CEG4N on a diverse set of benchmarks, including large and small NNs. Our technique successfully quantizes the networks in the chosen evaluation set, while producing models with up to 163% better accuracy than state-of-the-art techniques."

In [None]:
def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, turn hyphens into spaces, strip punctuation
    and other special characters, strip digits, tokenize with NLTK, drop
    English stop words, and lemmatize with the WordNet lemmatizer.

    Args:
        text: The raw text to clean.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        nothing survives the filtering).
    """
    # Lowercase, and split hyphenated compounds into separate words.
    text = text.lower().replace('-', ' ')

    # Remove everything that is neither a word character nor whitespace.
    # Underscores count as word characters for \w, so they are kept.
    text = re.sub(r'[^\w\s]', '', text)

    # Strip digits in place, so "cifar10" becomes "cifar" and "1st" becomes
    # "st". This used to be followed by a second substitution aimed at whole
    # tokens containing digits; it could never match because no digit is left
    # after this line, so it was removed without changing the output.
    text = re.sub(r'\d+', '', text)

    # Tokenize, drop stop words, then lemmatize what remains.
    tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]

    return ' '.join(tokens)

In [None]:
# Loaded SentenceTransformer instances keyed by model name, so repeated calls
# reuse an already-loaded model instead of loading it again.
_SENTENCE_MODELS = {}


def _get_sentence_model(model_name):
    """Return the SentenceTransformer for *model_name*, loading it only once."""
    if model_name not in _SENTENCE_MODELS:
        _SENTENCE_MODELS[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODELS[model_name]


def compute_embeddings(texts, model_name='all-MiniLM-L6-v2'):
    """Embed each text with a SentenceTransformer after preprocessing it.

    Args:
        texts: Iterable of raw text strings; each one is passed through
            preprocess_text() before being encoded.
        model_name: Name of the SentenceTransformer model to use. Defaults to
            the model this function has always used.

    Returns:
        The encoded embeddings as a NumPy array on the CPU, in input order.
    """
    preprocessed_texts = [preprocess_text(text) for text in texts]

    # A TF-IDF matrix used to be fitted here and then thrown away. Besides
    # wasting work, fitting raised ValueError (empty vocabulary) whenever all
    # texts preprocessed to nothing, so it has been removed.
    model = _get_sentence_model(model_name)
    embeddings = model.encode(preprocessed_texts, convert_to_tensor=True)

    # Move to CPU first so the conversion also works when the model ran on an
    # accelerator device.
    return embeddings.cpu().numpy()

def get_similarities(embedding, embeddings):
    """Return the cosine similarity of one embedding against each of many.

    Args:
        embedding: A single embedding vector.
        embeddings: A collection of embedding vectors to compare against.

    Returns:
        The first (and only) row of the pairwise cosine-similarity matrix,
        i.e. one score per entry of *embeddings*.
    """
    # cosine_similarity works on 2-D inputs, so wrap the query as one row.
    query_rows = [embedding]
    similarity_matrix = cosine_similarity(query_rows, embeddings)
    return similarity_matrix[0]

def rank_abstracts(query_abstract, abstracts):
    """Rank *abstracts* by cosine similarity to *query_abstract*.

    Args:
        query_abstract: Text of the abstract to compare against.
        abstracts: Sequence of candidate abstract texts, indexable by
            position.

    Returns:
        A list of ``(original_abstract, preprocessed_abstract, similarity)``
        tuples ordered from most to least similar. An empty list is returned
        when *abstracts* is empty.
    """
    # Nothing to rank: return early instead of handing an empty batch to the
    # embedding pipeline. len() is used rather than truthiness so array-like
    # inputs are accepted as well.
    if len(abstracts) == 0:
        return []

    # Only needed for the returned tuples: compute_embeddings() preprocesses
    # its inputs itself, so the raw texts are passed to it below.
    preprocessed_abstracts = [preprocess_text(text) for text in abstracts]

    # Compute embeddings
    all_embeddings = compute_embeddings(abstracts)
    query_embedding = compute_embeddings([query_abstract])[0]

    # Compute similarities
    similarities = get_similarities(query_embedding, all_embeddings)

    # argsort is ascending; reverse it so the best match comes first.
    ranked_indices = np.argsort(similarities)[::-1]

    return [
        (abstracts[i], preprocessed_abstracts[i], similarities[i])
        for i in ranked_indices
    ]

# Example usage
if __name__ == "__main__":

    # Demo corpus: the sample research-paper abstracts defined above.
    abstracts = [
        deep_learn_image_detect,
        math_theory_of_comm,
        auto_licenseplate_recog_img_process,
        stats_rand_signals,
        matplotlib_2dgraphics,
        neural_networks_quantization_refine,
    ]

    # The query is itself a member of the corpus, so it should rank first.
    query_abstract = deep_learn_image_detect

    # Rank every abstract against the query.
    ranked_abstracts = rank_abstracts(query_abstract, abstracts)

    # Report each result, best match first, with a blank line after each.
    for original, preprocessed, score in ranked_abstracts:
        print(
            f"Similarity Score: {score:.4f}",
            f"Original Abstract: {original}",
            f"Preprocessed Abstract: {preprocessed}\n",
            sep="\n",
        )



Similarity Score: 1.0000
Original Abstract: Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers - 8× deeper than VGG nets [40] but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. The depth of representations is of central importance for many visual recognition tasks. Solely du

In [None]:
# Load the publications table from CSV.
# NOTE(review): the path is absolute and machine-specific; the captured run of
# this cell failed with FileNotFoundError, so confirm where the file lives.
df = pd.read_csv("/Users/rajkhera/Book-Recommendations-1/Webscraping/publications.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/rajkhera/Book-Recommendations-1/Webscraping/publications.csv'

In [None]:
# Preview the first rows of the publications table.
df.head()

Unnamed: 0,Title,Author,Abstract,Year,Journal/Conference Name,Conference or Journal,Publisher
0,Deep Residual Learning for Image Recognition,"Xiangyu Zhang, Shaoqing Ren, Jian Sun, Kaiming He",Deeper neural networks are more difficult to t...,2016,2016 IEEE Conference on Computer Vision and Pa...,Conference Paper,IEEE
1,A mathematical theory of communication,C. E. Shannon,The recent development of various methods of m...,1948,The Bell System Technical Journal,Journal Article,Nokia Bell Labs
2,A new look at the statistical model identifica...,H. Akaike,The history of the development of statistical ...,1974,IEEE Transactions on Automatic Control,Journal Article,IEEE
3,Image quality assessment: from error visibilit...,"H.R. Sheikh, A.C. Bovik, Zhou Wang, E.P. Simon...",Objective methods for assessing perceptual ima...,2004,IEEE Transactions on Image Processing,Journal Article,IEEE
4,Gradient-based learning applied to document re...,"Y. Bengio, Y. Lecun, P. Haffner, L. Bottou",Multilayer neural networks trained with the ba...,1998,Proceedings of the IEEE,Journal Article,IEEE


In [None]:
# Collect the 'Title' column as a plain Python list.
title_list = df['Title'].to_list()

In [None]:
# Display the collected titles.
title_list

['Deep Residual Learning for Image Recognition',
 'A mathematical theory of communication',
 'A new look at the statistical model identification',
 'Image quality assessment: from error visibility to structural similarity',
 'Gradient-based learning applied to document recognition',
 'ImageNet: A large-scale hierarchical image database',
 'A fast and elitist multiobjective genetic algorithm: NSGA-II',
 'Particle swarm optimization',
 'Going deeper with convolutions',
 'You Only Look Once: Unified, Real-Time Object Detection',
 'Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks',
 'Densely Connected Convolutional Networks',
 'Fully convolutional networks for semantic segmentation',
 'Compressed sensing',
 'A Computational Approach to Edge Detection',
 'Long Short-Term Memory',
 'Fast R-CNN',
 'Rich Feature Hierarchies for Accurate Object Detection and Semantic Segmentation',
 'Mask R-CNN',
 'Histograms of oriented gradients for human detection',
 'Textural F