In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import rankdata

'''Packages for preprocessing'''
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
import re
import nltk 
from nltk.tokenize import word_tokenize,sent_tokenize

In [6]:
# Read the dataset CSV into a pandas DataFrame, decoding it as ISO-8859-1.
df = pd.read_csv(
    filepath_or_buffer='Dataset_1.csv',
    encoding='ISO-8859-1',
)

In [7]:
# Shared resources used by the pre-processing code below.
# English stop-word list, held as a set for membership checks.
stop_words = set(stopwords.words("english"))
# WordNet lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

# Patterns are compiled once here instead of on every call: clean_text is
# applied row by row over whole DataFrame columns.
_RE_WSPACE = re.compile(r"\s+", re.IGNORECASE)  # any run of whitespace
_RE_TAGS = re.compile(r"<[^>]+>")  # html-style tags such as <br/>
_RE_NON_ALPHA = re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE)  # everything but letters/space
_RE_SINGLECHAR = re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE)  # stand-alone single letters
# Embedding variants additionally allow the punctuation marks , . ! ?
_RE_NON_ALPHA_EMBED = re.compile(r"[^A-Za-zÀ-ž,.!? ]", re.IGNORECASE)
# NOTE(review): because , . ! ? are part of this class, a punctuation mark that
# sits directly between two letters (e.g. "end.Start") is also replaced by a
# space. Kept as-is to preserve existing output; confirm whether it is intended.
_RE_SINGLECHAR_EMBED = re.compile(r"\b[A-Za-zÀ-ž,.!?]\b", re.IGNORECASE)


def clean_text(text, for_embedding=False):
    """Normalise a piece of raw text and return it as a single cleaned string.

    Steps applied to every input:
        - replace html tags (e.g. ``<br/>``) with a space
        - keep only ASCII / European letters and spaces (digits are dropped);
          when ``for_embedding`` is true the marks ``, . ! ?`` are kept too
        - remove single-letter words
        - collapse every run of whitespace (tabs, newlines, ...) to one space
        - tokenize with ``word_tokenize``

    If not ``for_embedding`` (e.g. for TF-IDF), the tokens are additionally
    lower-cased, stop words are removed and the rest is lemmatized.
    If ``for_embedding``, the tokens are returned unchanged (original case,
    no stop-word removal, no lemmatization).

    Parameters
    ----------
    text : object
        Value to clean. Non-string values are converted with ``str``.
        Missing values (``None``, ``NaN``, ``pd.NA``) yield ``""`` instead of
        being stringified to the word ``"nan"`` / ``"none"``.
    for_embedding : bool, default False
        Select the lighter cleaning described above.

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``""`` when nothing
        is left after cleaning.
    """
    # A missing cell must not turn into the literal token "nan".
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""

    if for_embedding:
        non_alpha, single_char = _RE_NON_ALPHA_EMBED, _RE_SINGLECHAR_EMBED
    else:
        non_alpha, single_char = _RE_NON_ALPHA, _RE_SINGLECHAR

    text = str(text)
    text = _RE_TAGS.sub(" ", text)
    text = non_alpha.sub(" ", text)
    text = single_char.sub(" ", text)
    text = _RE_WSPACE.sub(" ", text)

    # Nothing but whitespace left: the tokenizer would produce no tokens, so
    # the result is the empty string either way.
    if not text.strip():
        return ""

    word_tokens = word_tokenize(text)

    if for_embedding:
        # No lower-casing, lemmatization or stop-word removal.
        words_filtered = word_tokens
    else:
        # Lower-case first so the stop-word comparison is case-insensitive.
        words_filtered = [
            lemmatizer.lemmatize(token)
            for token in (word.lower() for word in word_tokens)
            if token not in stop_words
        ]

    return " ".join(words_filtered)


In [20]:
# Preprocess text data.
# Missing titles/abstracts are blanked first so they are cleaned to an empty
# string rather than being stringified and indexed as the word "nan".
df['Document Title'] = df['Document Title'].fillna('').apply(clean_text)
df['Abstract'] = df['Abstract'].fillna('').apply(clean_text)
# The query goes through the same cleaning as the documents so that both
# share one vocabulary.
query = input('Enter query: ')
query_processed = clean_text(query)

Enter query: computer


In [21]:
# Each document is represented by its cleaned title and abstract joined by a space.
corpus_text = df['Document Title'] + ' ' + df['Abstract']

# Fit the TF-IDF vocabulary on the documents, then map the query into it.
vectorizer = TfidfVectorizer()
document_vectors = vectorizer.fit_transform(corpus_text)
query_vectore = vectorizer.transform([query_processed])

# Cosine similarity of the single query vector against every document vector.
cosine_similarities = cosine_similarity(X=query_vectore, Y=document_vectors)

In [25]:
# Collect the result columns under shorter names, attach each document's
# similarity score and order the rows from best to worst match.
result_df = (
    df[['Document Title', 'Abstract', 'PDF Link', 'Year']]
    .rename(columns={
        'Document Title': 'title',
        'Abstract': 'abstract',
        'PDF Link': 'url',
        'Year': 'year',
    })
    .assign(similarity_score=cosine_similarities.flatten())
    .sort_values(by=['similarity_score'], ascending=False)
)

# Rank 1 is the highest-scoring row; ranks follow the sorted row order.
result_df['ranking'] = np.arange(1, len(result_df) + 1)
result_df


Unnamed: 0,title,abstract,url,year,similarity_score,ranking
21,self testing computer,built test technique exploit hardware redundan...,http://ieeexplore.ieee.org/stamp/stamp.jsp?arn...,1979,0.453355,1
32,fault tolerant computer system,paper review method reliable processing contro...,http://ieeexplore.ieee.org/stamp/stamp.jsp?arn...,1981,0.357799,2
1191,fault tolerant computer automated transfer veh...,matra marconi space developing fourth generati...,http://ieeexplore.ieee.org/stamp/stamp.jsp?arn...,1998,0.324550,3
89,expert system high voltage discharge test,technical advance reduction cost computer hard...,http://ieeexplore.ieee.org/stamp/stamp.jsp?arn...,1987,0.293480,4
0,diagnostic maintenance technique using computer,possible technique attending software need adv...,http://ieeexplore.ieee.org/stamp/stamp.jsp?arn...,1963,0.290579,5
...,...,...,...,...,...,...
2141,approach intelligent detection fault diagnosis...,paper approach intelligent detection fault dia...,http://ieeexplore.ieee.org/stamp/stamp.jsp?arn...,2002,0.000000,5996
2140,umts easycope tool umts network algorithm eval...,umts radio access network problem dimensioning...,http://ieeexplore.ieee.org/stamp/stamp.jsp?arn...,2002,0.000000,5997
2139,test case prioritization family empirical study,reduce cost regression testing software tester...,http://ieeexplore.ieee.org/stamp/stamp.jsp?arn...,2002,0.000000,5998
2138,body knowledge software quality measurement,measuring quality key developing high quality ...,http://ieeexplore.ieee.org/stamp/stamp.jsp?arn...,2002,0.000000,5999
