In [1]:
# Only for installations
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install nltk
!pip install scikit-learn
!pip install langdetect

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
import re
import string as st

import nltk
import numpy as np
import pandas as pd
from langdetect import DetectorFactory, detect, detect_langs
from nltk import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Fetch only the NLTK resources this notebook uses instead of the complete 'all' collection:
# 'stopwords' for remove_stopwords, and 'wordnet' (plus 'omw-1.4', which some NLTK
# releases require alongside it) for WordNetLemmatizer and wordnet.synsets.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

## We define here the functions we are going to apply to the dataframes to preprocess the text

### Function that detects the language of a document by its title

In [None]:
def detect_language(text):
    """Return the language code that ``langdetect.detect`` assigns to ``text``.

    Returns the sentinel string ``'no'`` whenever ``detect`` raises, so the
    function can be applied to a whole column without aborting.
    """
    try:
        return detect(text)
    except Exception:
        # Still best-effort, but KeyboardInterrupt/SystemExit are no longer
        # swallowed the way they were by the previous bare ``except:``.
        return 'no'

### Functions to remove punctuations and non alphanumeric characters

In [None]:
# Remove all punctuations from the text

def remove_punct(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    kept_chars = []
    for char in text:
        if char in st.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)


def remove_no_alphanumeric(text):
    r"""Replace every non-word character (regex ``\W``) of ``text`` with one space.

    Each offending character becomes its own space; runs are not collapsed.
    This is the same output the previous per-character loop produced, but it
    is obtained with a single regex pass instead of one ``re.sub`` call per
    character.
    """
    return re.sub(r'\W', ' ', text)


### Function to split the text by words

In [None]:
def tokenize(text):
    r"""Split ``text`` on runs of whitespace and lower-case every piece.

    Leading or trailing whitespace yields empty-string tokens, exactly as
    ``re.split`` returns them.
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape sequence.
    return [token.lower() for token in re.split(r'\s+', text)]

### Function to remove words with 3 or fewer characters from the text

In [None]:
def remove_small_words(text):
    """Keep only the tokens of ``text`` that are longer than three characters."""
    return list(filter(lambda token: len(token) > 3, text))

### Function to remove all english stopwords from the text that do not give us information

In [None]:
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in NLTK's English stop-word list."""
    # Load the list once per call (it used to be re-read for every token) and
    # keep it in a set so each membership test is constant time.
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    return [word for word in text if word not in english_stopwords]

### Lemmatization of the text

In [None]:
def lemmatize(text):
    """Return the WordNet lemma of every token in ``text``.

    ``WordNetLemmatizer.lemmatize`` is called with its default part of speech.
    """
    lemmatizer = WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, text))

### Pipeline

In [None]:
def return_sentences(tokens):
    """Join ``tokens`` back into a single space-separated string."""
    sentence = " ".join(tokens)
    return sentence

In [None]:
def clean_text(text):
    """Run the whole preprocessing pipeline on a raw string and return a cleaned string.

    The steps are applied in this order: punctuation removal, non-word
    character replacement, tokenisation, short-word removal, stop-word
    removal, lemmatisation, and re-joining of the tokens with spaces.
    """
    pipeline = (
        remove_punct,
        remove_no_alphanumeric,
        tokenize,
        remove_small_words,
        remove_stopwords,
        lemmatize,
        return_sentences,
    )
    res = text
    for step in pipeline:
        res = step(res)
    return res

## We read the 3 files of the dataset, the content of these files is:
- **metadata.csv**: Contains the different documents of the dataset with some information such as the title or abstract
- **topics-rnd3.csv**: Contains the 40 different topics of the dataset, which are going to be used as profiles for the IR system
- **qrels.csv**: Contains the relevance judgements of several documents for all the queries/topics

In [None]:
# Read the three CSV files of the dataset from the working directory.
data, judgements, topics = (
    pd.read_csv(path) for path in ("metadata.csv", "qrels.csv", "topics-rnd3.csv")
)

In [None]:
data.head()

## We delete all the unnecessary columns of the documents

In [None]:
# Columns that are dropped here; every other column of the metadata is kept.
unused_columns = [
    'sha', 'source_x', 'doi', 'pmcid', 'pubmed_id', 'license', 'publish_time', 'authors',
    'journal', 'mag_id', 'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
    'url', 's2_id',
]
data = data.drop(columns=unused_columns)

## All missing and unknown values are removed

In [None]:
data.isnull().sum()

In [None]:
# Turn the literal 'Unknown' into NaN, then drop every row that has any NaN.
data = data.replace('Unknown', np.nan).dropna()

In [None]:
data

## Since the dataset is really big, we decided to make a sample only choosing those documents that have available relevance judgements for the first 10 topics

In [None]:
# Keep the topics whose id is at most 10, then display them.
is_first_ten_topic = topics['topic-id'] <= 10
topics = topics[is_first_ten_topic]
topics

In [None]:
# Same restriction for the judgements; `cods` collects the ids of the judged documents.
is_first_ten_judgement = judgements['topic-id'] <= 10
judgements = judgements[is_first_ten_judgement]
cods = judgements['cord-id'].tolist()
judgements

In [None]:
# Keep only the documents that have a judgement for one of the selected topics.
has_judgement = data['cord_uid'].isin(cods)
data = data.loc[has_judgement]

In [None]:
data

## Afterwards, all texts that are not written in English are removed

In [None]:
# langdetect is non-deterministic unless its seed is fixed; setting it makes the
# language filter reproducible across kernel restarts.
DetectorFactory.seed = 0
# .assign returns a new frame, so no column is written into the filtered selection
# created above (which is what triggers pandas' SettingWithCopyWarning).
data = data.assign(lan=data['title'].apply(detect_language))
data

In [None]:
# Ids of the documents whose title was detected as English.
english_docs = data.loc[data['lan'] == 'en', 'cord_uid'].tolist()

In [None]:
# Restrict both the documents and their judgements to the English documents.
is_english = data['lan'] == 'en'
data = data[is_english]
judged_english = judgements['cord-id'].isin(english_docs)
judgements = judgements[judged_english]

## We apply all the preprocessing steps defined above to the documents

In [None]:
# .assign builds a new frame instead of writing a column into the filtered
# selection, which avoids pandas' SettingWithCopyWarning.
data = data.assign(clean_text=data['abstract'].apply(clean_text))

In [None]:
data

## We apply the same preprocessing to the different queries

In [None]:
# Only the query text of each topic is needed from here on.
topics = topics.loc[:, ['query']]

In [None]:
topics

In [None]:
# .assign builds a new frame instead of writing a column into a selection of the
# original topics table, which avoids pandas' SettingWithCopyWarning.
topics = topics.assign(clean_query=topics['query'].apply(clean_text))

## The queries are expanded using WordNet Thesaurus

In [None]:
def get_synonims(text):
    """Expand a space-separated query with single-word WordNet synonyms.

    Every word of ``text`` is kept in place and followed by the lower-cased
    lemma names of all its WordNet synsets. Lemma names containing ``"_"`` are
    skipped, and each synonym is added at most once. The result is returned as
    one space-separated string.
    """
    res = []
    seen = set()
    for w in text.split(" "):
        # The query word itself is always kept, even if it was already added.
        res.append(w)
        seen.add(w)
        for syn in wordnet.synsets(w):
            for lem in syn.lemmas():
                name = lem.name().lower()
                # The duplicate check must use the lower-cased form that is stored;
                # comparing the raw name let capitalised lemmas be appended again
                # for every synset in which they occur.
                if "_" in name or name in seen:
                    continue
                seen.add(name)
                res.append(name)
    return return_sentences(res)

In [None]:
# .assign builds a new frame instead of writing a column into a selection,
# which avoids pandas' SettingWithCopyWarning.
topics = topics.assign(synonims=topics['clean_query'].apply(get_synonims))
topics

## The vocabulary present in both the documents and the queries is collected so that the same vocabulary is used in both TF-IDF matrices

In [None]:
# A dict keeps the first-seen order of its keys and offers constant-time membership,
# so the vocabulary comes out in the same order as with the previous
# `token not in list` scan, without the quadratic cost.
seen_tokens = {}

doc_text = data['clean_text'].apply(tokenize).tolist()
for tokens in doc_text:
    seen_tokens.update(dict.fromkeys(tokens))

doc_text = topics['synonims'].apply(tokenize).tolist()
for tokens in doc_text:
    seen_tokens.update(dict.fromkeys(tokens))

vocabulary = list(seen_tokens)

## We calculate the TF-IDF for both, the documents and queries (in the case of queries idf component is not used)

In [None]:
# Document-side TF-IDF restricted to the shared vocabulary; the cell displays the matrix shape.
tfidf = TfidfVectorizer(vocabulary=vocabulary)
document_texts = data['clean_text']
tfidf_vect = tfidf.fit_transform(document_texts)
tfidf_vect.shape

In [None]:
# Query-side vectoriser: same vocabulary, but without the IDF component.
tfidf_queries = TfidfVectorizer(vocabulary = vocabulary, use_idf = False)
# Use the query vectoriser itself. The previous call went through `tfidf`, which
# still applied IDF and refitted the document vectoriser on the queries.
tfidf_vect_queries = tfidf_queries.fit_transform(topics['synonims'])
tfidf_vect_queries.shape

## Using the TF-IDF matrices, the cosine similarities between documents and queries are computed

In [None]:
# One row per document, one column per query.
similarity_matrix = cosine_similarity(X=tfidf_vect, Y=tfidf_vect_queries)

In [None]:
# Label the similarity columns with the query text and put the document id first.
queries = topics['query'].tolist()
doc_ids = pd.DataFrame({'doc_id': data['cord_uid'].tolist()})
query_scores = pd.DataFrame(similarity_matrix, columns=queries)
similarity_df = pd.concat([doc_ids, query_scores], axis=1)
similarity_df

## A matrix with the relevance judgements for every document and query pair necessary for evaluation is generated

In [None]:
# Start the judgement matrix from the id and title of every document.
real_values = data.loc[:, ['cord_uid', 'title']]

In [None]:
queries = topics['query'].tolist()
for i, query in enumerate(queries):
    # The i-th query is matched with topic-id i + 1, as before
    # (assumes the topics are stored in increasing topic-id order).
    topic_judgements = judgements[judgements['topic-id'] == i + 1]

    # First judgement found for each document of this topic. A dict lookup replaces
    # the full scan of `judgements` that used to run for every (document, query) pair.
    first_judgement = {}
    for cord_id, judgement in zip(topic_judgements['cord-id'].tolist(),
                                  topic_judgements['judgement'].tolist()):
        first_judgement.setdefault(cord_id, judgement)

    # -1 marks "no judgement available for this document and query".
    col = [first_judgement.get(cord_id, -1) for cord_id in real_values['cord_uid'].tolist()]
    real_values.insert(loc=i + 2, column=query, value=col)

# Judgement 2 is folded into 1, so the matrix only holds -1, 0 and 1.
real_values = real_values.replace(2, 1)
real_values = real_values.drop(columns=['title'])

In [None]:
real_values

## Evaluation of the IR system with the different queries

In [None]:
def evaluate(ex,Q,R):
    """Compute ranking-quality measures for one or more ranked result lists.

    Parameters
    ----------
    ex : str
        Which evaluation to run: 'prec_rec', 'r-prec', 'map', 'roc', 'auc',
        'all' (every one of them) or 'clear' (returns None).
    Q : 2-D array-like, one row per query
        Only its shape is used: the number of queries and the number of ranked
        documents per query.
    R : 2-D array-like, expected to have the same shape as Q
        Relevance labels; column k of a row is treated as the document at
        rank k + 1. Any value greater than 0 counts as relevant, so both the
        {-1, +1} and the {0, 1} encodings give the same result.

    Returns
    -------
    (x_axis, y_axis) : two lists of 1-D arrays, or None when ``ex == 'clear'``
        'prec_rec' appends, per query, recall (x) and precision (y) at every rank.
        'roc'/'auc' append, per query, the false-positive rate (x) and the
        true-positive rate (y), padded with a leading 0 and a trailing 1.
        'all' returns the precision-recall curves of all queries followed by
        the ROC curves of all queries. 'r-prec' and 'map' only print their
        result and leave both lists empty.
    """
    nq = len(Q)
    nd = len(Q[0])
    # Binarise the labels. The previous .5*(R+1) mapping was only valid for
    # {-1, +1} labels and turned a 0 ("not relevant") into 0.5.
    R_ = (np.asarray(R) > 0).astype(float)

    def compute_PR():
        # Precision and recall at every rank, for all queries at once.
        rel = R_[:nq]
        hits = np.cumsum(rel, axis=1)[:, :nd]
        ranks = np.arange(1, hits.shape[1] + 1)
        return hits / ranks, hits / np.sum(rel, axis=1, keepdims=True)

    def compute_TPFP():
        # True- and false-positive rate at every rank, for all queries at once.
        rel = R_[:nq]
        non_rel = 1 - rel
        TP = np.cumsum(rel, axis=1)[:, :nd] / np.sum(rel, axis=1, keepdims=True)
        FP = np.cumsum(non_rel, axis=1)[:, :nd] / np.sum(non_rel, axis=1, keepdims=True)
        return TP, FP

    x_axis, y_axis = [], []

    # Computed once for every mode that needs it. The old `if len(Prec_tot)`
    # guards were inverted, so 'r-prec' and 'map' crashed when requested alone.
    if ex in ('prec_rec', 'r-prec', 'map', 'all'):
        Prec_tot, Rec_tot = compute_PR()

    if ex in ('prec_rec', 'all'):
        for q in range(nq):
            x_axis.append(np.array(Rec_tot[q, :]))
            y_axis.append(np.array(Prec_tot[q, :]))

    if ex in ('r-prec', 'all'):
        for q in range(nq):
            # R-precision: precision at rank r, where r is the number of relevant documents.
            r = int(np.sum(R_[q]))
            r_prec = Prec_tot[q, r - 1] if r > 0 else 0.0
            # This value was previously printed under the label "AP".
            print('R-Prec=%.2f' % r_prec)

    if ex in ('map', 'all'):
        APs = []
        for q in range(nq):
            # Average precision: mean of the precision values at the ranks of the relevant documents.
            APs.append(np.mean(Prec_tot[q, :][np.where(R_[q] == 1)]))
        print('MAP=%.2f' % (np.mean(np.array(APs))))

    if ex in ('roc', 'auc', 'all'):
        TP_tot, FP_tot = compute_TPFP()
        for q in range(nq):
            TP_q_ = np.hstack([0, TP_tot[q, :], 1])
            FP_q_ = np.hstack([0, FP_tot[q, :], 1])

            x_axis.append(np.array(FP_q_))
            y_axis.append(np.array(TP_q_))

            if ex in ('auc', 'all'):
                # Trapezoidal rule over the ROC points; only positive slices are summed.
                AUC = (TP_q_[1:] + TP_q_[:-1]) * np.diff(FP_q_) / 2
                AUC = AUC[AUC > 0]
                print('AUC = %.2f\n\n\n\n' % np.sum(AUC))

    if ex == 'clear':
        return

    return x_axis, y_axis
            
            

In [None]:

import matplotlib.pyplot as plt

fig_roc, ax_roc = plt.subplots(figsize=(16, 8))
fig_pr, ax_pr = plt.subplots(figsize=(16, 8))

# Named `eval_mode` rather than `eval` so the Python builtin is not shadowed
# for the rest of the notebook session.
eval_mode = 'all'

for i in range(len(queries)):
    print("Query " + str(i) + ": " + queries[i])

    # Steps:
    # 1. Selection of the query
    # 2. Selection of the non-negatives (-1 marks "no judgement available")
    # 3. Merge cosine similarities and relevance judgements
    r_i = real_values[real_values[queries[i]] > -1].rename(columns={queries[i]: 'judgement'})
    q_i = similarity_df[similarity_df['doc_id'].isin(r_i['cord_uid'].tolist())].rename(columns = {queries[i]: 'query'})
    r_i = r_i.set_index('cord_uid')[['judgement']]
    q_i = q_i.set_index('doc_id')[['query']]
    rq_i = pd.merge(q_i, r_i, how='left',left_index=True, right_index=True)
    # 4. Sort by decreasing similarity
    rq_i = rq_i.sort_values(by='query', ascending = False)
    # 5. Separate and evaluate
    q_i = rq_i[['query']]
    r_i = rq_i[['judgement']]
    r_i_np = r_i.T.to_numpy()
    q_i_np = q_i.T.to_numpy()

    x_axis, y_axis = evaluate(eval_mode, q_i_np, r_i_np)

    # evaluate() receives a single query here: in 'all' mode index 0 holds the
    # precision-recall curve and index 1 the ROC curve; otherwise the only
    # curve returned is at index 0.
    if eval_mode in ('prec_rec', 'all'):
        ax_pr.plot(x_axis[0], y_axis[0])

    if eval_mode in ('auc', 'roc', 'all'):
        roc_idx = 1 if eval_mode == 'all' else 0
        ax_roc.plot(x_axis[roc_idx], y_axis[roc_idx])

ax_pr.set_title('Precision-recall curve per query')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.legend(queries)

ax_roc.set_title('ROC curve per query')
ax_roc.set_xlabel('FPR')
ax_roc.set_ylabel('TPR')
ax_roc.legend(queries)

# plt.show() renders both figures; Figure.show() only works with a GUI backend
# and emits a warning under the notebook's inline backend.
plt.show()
    

# Comparison

## Basic Preprocessing

In [None]:
# Load the Wikipedia movie plots table and preview its first rows.
moviesdf = pd.read_csv(filepath_or_buffer='wiki-movies.csv')
moviesdf.head()

In [None]:
#moviesdf.info()

### Check Categories 

- There are some repeated links and unknowns. Therefore, we will replace the unknowns with NaNs and drop the rows with repeated links.
- There are films from many Ethnicities. We will focus on the English-speaking ones.

In [None]:
# for col in moviesdf.columns:
#    print(f'=== {col} ===')
#    print(moviesdf[col].value_counts(),'\n\n')

In [None]:
# Keep only the films whose origin is one of the English-speaking categories.
english_speaking_origins = ['American','British','Canadian','Australian']
is_english_speaking = moviesdf['Origin/Ethnicity'].isin(english_speaking_origins)
moviesdf = moviesdf[is_english_speaking]

In [None]:
# .str.lower() leaves missing directors as NaN instead of raising on non-string values,
# and .assign/.replace return new frames, so nothing is written into the filtered
# selection created above (avoids pandas' SettingWithCopyWarning).
moviesdf = (
    moviesdf
    .assign(Director=moviesdf['Director'].str.lower())
    .replace('unknown', np.nan)
)

In [None]:
# Remove repeated Wikipedia links first, then repeated plots.
for duplicate_key in ('Wiki Page', 'Plot'):
    moviesdf = moviesdf.drop_duplicates(subset=[duplicate_key])
#moviesdf['Wiki Page'].value_counts()

### Select and balance Genre categories

- Simplify by keeping only the most frequent top categories (> 500)
- Balance the classes

In [None]:
moviesdf['Genre'].value_counts()[:15]

In [None]:
# Labels of the eight most frequent genres.
genre_counts = moviesdf['Genre'].value_counts()
topGenres = genre_counts.head(8).index

In [None]:
from sklearn.utils import resample

# Split the films into the drama/comedy group and the remaining top genres.
is_drama_or_comedy = moviesdf['Genre'].isin(['drama','comedy'])
is_other_top_genre = moviesdf['Genre'].isin(topGenres.drop(['drama','comedy']))
drama_comedy = moviesdf[is_drama_or_comedy]
rest = moviesdf[is_other_top_genre]

# Downsample the drama/comedy group (taken together) to 2000 rows, without replacement.
downsampled = resample(drama_comedy, replace=False, n_samples=2000, random_state=123)

moviesTOP = pd.concat([downsampled, rest], axis=0)
moviesTOP['Genre'].value_counts()

### Column concatenate
- We will concatenate the columns into a single common text, dropping the NaNs where needed.

In [None]:
# Every column except these three contributes to the text of a film.
namecol = moviesTOP.columns.drop(['Origin/Ethnicity','Release Year','Wiki Page'])


def _join_non_null(row):
    """Join the non-missing values of a row into one space-separated string."""
    return ' '.join(row.dropna().astype(str))


moviesTOP['Message'] = moviesTOP[namecol].apply(_join_non_null, axis=1)

In [None]:
moviesTOP.head()

In [None]:
# Independent copy holding only the text and the genre label of each film.
data = moviesTOP.loc[:, ['Message','Genre']].copy()
data

- Queries

In [None]:
# One pair of preferred genres per simulated user; each pair becomes one query string.
preference_pairs = [
    [" drama", "thriller"],
    [" horror", "adventure"],
    [" comedy", "crime"],
    [" western", "action"],
    [" adventure", "crime"],
]
relevance_jud = pd.DataFrame(
    {"preferences": [", ".join(pair) for pair in preference_pairs]}
)
relevance_jud

## Text Preparation

In [None]:
data['clean_text'] = data['Message'].apply(clean_text)
# Display the new column. The previous stray `[['clean_text']]` was a bare list
# literal, so the cell showed that list instead of the cleaned texts.
data[['clean_text']]

In [None]:
# Same preprocessing and synonym expansion as for the documents' queries above.
relevance_jud['clean_query'] = relevance_jud['preferences'].map(clean_text)
relevance_jud['synonims'] = relevance_jud['clean_query'].map(get_synonims)
relevance_jud[['synonims']]

- Vocabulary

In [None]:
doc_text = data['clean_text'].apply(tokenize).tolist()
# dict.fromkeys keeps the first occurrence of each token in first-seen order, giving
# the same list as the previous `token not in vocabulary` scan without its quadratic cost.
vocabulary = list(dict.fromkeys(token for tokens in doc_text for token in tokens))

In [None]:
doc_text = relevance_jud['synonims'].apply(tokenize).tolist()
# The set mirrors `vocabulary` for constant-time membership tests; the list keeps the order.
known_tokens = set(vocabulary)
for tokens in doc_text:
    for token in tokens:
        if token not in known_tokens:
            known_tokens.add(token)
            vocabulary.append(token)

* TF-IDF

In [None]:
# Document-side TF-IDF restricted to the shared vocabulary; the cell displays the matrix shape.
tfidf = TfidfVectorizer(vocabulary=vocabulary)
document_texts = data['clean_text']
tfidf_vect = tfidf.fit_transform(document_texts)
tfidf_vect.shape

In [None]:
# Query-side vectoriser: same vocabulary, but without the IDF component.
tfidf_queries = TfidfVectorizer(vocabulary = vocabulary, use_idf = False)
# Use the query vectoriser itself. The previous call went through `tfidf`, which
# still applied IDF and refitted the document vectoriser on the queries.
tfidf_vect_queries = tfidf_queries.fit_transform(relevance_jud['synonims'])
tfidf_vect_queries.shape

- Cosine similarities

In [None]:
# One row per film, one column per preference query.
similarity_matrix = cosine_similarity(X=tfidf_vect, Y=tfidf_vect_queries)

In [None]:
# Label each similarity column with its preference string.
queries = relevance_jud['preferences'].tolist()
similarity_df = pd.DataFrame(data=similarity_matrix, columns=queries)
#similarity_df.insert(loc=0, column = 'doc_id', value=data['cord_uid'].tolist())
similarity_df

- relevance judgements matrix

In [None]:
# Renumber the rows 0..n-1 and discard the old index.
data = data.reset_index(drop=True)

In [None]:
# users_rels[d, u] is 1 when the genre of film d is one of the genres preferred by user u.
users_rels = np.zeros((data.shape[0], relevance_jud.shape[0]))

for n, preferences in enumerate(relevance_jud['preferences']):
    # strip() removes the padding around each genre whatever its width; the previous
    # g[1:] cut off the first character and only worked with exactly one leading space.
    gust = [g.strip() for g in preferences.split(',')]
    # Positional boolean mask, so the result no longer depends on data's index being 0..n-1.
    users_rels[data['Genre'].isin(gust).to_numpy(), n] = 1

In [None]:
# Relevance matrix as an integer frame with one column per preference query.
real_values = pd.DataFrame(users_rels, columns=queries)
real_values = real_values.astype(int)
real_values

- Evaluation

In [None]:
fig_roc, ax_roc = plt.subplots(figsize=(16, 8))
fig_pr, ax_pr = plt.subplots(figsize=(16, 8))

# Named `eval_mode` rather than `eval` so the Python builtin is not shadowed
# for the rest of the notebook session.
eval_mode = 'all'

for i in range(len(queries)):
    print("Query " + str(i) + ": " + queries[i])

    # Steps:
    # 1. Selection of the query
    # 2. Merge cosine similarities and relevance judgements (both frames share the same row index)
    r_i = real_values.rename(columns={queries[i]: 'judgement'})
    q_i = similarity_df.rename(columns = {queries[i]: 'query'})
    r_i = r_i[['judgement']]
    q_i = q_i[['query']]
    rq_i = pd.merge(q_i, r_i, how='left',left_index=True, right_index=True)
    # 3. Sort by decreasing similarity
    rq_i = rq_i.sort_values(by='query', ascending = False)
    # 4. Separate and evaluate
    q_i = rq_i[['query']]
    r_i = rq_i[['judgement']]
    r_i_np = r_i.T.to_numpy()
    q_i_np = q_i.T.to_numpy()

    x_axis, y_axis = evaluate(eval_mode, q_i_np, r_i_np)

    # evaluate() receives a single query here: in 'all' mode index 0 holds the
    # precision-recall curve and index 1 the ROC curve; otherwise the only
    # curve returned is at index 0.
    if eval_mode in ('prec_rec', 'all'):
        ax_pr.plot(x_axis[0], y_axis[0])

    if eval_mode in ('auc', 'roc', 'all'):
        roc_idx = 1 if eval_mode == 'all' else 0
        ax_roc.plot(x_axis[roc_idx], y_axis[roc_idx])

ax_pr.set_title('Precision-recall curve per query')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.legend(queries)

ax_roc.set_title('ROC curve per query')
ax_roc.set_xlabel('FPR')
ax_roc.set_ylabel('TPR')
ax_roc.legend(queries)

# plt.show() renders both figures; Figure.show() only works with a GUI backend
# and emits a warning under the notebook's inline backend.
plt.show()