## distribute data

### data info

In [186]:
CISIDATA = '../../../../CISI/CISI.ALL'
import re
IDMarker = re.compile('(\.I.)')
allMarkers = re.compile('(\.[ITABWX] )')

### queries info

In [187]:
CISIQUERY = '../../../../CISI/CISI.QRY'
CISIQRELS = '../../../../CISI/CISI.REL'
import re
queryMarkers = re.compile('(\.[ITAWB] )')

In [188]:
def getData(PATH, marker):
    """get the data from the file and split it by ID"""
    with open(PATH, 'r') as f:
        t = f.read().replace('\n', ' ')
        lines = re.split(marker, t)
        lines.pop(0)
    return lines

### converte CISI.ALL

In [189]:
cisiData = getData(CISIDATA, allMarkers)

In [190]:
import pandas as pd
dataFrame = pd.DataFrame(columns=['.I','.T','.A','.B','.W','.X'])
seriesDict:dict = {
    '.I': None,
    '.T': None,
    '.A': None,
    '.B': None,
    '.W': None,
    '.X': None
}
seriesData = seriesDict.copy()
notTheFirst = False
for i in range(0, len(cisiData), 2):
    if (notTheFirst and cisiData[i].strip() == '.I'):
        dataFrame = dataFrame.append(seriesData, ignore_index=True)
        seriesData = seriesDict.copy()
    
    seriesData[cisiData[i].strip()] = cisiData[i+1].strip()
    notTheFirst = True
dataFrame = dataFrame.append(seriesData, ignore_index=True)
dataFrame.set_index('.I', inplace=True)
dataFrame.head()

Unnamed: 0_level_0,.T,.A,.B,.W,.X
.I,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,18 Editions of the Dewey Decimal Classifications,"Comaromi, J.P.",,The present study is a history of the DEWEY De...,1\t5\t1 92\t1\t1 262\t1\t1 556\t1\t1 1004\t1\t...
2,Use Made of Technical Libraries,"Slater, M.",,This report is an analysis of 6300 acts of use...,2\t5\t2 32\t1\t2 76\t1\t2 132\t1\t2 137\t1\t2 ...
3,Two Kinds of Power An Essay on Bibliographic C...,"Wilson, P.",,The relationships between the organization and...,3\t7\t3 42\t1\t3 172\t1\t3 268\t1\t3 292\t1\t3...
4,Systems Analysis of a University Library; fin...,"Buckland, M.K.",,The establishment of nine new universities in ...,4\t10\t4 5\t2\t4 9\t1\t4 32\t1\t4 65\t1\t4 96\...
5,A Library Management Game: a report on a resea...,"Brophy, P.",,Although the use of games in professional educ...,4\t2\t5 5\t6\t5 90\t1\t5 91\t1\t5 115\t1\t5 15...


In [191]:
dataFrame.to_csv('../../cisiData/cisiCsv.csv')

### converte query.text

In [192]:
cisiQuery = getData(CISIQUERY, queryMarkers)

In [193]:
import pandas as pd
qDataFrame = pd.DataFrame(columns=['.I','.T','.A','.W','.B'])
seriesDict:dict = {
    '.I': None,
    '.T': None,
    '.A': None,
    '.W': None,
    '.B': None
}
seriesData = seriesDict.copy()
notTheFirst = False
for i in range(0, len(cisiQuery), 2):
    if (notTheFirst and cisiQuery[i].strip() == '.I'):
        qDataFrame = qDataFrame.append(seriesData, ignore_index=True)
        seriesData = seriesDict.copy()
    
    seriesData[cisiQuery[i].strip()] = cisiQuery[i+1].strip()
    notTheFirst = True
qDataFrame = qDataFrame.append(seriesData, ignore_index=True)
qDataFrame.set_index('.I', inplace=True)
qDataFrame.head()

Unnamed: 0_level_0,.T,.A,.W,.B
.I,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,What problems and concerns are there in making...,
2,,,"How can actually pertinent data, as opposed to...",
3,,,What is information science? Give definitions...,
4,,,Image recognition and any other methods of aut...,
5,,,What special training will ordinary researcher...,


In [194]:
qDataFrame.to_csv('../../cisiData/cisiQueryCsv.csv')

### converte qrels.text

In [195]:
import pandas as pd

def getRles(path):
    with open(path, 'r') as f:
        global qrlesList
        qrlesList = f.read().split('\n')
        return qrlesList

qrelsData = getRles(CISIQRELS)
qrelsFrame = pd.DataFrame(columns=['.I', 'data'])
seriesDict:dict = {'.I':None, 'data':None}
seriesData = seriesDict.copy()
for i in qrelsData:
    try:
        element = i.split()
        seriesData['.I'] = int(element[0])
        seriesData['data'] = int(element[1])
        qrelsFrame = qrelsFrame.append(seriesData, ignore_index=True)
    except:
        pass
qrelsFrame.head()

Unnamed: 0,.I,data
0,1,28
1,1,35
2,1,38
3,1,42
4,1,43


In [196]:
qrelsFrame.to_csv('../../cisiData/cisiQRels.csv')

## clean preproccesing

### CISI

In [197]:
import pandas as pd
df:pd.DataFrame = pd.read_csv('../../cisiData/cisiCsv.csv', index_col='.I')
df.head()

Unnamed: 0_level_0,.T,.A,.B,.W,.X
.I,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,18 Editions of the Dewey Decimal Classifications,"Comaromi, J.P.",,The present study is a history of the DEWEY De...,1\t5\t1 92\t1\t1 262\t1\t1 556\t1\t1 1004\t1\t...
2,Use Made of Technical Libraries,"Slater, M.",,This report is an analysis of 6300 acts of use...,2\t5\t2 32\t1\t2 76\t1\t2 132\t1\t2 137\t1\t2 ...
3,Two Kinds of Power An Essay on Bibliographic C...,"Wilson, P.",,The relationships between the organization and...,3\t7\t3 42\t1\t3 172\t1\t3 268\t1\t3 292\t1\t3...
4,Systems Analysis of a University Library; fin...,"Buckland, M.K.",,The establishment of nine new universities in ...,4\t10\t4 5\t2\t4 9\t1\t4 32\t1\t4 65\t1\t4 96\...
5,A Library Management Game: a report on a resea...,"Brophy, P.",,Although the use of games in professional educ...,4\t2\t5 5\t6\t5 90\t1\t5 91\t1\t5 115\t1\t5 15...


In [198]:
df.describe()

Unnamed: 0,.T,.A,.B,.W,.X
count,1460,1460,24,1460,1460
unique,1431,1194,17,1459,1460
top,Progress in Documentation,"Lancaster, F.W.",1970,The essentially logistical problem of making l...,1\t5\t1 92\t1\t1 262\t1\t1 556\t1\t1 1004\t1\t...
freq,5,9,4,2,1


In [199]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   .T      1460 non-null   object
 1   .A      1460 non-null   object
 2   .B      24 non-null     object
 3   .W      1460 non-null   object
 4   .X      1460 non-null   object
dtypes: object(5)
memory usage: 68.4+ KB


In [200]:
print(df.loc[:,'.T'].isnull().value_counts(), end='\n\n')
print(df.loc[:,'.W'].isnull().value_counts(), end='\n\n')
print(df.loc[:,'.A'].isnull().value_counts())

False    1460
Name: .T, dtype: int64

False    1460
Name: .W, dtype: int64

False    1460
Name: .A, dtype: int64


In [201]:
df.fillna('', inplace=True)

In [202]:
df.duplicated().value_counts()

False    1460
dtype: int64

In [203]:
df.to_csv('../../cisiData/cisiDataCleaned.csv')

#### methods

##### lowercase

In [204]:
def toLower(text):
    return text.lower()

##### Numbers to words

In [205]:
import inflect
p = inflect.engine()

import re
reg = r'([0-9]+)'

def isFLoat(strNum):
    try:
        float(strNum)
        return True
    except:
        return False


def converteNumbers(text):
    tempText = text.split()
    newText = []
    for word in tempText:
        tempList = re.split(reg,word)
        for miniWord in tempList:
            if miniWord.isdigit() or isFLoat(miniWord):
                temp = p.number_to_words(miniWord)
                newText.append(removePunctuation(temp))
            else:
                newText.append(miniWord)        
    tempText = ' '.join(newText)
    return tempText
    # return text

##### remove punctuation

In [206]:
import string
translator = str.maketrans(string.punctuation,' '*len(string.punctuation))
def removePunctuation(text):
    global translator
    return text.translate(translator)


##### remove whitespaces

In [207]:
def removeWhiteSpace(text):
    return " ".join(text.split())

##### remove stop words

In [208]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def removeStopWords(text):
    sw = set(stopwords.words("english"))
    wt = word_tokenize(text)
    filteredText = [word for word in wt if word not in sw]
    return ' '.join(filteredText)
    # return text

##### stemming

In [209]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()
def stemWords(text):
    global stemmer
    wt = word_tokenize(text)
    stems = []
    for word in wt:
        temp = stemmer.stem(word)
        # if not temp == word:
        #     temp = correctWords(temp)
        stems.append(temp)
    return ' '.join(stems)
    # return text

##### lemmatization

In [210]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from nltk import pos_tag, defaultdict

lemmatizer = WordNetLemmatizer()

tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV




def lemmatizeWords(text):
    # wt = word_tokenize(text)
    # lemmas = [lemmatizer.lemmatize(word, pos='a') for word in wt]
    # return ' '.join(lemmas)
    # return text

    tokens = word_tokenize(text)
    lmtzr = WordNetLemmatizer()
    lemmas = [lmtzr.lemmatize(token, tag_map[tag[0]]) for token, tag in pos_tag(tokens) ]
    return ' '.join(lemmas)
# lemmatizeWords('hard')

##### correcting

In [211]:
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams
from nltk.corpus import words
correct_words = words.words()
incorrectWords = '''preliminari'''.split()
result = []
def correctWords(text):
    for word in text:
        try:
            temp = [(jaccard_distance(set(ngrams(word, 2)),
                                      set(ngrams(w, 2))),w)
                                      for w in correct_words if w[0] == word[0]]
            result.append(sorted(temp, key = lambda val:val[0])[0][1])
        except:
            pass
    return ' '.join(result)
# correctWords(incorrectWords)

#### cisi process

In [23]:
import pandas as pd
import re

def TitlePreProcesse(t):
    tempText = toLower(t)
    tempText = removePunctuation(tempText)
    tempText = converteNumbers(tempText)
    tempText = removeWhiteSpace(tempText)
    tempText = removeStopWords(tempText)
    tempText = stemWords(tempText)
    tempText = lemmatizeWords(tempText)
    return tempText

def abstractPreProcesse(a):
    tempText = toLower(a)
    tempText = removePunctuation(tempText)
    tempText = converteNumbers(tempText)
    tempText = removeWhiteSpace(tempText)
    tempText = removeStopWords(tempText)
    tempText = stemWords(tempText)
    tempText = lemmatizeWords(tempText)
    return tempText

# i didn't do it yet on cisi (converte date to timestamp)
def publicationPreProcesse(p): 
    # tempText = p.replace('cisi ','')
    # return pd.to_datetime(tempText)
    return p
    
def authorPreProcesse(a):
    tempText = toLower(a)
    lis = tempText.split(' ')
    names = ' '
    l = []
    for word in lis:
      if ',' in word:
          l.append(removePunctuation(word.replace(',','')))
    names = ' '.join(l)
    return names


In [40]:
import pandas as pd
def preprocessedData(dataFrame:pd.DataFrame):
    pdataFrame = pd.DataFrame()
    seriesDict:dict = {} 
    seriesData = seriesDict.copy()
    for i in dataFrame.index:
        try:
            templist = []
            tempT = tempA = tempB = tempW = None
            if not dataFrame.loc[i, '.T'] == '':
                # templist.append(TitlePreProcesse(dataFrame.loc[i, '.T']))
                tempT = TitlePreProcesse(dataFrame.loc[i, '.T'])
            if not dataFrame.loc[i, '.A'] == '':
                # templist.append(authorPreProcesse(dataFrame.loc[i, '.A']))
                tempA = authorPreProcesse(dataFrame.loc[i, '.A'])
            if not dataFrame.loc[i, '.B'] == '':
                tempB = publicationPreProcesse(dataFrame.loc[i, '.B'])
            if not dataFrame.loc[i, '.W'] == '':
                # templist.append(abstractPreProcesse(dataFrame.loc[i, '.W']))
                tempW = abstractPreProcesse(dataFrame.loc[i, '.W'])

            # if not dataFrame.loc[i, '.K'] == '':
            #     templist.append(kPreProcesse(dataFrame.loc[i, '.K']))
            # if not dataFrame.loc[i, '.C'] == '':
            #     tempC = cPreProcesse(dataFrame.loc[i, '.C'])


            seriesData['.I'] = i
            # seriesData['data'] = ' '.join(templist)
            seriesData['.T'] = tempT
            seriesData['.A'] = tempA
            seriesData['.B'] = tempB
            seriesData['.W'] = tempW
            
            pdataFrame = pdataFrame.append(seriesData, ignore_index=True)
        except:
            print(i)
            raise 
    
    pdataFrame.set_index('.I', inplace=True)
    pdataFrame.fillna('', inplace=True)
    return pdataFrame


In [41]:
import pandas as pd
data = pd.read_csv('../../cisiData/cisiDataCleaned.csv')
data.set_index('.I', inplace=True)
data.fillna('', inplace=True)
data.head()

Unnamed: 0_level_0,.T,.A,.B,.W,.X
.I,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,18 Editions of the Dewey Decimal Classifications,"Comaromi, J.P.",,The present study is a history of the DEWEY De...,1\t5\t1 92\t1\t1 262\t1\t1 556\t1\t1 1004\t1\t...
2,Use Made of Technical Libraries,"Slater, M.",,This report is an analysis of 6300 acts of use...,2\t5\t2 32\t1\t2 76\t1\t2 132\t1\t2 137\t1\t2 ...
3,Two Kinds of Power An Essay on Bibliographic C...,"Wilson, P.",,The relationships between the organization and...,3\t7\t3 42\t1\t3 172\t1\t3 268\t1\t3 292\t1\t3...
4,Systems Analysis of a University Library; fin...,"Buckland, M.K.",,The establishment of nine new universities in ...,4\t10\t4 5\t2\t4 9\t1\t4 32\t1\t4 65\t1\t4 96\...
5,A Library Management Game: a report on a resea...,"Brophy, P.",,Although the use of games in professional educ...,4\t2\t5 5\t6\t5 90\t1\t5 91\t1\t5 115\t1\t5 15...


In [42]:
processedDAta = preprocessedData(data)
processedDAta.head()

Unnamed: 0_level_0,.T,.A,.B,.W
.I,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,eighteen edit dewey decim classif,comaromi,,present studi histori dewey decim classif firs...
2.0,use make technic librari,slater,,report analysi six thousand three hundr act us...
3.0,two kind power essay bibliograph control,wilson,,relationship organ control write organ control...
4.0,system analysi univers librari final report re...,buckland,,establish nine new univers one thousand nine h...
5.0,librari manag game report research project,brophy,,although use game profession educ becom widesp...


In [43]:
processedDAta.to_csv('../../cisiData/cisiDataPreprocessed.csv')

#### query process

In [47]:
import pandas as pd
import re


def qTitlePreProcesse(t):
    tempText = toLower(t)
    tempText = removePunctuation(tempText)
    tempText = converteNumbers(tempText)
    tempText = removeWhiteSpace(tempText)
    tempText = removeStopWords(tempText)
    tempText = stemWords(tempText)
    tempText = lemmatizeWords(tempText)
    return tempText
    # return t

def qAbstractPreProcesse(a):
    tempText = toLower(a)
    tempText = removePunctuation(tempText)
    tempText = converteNumbers(tempText)
    tempText = removeWhiteSpace(tempText)
    tempText = removeStopWords(tempText)
    tempText = stemWords(tempText)
    tempText = lemmatizeWords(tempText)
    return tempText
    # return a

def qAuthorPreProcesse(a):
    tempText = toLower(a)
    lis = tempText.split(' ')
    names = ' '
    l = []
    for word in lis:
      if ',' in word:
          l.append(removePunctuation(word.replace(',','')))
    names = ' '.join(l)
    return names

def qPublicationPreProcesse(p):
    # tempText = p.replace('cisi ','')
    # return pd.to_datetime(tempText)
    return p


In [49]:
import pandas as pd
def preprocesseQuery(dataFrame:pd.DataFrame):
    pdataFrame = pd.DataFrame() 
    seriesDict:dict = {} 
    seriesData = seriesDict.copy()
    for i in dataFrame.index:
        try:
            templist = []
            tempT = tempA = tempW = tempB = None
            if not dataFrame.loc[i, '.T'] == '':
                # templist.append(qTitlePreProcesse(dataFrame.loc[i, '.T']))
                tempT = qTitlePreProcesse(dataFrame.loc[i, '.T'])
            if not dataFrame.loc[i, '.A'] == '':
                # templist.append(qAuthorPreProcesse(dataFrame.loc[i, '.A']))
                tempA = qAuthorPreProcesse(dataFrame.loc[i, '.A'])
            if not dataFrame.loc[i, '.W'] == '':
                # templist.append(qAbstractPreProcesse(dataFrame.loc[i, '.W']))
                tempW = qAbstractPreProcesse(dataFrame.loc[i, '.W'])
            if not dataFrame.loc[i, '.B'] == '':
                tempB = qPublicationPreProcesse(dataFrame.loc[i, '.B'])


            seriesData['.I'] = i
            # seriesData['data'] = ' '.join(templist)
            seriesData['.T'] = tempT
            seriesData['.A'] = tempA
            seriesData['.W'] = tempW
            seriesData['.B'] = tempB
            pdataFrame = pdataFrame.append(seriesData, ignore_index=True)
        except:
            print(i)
            raise 
    
    pdataFrame.set_index('.I', inplace=True)
    pdataFrame.fillna('', inplace=True)
    return pdataFrame


In [45]:
import pandas as pd
querydf = pd.read_csv('../../cisiData/cisiQueryCsv.csv')
querydf.set_index('.I', inplace=True)
querydf.fillna('', inplace=True)
querydf.head()

Unnamed: 0_level_0,.T,.A,.W,.B
.I,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,What problems and concerns are there in making...,
2,,,"How can actually pertinent data, as opposed to...",
3,,,What is information science? Give definitions...,
4,,,Image recognition and any other methods of aut...,
5,,,What special training will ordinary researcher...,


In [50]:
preprocessedQuery = preprocesseQuery(querydf)
preprocessedQuery.head()

Unnamed: 0_level_0,.T,.A,.W,.B
.I,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,,,problem concern make descript titl difficulti ...,
2.0,,,actual pertin data oppos refer entir articl re...,
3.0,,,inform scienc give definit possibl,
4.0,,,imag recognit method automat transform print t...,
5.0,,,special train ordinari research businessmen ne...,


In [51]:
preprocessedQuery.to_csv('../../cisiData/cisiQueryPreprocessed.csv')

## indexing model

In [37]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import  cosine_similarity, linear_kernel

data = pd.read_csv('../../cisiData/cisiDataPreprocessed.csv')
data.fillna('', inplace=True)

# with open('../../../cisi/common_words', 'r') as f:
#     global commonWords
#     commonWords = f.read().split('\n')   

tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
tfidfTable = tfidf.fit_transform(data['data'])

def search(query,n:int):
    querytfidf = tfidf.transform([query])
    cos = cosine_similarity(querytfidf,tfidfTable).flatten()
    # print(cos)
    resultList = cos.argsort(axis=0)[-n:][::-1]
    # print(resultList)
    ls = []
    ids = []
    for i in resultList:
        ids.append(data.loc[i,'.I'])

    return ids


In [181]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline

data = pd.read_csv('../../cisiData/cisiDataPreprocessed.csv')
data.fillna('', inplace=True)

transformer = FeatureUnion([
                ('title_tfidf', 
                  Pipeline([('extract_field',
                              FunctionTransformer(lambda x: x['.T'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('author_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['.A'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('abstract_tfidf',
                 Pipeline([('extract_field',
                            FunctionTransformer(lambda x: x['.W'],
                                                  validate=False)),
                            ('tfidf',
                              TfidfVectorizer())]))])
tfidfTable = transformer.fit_transform(data)

In [182]:
tfidfTable.toarray()[1000:20000]

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.5978547 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.51193249, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [173]:
def search(query,n:int):
    querytfidf = transformer.transform(query)
    cos = cosine_similarity(querytfidf,tfidfTable).flatten()
    # print(cos)
    resultList = cos.argsort(axis=0)[-n:][::-1]
    # print(resultList)
    ls = []
    ids = []
    for i in resultList:
        ids.append(data.loc[i,'.I'])

    return ids


search(pd.DataFrame(preprocessedQuery.loc[preprocessedQuery.index == 1,:]),10)

[429.0, 722.0, 589.0, 38.0, 603.0, 65.0, 1265.0, 813.0, 1299.0, 565.0]

In [98]:
tempdf = pd.DataFrame(preprocessedQuery.loc[preprocessedQuery.index == 1,:])
tempdf

Unnamed: 0_level_0,.T,.A,.W,.B
.I,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,,,problem concern make descript titl difficulti ...,


In [183]:
queriesPath = '../../cisiData/cisiQueryPreprocessed.csv'
queriesData = pd.read_csv(queriesPath)


def queryingData(qDataFrame:pd.DataFrame, n):
    result = pd.DataFrame()
    resultDict:dict = {}
    resultDictCopy = resultDict.copy()
    for i in qDataFrame.index:
        try:

            tempList:list = search(pd.DataFrame(preprocessedQuery.loc[preprocessedQuery.index == i+1,:]), n)
            for id in range(1,n+1):
                resultDictCopy[str(id)] = tempList[id - 1]
            result = result.append(resultDictCopy, ignore_index=True)
            resultDictCopy = resultDict.copy()
        except:
            print(i)
            raise
    return result


In [184]:
queriesResult = queryingData(queriesData, 10)
queriesResult.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,722.0,429.0,589.0,603.0,1299.0,1281.0,38.0,813.0,620.0,836.0
1,1138.0,1155.0,565.0,532.0,1096.0,790.0,1136.0,58.0,562.0,309.0
2,469.0,1179.0,1181.0,1133.0,445.0,85.0,599.0,540.0,1077.0,803.0
3,179.0,790.0,175.0,77.0,1224.0,565.0,1120.0,72.0,315.0,498.0
4,1038.0,1105.0,1166.0,122.0,710.0,1361.0,862.0,459.0,1136.0,1106.0


## Evaluation

### precision

In [179]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score

def reSizeLists(l1:list, l2:list):
    if len(l1) < len(l2):
        l2 = l2[0:len(l1)]
    while len(l1) > len(l2):
        l1 = l1[0:len(l2)]

    return l1, l2, len(l1)


def precWithoutOrder(l1:list,l2:list):
    try:
        return len(set(l1).intersection(set(l2))) / len(l2)
    except:
        return 0

def calcPrecisionAtK(resData:pd.DataFrame, qresData: pd.DataFrame):
    precisionsAtK:list = []
    precisionAtK:float
    for i in resData.index:
        precisionOnQuery = []
        resArray = resData.loc[i].to_numpy()
        qresArray = qresData.loc[qresData['.I'] == i+1, 'data'].to_numpy()
        resArray, qresArray, lenth = reSizeLists(resArray, qresArray)
        if len(qresArray) == 0: 
            continue
        
        # for lenI in range(lenth-1,lenth):
        #     tempRes:list = resArray[0:lenI+1].tolist()
        #     tempQRes:list = qresArray[0:lenI+1].tolist()
        #     precisionOnQuery.append(precision_score(qresArray, resArray, average='weighted', zero_division=0))
            
            # if i == 0:
            #     print(tempRes)
            #     print(tempQRes)
            #     print(precisionOnQuery)
        precisionOnQuery.append(precision_score(qresArray, resArray, average='micro'))
   
        try:
            precisionsAtK.append(sum(precisionOnQuery) / len(precisionOnQuery))
        except ZeroDivisionError: 
            precisionsAtK.append(0)
    precisionAtK = sum(precisionsAtK) / len(precisionsAtK)
    # print(precisionAtK)
    return precisionAtK

In [111]:
import pandas as pd
qrelsFrame = pd.read_csv('../../cisiData/cisiQRels.csv')

In [185]:
calcPrecisionAtK(queriesResult, qrelsFrame)

0.015789473684210527

In [None]:
qrelsFrame.head()

In [None]:


def getMaxNumberOfGoldenStandard(queriesData):
    n:int = 0
    for i in queriesData.index:
        tempN = len(qrelsFrame.loc[qrelsFrame['.I'] == i+1, 'data'].to_numpy())
        if n < tempN:
            n = tempN
    return n

def calcCisiDataPrecision(queriesData, qresData):
    n:int = getMaxNumberOfGoldenStandard(queriesData)
    resArray = queryingData(queriesData, n)
    precision = calcPrecisionAtK(resArray, qresData)
    return precision
resArray = calcCisiDataPrecision(queriesData, qrelsFrame)
resArray


