# CACM DATASET

## distribute data

### data info

In [10]:
CACMDATA = '../../../../cacm/cacm.all'
import re
import pandas as pd

IDMarker = re.compile('(\.I.)')
allMarkers = re.compile('(\.[ITWBACKNX] )')

### queries info

In [6]:
CACMQUERY = '../../../../cacm/query.text'
CACMQRELS = '../../../../cacm/qrels.text'
import re
queryMarkers = re.compile('(\.[IWAN] )')

In [3]:
def getData(PATH, marker):
    """get the data from the file and split it by pattern"""
    with open(PATH, 'r') as f:
        t = f.read().replace('\n', ' ')
        lines = re.split(marker, t)
        lines.pop(0)
    return lines

### converte CACM.all

In [7]:
cacmData = getData(CACMDATA, allMarkers)

In [12]:
def distributeCacmData(cacmData):
    ''' after doing getdata method on cacm.all this method converte it to dataframe'''
    dataFrame = pd.DataFrame(columns=['.I','.T','.W','.B','.A','.K','.C','.N','.X'])
    seriesDict:dict = {
        '.I': None,
        '.T': None,
        '.W': None,
        '.B': None,
        '.K': None,
        '.C': None,
        '.A': None,
        '.N': None,
        '.X': None
    }
    seriesData = seriesDict.copy()
    notTheFirst = False
    for i in range(0, len(cacmData), 2):
        if (notTheFirst and cacmData[i].strip() == '.I'):
            dataFrame = dataFrame.append(seriesData, ignore_index=True)
            seriesData = seriesDict.copy()

        seriesData[cacmData[i].strip()] = cacmData[i+1].strip()
        notTheFirst = True
    dataFrame = dataFrame.append(seriesData, ignore_index=True)
    return dataFrame

In [13]:
cacmCsvDis = distributeCacmData(cacmData)
cacmCsvDis.head()
# .to_csv('../../cacmData/cacmCsv.csv')

Unnamed: 0,.I,.T,.W,.B,.A,.K,.C,.N,.X
0,1,Preliminary Report-International Algebraic Lan...,,"CACM December, 1958","Perlis, A. J. Samelson,K.",,,"CA581203 JB March 22, 1978 8:28 PM",100\t5\t1 123\t5\t1 164\t5\t1 1\t5\t1 1\t5\t1 ...
1,2,Extraction of Roots by Repeated Subtractions f...,,"CACM December, 1958","Sugai, I.",,,"CA581202 JB March 22, 1978 8:29 PM",2\t5\t2 2\t5\t2 2\t5\t2
2,3,Techniques Department on Matrix Program Schemes,,"CACM December, 1958","Friedman, M. D.",,,"CA581201 JB March 22, 1978 8:30 PM",3\t5\t3 3\t5\t3 3\t5\t3
3,4,Glossary of Computer Engineering and Programmi...,,"CACM November, 1958",,,,"CA581103 JB March 22, 1978 8:32 PM",4\t5\t4 4\t5\t4 4\t5\t4
4,5,Two Square-Root Approximations,,"CACM November, 1958","Wadey, W. G.",,,"CA581102 JB March 22, 1978 8:33 PM",5\t5\t5 5\t5\t5 5\t5\t5


### converte query.text

In [15]:
cacmQuery = getData(CACMQUERY, queryMarkers)

In [14]:
def distributeCacmQueries(cacmQuery):
    ''' after doing getdata method on cacm.qry this method converte it to dataframe'''
    qDataFrame = pd.DataFrame(columns=['.I','.T','.W','.A','.N'])
    seriesDict:dict = {
        '.I': None,
        '.T': None,
        '.W': None,
        '.A': None,
        '.N': None
    }
    seriesData = seriesDict.copy()
    notTheFirst = False
    for i in range(0, len(cacmQuery), 2):
        if (notTheFirst and cacmQuery[i].strip() == '.I'):
            qDataFrame = qDataFrame.append(seriesData, ignore_index=True)
            seriesData = seriesDict.copy()

        seriesData[cacmQuery[i].strip()] = cacmQuery[i+1].strip()
        notTheFirst = True
    qDataFrame = qDataFrame.append(seriesData, ignore_index=True)
    return qDataFrame

In [16]:
cacmQueryCsvDis = distributeCacmData(cacmQuery)
cacmQueryCsvDis.head()
# qDataFrame.to_csv('../../cacmData/cacmQueryCsv.csv')

Unnamed: 0,.I,.T,.W,.B,.A,.K,.C,.N,.X
0,1,,What articles exist which deal with TSS (Time ...,,,,,"1. Richard Alexander, Comp Serv, Langmuir Lab ...",
1,2,,I am interested in articles written either by ...,,"Prieve, B. Pooch, U.",,,"2. Richard Alexander, Comp Serv, Langmuir Lab ...",
2,3,,Intermediate languages used in construction of...,,,,,"3. Donna Bergmark, Comp Serv, Uris Hall (inter...",
3,4,,I'm interested in mechanisms for communicating...,,,,,4. Pavel Curtis (comm mech for disjoint proces...,
4,5,,I'd like papers on design and implementation o...,,,,,5. Pavel Curtis (editing interfaces),


### converte qrels.text

In [17]:
import pandas as pd

def getRles(path):
    with open(path, 'r') as f:
        global qrlesList
        qrlesList = f.read().split('\n')
        return qrlesList

qrelsData = getRles(CACMQRELS)
qrelsFrame = pd.DataFrame(columns=['.I', 'data'])
seriesDict:dict = {'.I':None, 'data':None}
seriesData = seriesDict.copy()
for i in qrelsData:
    try:
        element = i.split(' ')
        seriesData['.I'] = int(element[0])
        seriesData['data'] = int(element[1])
        qrelsFrame = qrelsFrame.append(seriesData, ignore_index=True)
    except:
        pass
qrelsFrame.head()

Unnamed: 0,.I,data
0,1,1410
1,1,1572
2,1,1605
3,1,2020
4,1,2358


In [30]:
qrelsFrame.to_csv('../../cacmData/cacmQRels.csv')

## clean preproccesing

### CACM

In [1]:
import pandas as pd
df:pd.DataFrame = pd.read_csv('../../cacmData/cacmCsv.csv', index_col='.I')
df.head()

Unnamed: 0_level_0,.T,.W,.B,.A,.K,.C,.N,.X
.I,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Preliminary Report-International Algebraic Lan...,,"CACM December, 1958","Perlis, A. J. Samelson,K.",,,"CA581203 JB March 22, 1978 8:28 PM",100\t5\t1 123\t5\t1 164\t5\t1 1\t5\t1 1\t5\t1 ...
2,Extraction of Roots by Repeated Subtractions f...,,"CACM December, 1958","Sugai, I.",,,"CA581202 JB March 22, 1978 8:29 PM",2\t5\t2 2\t5\t2 2\t5\t2
3,Techniques Department on Matrix Program Schemes,,"CACM December, 1958","Friedman, M. D.",,,"CA581201 JB March 22, 1978 8:30 PM",3\t5\t3 3\t5\t3 3\t5\t3
4,Glossary of Computer Engineering and Programmi...,,"CACM November, 1958",,,,"CA581103 JB March 22, 1978 8:32 PM",4\t5\t4 4\t5\t4 4\t5\t4
5,Two Square-Root Approximations,,"CACM November, 1958","Wadey, W. G.",,,"CA581102 JB March 22, 1978 8:33 PM",5\t5\t5 5\t5\t5 5\t5\t5


In [None]:
df.describe()

In [None]:
df.info()

In [2]:
print(df.loc[:,'.T'].isnull().value_counts(), end='\n\n')
print(df.loc[:,'.W'].isnull().value_counts(), end='\n\n')
print(df.loc[:,'.A'].isnull().value_counts())

False    3203
True        1
Name: .T, dtype: int64

True     1617
False    1587
Name: .W, dtype: int64

False    3120
True       84
Name: .A, dtype: int64


In [3]:
df.fillna('', inplace=True)

In [None]:
df.duplicated().value_counts()

#### clean wrong values in .N

In [4]:
lis = df.loc[:,'.N'].to_list()
import re
lis = [s.replace(',',' ') for s in lis]
ids = []


pattern = r'[A-Z][a-z]+ +[0-9][0-9]? +[0-9]{4} +[0-9][0-9]?:[0-9][0-9]? +(PM|AM)'
for i in range(0,len(lis)):
    if re.search(pattern, lis[i]) == None:
        ids.append(i)


removePattern = r'CA[0-9]* ?[A-Z]{2} +[A-Z][a-z]+ +[0-9][0-9]? +[0-9]{4}'
for i in ids:
    lis[i] = re.findall(removePattern, lis[i])[0]

df.loc[:, '.N'] = lis

In [5]:
df.to_csv('../../cacmData/cacmDataCleaned.csv')

#### methods

In [19]:

def toLower(text):
    ''' Convert text to lower case'''
    return text.lower()



################################################################
import inflect
p = inflect.engine()

import re
reg = r'([0-9]+)'

def isFLoat(strNum):
    '''converte float number to word'''
    try:
        float(strNum)
        return True
    except:
        return False


def converteNumbers(text):
    ''' Convert texnumbers to words'''
    tempText = text.split()
    newText = []
    for word in tempText:
        tempList = re.split(reg,word)
        for miniWord in tempList:
            if miniWord.isdigit() or isFLoat(miniWord):
                temp = p.number_to_words(miniWord)
                newText.append(removePunctuation(temp))
            else:
                newText.append(miniWord)        
    tempText = ' '.join(newText)
    return tempText

################################################################

import string
translator = str.maketrans(string.punctuation,' '*len(string.punctuation))
def removePunctuation(text):
    ''' remove punctuation from text'''
    global translator
    return text.translate(translator)

################################################################

def removeWhiteSpace(text):
    '''remove whitespace from text'''
    return " ".join(text.split())

################################################################

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

def removeStopWords(text):
    ''' remove stopwords from text'''
    sw = set(stopwords.words("english"))
    wt = word_tokenize(text)
    filteredText = [word for word in wt if word not in sw]
    return ' '.join(filteredText)

################################################################

import numpy as np
def removeOutliers(tokens):
    '''remove outliers from wach document'''
    listOfTokens = tokens.split()
    fdist = FreqDist(word for word in listOfTokens)
    fdistKeys = np.array(list(fdist.keys()))
    freqOfWords = [fdist.freq(x) for x in fdistKeys]

    q1, q3 = np.percentile(freqOfWords, [25, 75])
    IQR = q3 - q1
    AVG = np.mean(freqOfWords)
    AvgRelValue = round(AVG * len(listOfTokens))
    Q1RelValue = round(q1* len(listOfTokens))
    Q3RelValue = round(q3 * len(listOfTokens))
    st = ' '.join(listOfTokens)

    for i in range(0,len(freqOfWords)):
        if freqOfWords[i] < q1 - 1.5*IQR:
            wordRelValue = round(freqOfWords[i] * len(listOfTokens))
            sub = Q1RelValue - wordRelValue
            word = fdistKeys[i]
            stForAppend = (' '+word+' ') * sub
            st = st + stForAppend

        if freqOfWords[i] > q3 + 1.5*IQR:
            wordRelValue = round(freqOfWords[i] * len(listOfTokens))
            sub = wordRelValue - Q3RelValue
            word = fdistKeys[i]
            st = st.replace(word, '', sub)
    
    return removeWhiteSpace(st)

################################################################

def addMostFreq(tokens):
    '''add words to empty title'''
    listOfTokens = tokens.split()
    fdist = FreqDist(word for word in listOfTokens)
    fdistKeys = np.array(list(fdist.keys()))
    freqOfWords = [fdist.freq(x) for x in fdistKeys]

    AVG = np.mean(freqOfWords)
    stlis = []

    for i in range(0,len(freqOfWords)):
        if freqOfWords[i] > AVG:
            stlis.append(fdistKeys[i])

    
    return removeWhiteSpace(' '.join(stlis))

################################################################

from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()
def stemWords(text):
    ''' stemm words'''
    global stemmer
    wt = word_tokenize(text)
    stems = []
    for word in wt:
        temp = stemmer.stem(word)
        stems.append(temp)
    return ' '.join(stems)

################################################################

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from nltk import pos_tag, defaultdict

lemmatizer = WordNetLemmatizer()

tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

def lemmatizeWords(text):
    ''' lemmatize words'''
    tokens = word_tokenize(text)
    lmtzr = WordNetLemmatizer()
    lemmas = [lmtzr.lemmatize(token, tag_map[tag[0]]) for token, tag in pos_tag(tokens) ]
    return ' '.join(lemmas)

################################################################

from nltk.metrics.distance import jaccard_distance

from nltk.util import ngrams
from nltk.corpus import words

correct_words = words.words()
result = []

def correctWords(text):
    for word in text.split():
        try:
            temp = [(jaccard_distance(set(ngrams(word, 2)),
                                      set(ngrams(w, 2))),w)
                                      for w in correct_words if w[0] == word[0]]
            result.append(sorted(temp, key = lambda val:val[0])[0][1])
        except:
            pass

    return ' '.join(result)

################################################################

#### cacm process

In [20]:

def TitlePreProcesse(t):
    '''do preprocess methods on titles'''
    tempText = toLower(t)
    tempText = removePunctuation(tempText)
    tempText = converteNumbers(tempText)
    tempText = removeWhiteSpace(tempText)
    tempText = removeStopWords(tempText)
    tempText = stemWords(tempText)
    tempText = lemmatizeWords(tempText)

    return tempText

########################################################################

def abstractPreProcesse(a):
    '''do preprocess methods on abstract'''
    tempText = toLower(a)
    tempText = removePunctuation(tempText)
    tempText = converteNumbers(tempText)
    tempText = removeWhiteSpace(tempText)
    tempText = removeStopWords(tempText)
    tempText = stemWords(tempText)
    tempText = lemmatizeWords(tempText)
    tempText = removeOutliers(tempText)

    return tempText

########################################################################

def authorPreProcesse(a):
    '''do preprocess methods on authors'''
    tempText = toLower(a)
    lis = tempText.split(' ')
    names = ' '
    l = []
    for word in lis:
      if ',' in word:
          l.append(removePunctuation(word))
    names = ' '.join(l)

    return names

########################################################################

def publicationPreProcesse(p):
    try:
        tempText = p.replace('CACM ','')
        return pd.to_datetime(tempText)
    except:
        return ''

########################################################################

def preprocessedCacmData(dataFrame:pd.DataFrame):
    '''take pandas dataFrame with coulmns = {.I, .T, .A, .W, .B} which contain Cacm data and preprocess it'''
    pdataFrame = pd.DataFrame()
    seriesDict:dict = {} 
    seriesData = seriesDict.copy()
    for i in dataFrame.index:
        try:
            tempT = tempA = tempW  =tempB = None
            if not dataFrame.loc[i, '.T'] == '':
                tempT = TitlePreProcesse(dataFrame.loc[i, '.T'])
            if not dataFrame.loc[i, '.A'] == '':
                tempA = authorPreProcesse(dataFrame.loc[i, '.A'])
            if not dataFrame.loc[i, '.W'] == '':
                tempW = abstractPreProcesse(dataFrame.loc[i, '.W'])
            if not dataFrame.loc[i, '.B'] == '':
                tempB = publicationPreProcesse(dataFrame.loc[i, '.B'])



            seriesData['.I'] = i+1
            seriesData['.T'] = tempT
            seriesData['.A'] = tempA
            seriesData['.W'] = tempW
            seriesData['.B'] = tempB
            pdataFrame = pdataFrame.append(seriesData, ignore_index=True)
        except:
            print(i)
            raise 
    
    pdataFrame.fillna('', inplace=True)
    return pdataFrame


In [None]:
# import pandas as pd
# data = pd.read_csv('../../cacmData/cacmDataCleaned.csv')
# data.fillna('', inplace=True)
# data.head()

In [21]:
cacmCsvDis.fillna('', inplace=True)
processedDAta = preprocessedCacmData(cacmCsvDis)
processedDAta.head()

Unnamed: 0,.I,.T,.A,.W,.B
0,1.0,preliminari report intern algebra languag,perlis samelson k,,1958-12-01
1,2.0,extract root repeat subtract digit comput,sugai,,1958-12-01
2,3.0,techniqu depart matrix program scheme,friedman,,1958-12-01
3,4.0,glossari comput engin program terminolog,,,1958-11-01
4,5.0,two squar root approxim,wadey,,1958-11-01


In [22]:
processedDAta.to_csv('../../cacmData/cacmDataPre.csv')

#### query process

In [23]:

def qTitlePreProcesse(t):
    '''do preprocess methods on query titles'''
    tempText = toLower(t)
    tempText = removePunctuation(tempText)
    tempText = converteNumbers(tempText)
    tempText = removeWhiteSpace(tempText)
    tempText = removeStopWords(tempText)
    tempText = stemWords(tempText)
    tempText = lemmatizeWords(tempText)

    return tempText

########################################################################

def qAbstractPreProcesse(a):
    '''do preprocess methods on query abstract'''
    tempText = toLower(a)
    tempText = removePunctuation(tempText)
    tempText = converteNumbers(tempText)
    tempText = removeWhiteSpace(tempText)
    tempText = removeStopWords(tempText)
    tempText = stemWords(tempText)
    tempText = lemmatizeWords(tempText)
    tempText = removeOutliers(tempText)
    return tempText

########################################################################

def qAuthorPreProcesse(a):
    '''do preprocess methods on query authors'''
    tempText = toLower(a)
    lis = tempText.split(' ')
    names = ' '
    l = []
    for word in lis:
      if ',' in word:
          l.append(removePunctuation(word))
    names = ' '.join(l)
    return names

########################################################################

def preprocesseQuery(dataFrame:pd.DataFrame):
    pdataFrame = pd.DataFrame() 
    seriesDict:dict = {} 
    seriesData = seriesDict.copy()
    for i in dataFrame.index:
        try:
            tempT = tempA = tempW  = None
            if not dataFrame.loc[i, '.T'] == '':
                tempT = qTitlePreProcesse(dataFrame.loc[i, '.T'])
            if not dataFrame.loc[i, '.A'] == '':
                tempA = qAuthorPreProcesse(dataFrame.loc[i, '.A'])
            if not dataFrame.loc[i, '.W'] == '':
                tempW = qAbstractPreProcesse(dataFrame.loc[i, '.W'])
                if dataFrame.loc[i, '.T'] == '':
                    tempT = addMostFreq(tempW)
                    

                
            seriesData['.I'] = i+1
            seriesData['.T'] = tempT
            seriesData['.A'] = tempA
            seriesData['.W'] = tempW
            pdataFrame = pdataFrame.append(seriesData, ignore_index=True)
        except:
            print(i)
            raise 
    
    pdataFrame.fillna('', inplace=True)
    return pdataFrame



In [None]:
# import pandas as pd
# querydf = pd.read_csv('../../cacmData/cacmQueryCsv.csv')
# querydf.set_index('.I', inplace=True)
# querydf.fillna('', inplace=True)
# querydf.head()

In [24]:
cacmQueryCsvDis.fillna('', inplace=True)
preprocessedQuery = preprocesseQuery(cacmQueryCsvDis)
preprocessedQuery.head()

Unnamed: 0,.I,.T,.A,.W
0,1.0,,,articl exist deal tss time share oper system i...
1,2.0,interest articl write either priev udo pooch,prieve pooch,interest articl write either priev udo pooch
2,3.0,,,intermedi languag use construct multi target c...
3,4.0,commun disjoint process possibl exclus distrib...,,commun disjoint process possibl exclus distrib...
4,5.0,,,like paper implement edit window manag command...


In [25]:
preprocessedQuery.to_csv('../../cacmData/cacmQueryPre.csv')

## index model

In [114]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import  cosine_similarity
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline


########################################################################

transformer = None
tfidfTable  = None
def initializeTfidfTable(data: pd.DataFrame):
    ''' put cacm data in pipelinethen fit and transform it and return tfidf Table'''
    global transformer, tfidfTable
    transformer = FeatureUnion([
                      ('title_tfidf', 
                      Pipeline([
                        ('extract_field',
                                  FunctionTransformer(lambda x: x['.T'], 
                                                      validate=False)),
                                ('tfidf', 
                                  TfidfVectorizer(norm='l2' ,ngram_range=(1,2)))]))                              
                      ,('abstract_tfidf',
                     Pipeline([('extract_field',
                                FunctionTransformer(lambda x: x['.W'],
                                                      validate=False)),
                                ('tfidf',
                                  TfidfVectorizer(norm='l1',ngram_range=(1,2)))]))
                    ,('author_tfidf', 
                      Pipeline([('extract_field', 
                                  FunctionTransformer(lambda x: x['.A'], 
                                                      validate=False)),
                                ('tfidf', 
                                  TfidfVectorizer(norm='l1'))]))
    ])
    tfidfTable = transformer.fit_transform(data)

########################################################################

def getSimilars(query):
    ''' get the most n similar documents'''
    global transformer, tfidfTable
    querytfidf = transformer.transform(query)

    return cosine_similarity(querytfidf,tfidfTable).flatten()
    

########################################################################

def queryingData(qDataFrame:pd.DataFrame,data:pd.DataFrame, n):
    ''' search for all queries in the queries file and get the most n similar document .I'''
    result = pd.DataFrame()
    for i in qDataFrame.index:
        try:
            resultDict:dict = {}
            tempIds:list = getSimilars(pd.DataFrame(qDataFrame.loc[qDataFrame.index == i,:]), n)

            tempList = []
            for id in tempIds:
                tempList.append(data.loc[id,'.I'])

            for id in range(1,n+1):
                resultDict[str(id)] = tempList[id - 1]
            result = result.append(resultDict, ignore_index=True)
        except:
            print(i)
            raise
    return result

########################################################################




In [153]:
def absSub(a,b):
    return abs(a-b)

def search(qDF:pd.DataFrame,data:pd.DataFrame, n):
    ''' search for input and return list of ids of the result'''
    try:
        resultlis = []
        similars = getSimilars(qDF)
        
        tempIds = similars.argsort(axis=0)[-n:][::-1]

        if not qDF.loc[0, '.B'] == '':
            tempFrame = pd.DataFrame()
            for id in tempIds:
                tempFrame = tempFrame.append(data.loc[id,['.I','.B']])

            tempFrame.sort_values(by=['.B']\
                ,key=lambda x: absSub(x, pd.to_datetime(qDF.loc[0, '.B']))\
                    , inplace=True)
            
            return tempFrame.loc[:, '.I'].to_list()

        for id in tempIds:
            resultlis.append(data.loc[id,'.I'])

        return resultlis
    except:
        raise

########################################################################

In [116]:
initializeTfidfTable(data=processedDAta)

## Evaluation

### precision

In [28]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score

def reSizeLists(l1:list, l2:list):
    '''resize lists to have the same len'''
    if len(l1) < len(l2):
        l2 = l2[0:len(l1)]
    while len(l1) > len(l2):
        l1 = l1[0:len(l2)]

    return l1, l2

########################################################################

def precWithoutOrder(l1:list,l2:list):
    ''' calculate precision witout orering'''
    try:
        return len(set(l1).intersection(set(l2))) / len(l2)
    except:
        return 0

########################################################################

def calcMAPrecisionAtK(resData:pd.DataFrame, qrelsData: pd.DataFrame):
    '''calcualte MAP (average precision on multiple queries)'''
    precisionsAtK:list = []
    precisionAtK:float

    for i in resData.index:
        precisionOnQuery = []
        resArray = resData.loc[i].to_numpy()
        qresArray = qrelsData.loc[qrelsData['.I'] == i+1, 'data'].to_numpy()
        
        if len(qresArray) == 0: 
            continue

        resArray, qresArray = reSizeLists(resArray, qresArray)

        for lenI in range(0,len(qresArray)):
            tempRes:list = resArray[0:lenI+1].tolist()
            tempQRes:list = qresArray[0:lenI+1].tolist()
            precisionOnQuery.append(precision_score(tempQRes, tempRes, average='micro'))
        
        try:
            precisionsAtK.append(sum(precisionOnQuery) / len(precisionOnQuery))
        except ZeroDivisionError: 
            precisionsAtK.append(0)

    precisionAtK = sum(precisionsAtK) / len(precisionsAtK)
    return precisionAtK



########################################################################

def calcAveragePrecisionAtK(resData:pd.DataFrame, qrelsData: pd.DataFrame):
    '''calcualte just Average Precision for one query or more'''
    precisionsAtK:list = []

    for i in resData.index:
        precisionOnQuery = []

        resArray = resData.loc[i].to_numpy()
        qresArray = qrelsData.loc[qrelsData['.I'] == i+1, 'data'].to_numpy()
        
        if len(qresArray) == 0: 
            continue

        resArray, qresArray = reSizeLists(resArray, qresArray)

        for lenI in range(0,len(qresArray)):

            tempRes:list = resArray[0:lenI+1].tolist()
            tempQRes:list = qresArray[0:lenI+1].tolist()
            precisionOnQuery.append(precision_score(tempQRes, tempRes, average='micro'))

        try:
            precisionsAtK.append(sum(precisionOnQuery) / len(precisionOnQuery))
        except ZeroDivisionError: 
            precisionsAtK.append(0)
    return precisionsAtK

########################################################################


def calcPrecisionAtK(resData:pd.DataFrame, qrelsData: pd.DataFrame):
    '''calcualte just Precision for one query or more'''
    precisionsAtK:list = []

    for i in resData.index:
        resArray = resData.loc[i].to_numpy()
        qresArray = qrelsData.loc[qrelsData['.I'] == i+1, 'data'].to_numpy()
        
        if len(qresArray) == 0: 
            continue

        resArray, qresArray = reSizeLists(resArray, qresArray)
        precisionsAtK.append(precision_score(qresArray, resArray, average='micro'))

    return sum(precisionsAtK) / len(precisionsAtK)

########################################################################

def calcRecallAtK(resData:pd.DataFrame, qrelsData: pd.DataFrame):
    '''calcualte just recall for one query or more'''
    recallsAtK:list = []

    for i in resData.index:

        resArray = resData.loc[i].to_numpy()
        qresArray = qrelsData.loc[qrelsData['.I'] == i+1, 'data'].to_numpy()
        
        if len(qresArray) == 0: 
            continue

        resArray, qresArray = reSizeLists(resArray, qresArray)
        recallsAtK.append(recall_score(qresArray, resArray, average='micro'))

    return sum(recallsAtK) / len(recallsAtK)

########################################################################

def calcMeanReciprocalRank(resData:pd.DataFrame, qrelsData: pd.DataFrame):

    resultDataFrame = pd.DataFrame(columns=['.I','data', 'rank'])


    for row in resData.index:
        temp = resData.loc[row].to_list()
        ke = []
        rank = []
        for i in range(0,len(temp)):
            ke.append(row + 1)
            rank.append(i+1)
        lot = zip(ke,temp, rank)
        tempdata = pd.DataFrame(lot, columns=['.I','data', 'rank'])

        resultDataFrame = resultDataFrame.append(tempdata, ignore_index=True)

    
    MAX_RANK = 100000

    hits = pd.merge(qrelsData, resultDataFrame,
        on=[".I", "data"],
        how="left").fillna(MAX_RANK)

    mrr = (1 / hits.groupby('.I')['rank'].min()).mean()

    return mrr














########################################################################

def calcMAPrecisionAtKOrder(resData:pd.DataFrame, qrelsData: pd.DataFrame):
    '''calcualte MAP (average precision on multiple queries) without order'''
    precisionsAtK:list = []
    precisionAtK:float

    for i in resData.index:
        resArray = resData.loc[i].to_numpy()
        qresArray = qrelsData.loc[qrelsData['.I'] == i+1, 'data'].to_numpy()
        
        if len(qresArray) == 0: 
            continue

        resArray, qresArray = reSizeLists(resArray, qresArray)

        prec = precWithoutOrder(qresArray, resArray)

        precisionsAtK.append(prec)

    precisionAtK = sum(precisionsAtK) / len(precisionsAtK)
    return precisionAtK




In [33]:
import pandas as pd
cacmQRELAfCl = pd.read_csv('../../cacmData/cacmQRels.csv')

In [34]:
n=10
dataPD = queryingData(preprocessedQuery, processedDAta, n)
precisionAtN = calcPrecisionAtK(dataPD, cacmQRELAfCl)
recallAtN = calcRecallAtK(dataPD, cacmQRELAfCl)
meanAveragePrecision = calcMAPrecisionAtK(dataPD, cacmQRELAfCl)
meanReciprocalRank = calcMeanReciprocalRank(dataPD, cacmQRELAfCl)

print(f'precacmon@{n} : {precisionAtN}')
print(f'recall@{n} : {recallAtN}')
print(f'MAP : {meanAveragePrecision}')
print(f'MRR : {meanReciprocalRank}')


precacmon@10 : 0.057692307692307696
recall@10 : 0.057692307692307696
MAP : 0.0665583028083028
MRR : 0.45747350122100117


In [154]:

def preprocesseSearchInput(dataDic) -> pd.DataFrame:
    psi = pd.DataFrame()
    seriesDict:dict = {} 

    data = dataDic.get('query')

    tempI = 1
    tempW = qAbstractPreProcesse(data)
    tempT = addMostFreq(tempW)
    tempA = ''
    try:
        tempB = pd.to_datetime(dataDic.get('date'))
    except:
        tempB = ''

    seriesDict['.I'] = tempI
    seriesDict['.T'] = tempT
    seriesDict['.A'] = tempA
    seriesDict['.W'] = tempW
    seriesDict['.B'] = tempB
   
    psi = psi.append(seriesDict, ignore_index=True)
    psi.fillna('', inplace=True)

    return psi

def resultToDict(resultDict, resultIds):
    for i in range(0,len(resultIds)):
        temp = processedDAta.loc[processedDAta['.I'] == resultIds[i],\
             ['.T', '.A', '.W', '.B']].to_dict()

        tk = list(temp.keys())
        for sk in tk:
            try:
                k = list(temp[sk].keys())
                temp[sk] = temp[sk][k[0]]
            except:
                continue

        resultDict['reslutDictionary']['result'][i] = temp

    return resultDict

    
def searchInput(data):
    dataPD: pd.DataFrame = preprocesseSearchInput(data)
    resultIds = search(dataPD, processedDAta, data.get('n'))
    resultDict = {
        'reslutDictionary':{
            'result':{},
        }
    }
    return resultToDict(resultDict, resultIds)

In [155]:
d = {
    "query":"information",
    "n":10,
    "date":""
}
searchInput(d)

{'reslutDictionary': {'result': {0: {'.T': 'subroutin assembl',
    '.A': 'samet ',
    '.W': 'descript give assembl system requir one pas maintain tabl inform subroutin librari',
    '.B': Timestamp('1965-01-01 00:00:00')},
   1: {'.T': 'american standard ifip icc vocabulari compar',
    '.A': 'traub ',
    '.W': 'propos american standard ifip icc vocabulari term use inform process analyz compar',
    '.B': Timestamp('1965-06-01 00:00:00')},
   2: {'.T': 'convent use symbol prepar flowchart inform process system standard work paper',
    '.A': '',
    '.W': 'paper intend outlin variou consid inform process system convent appli use appear propos american standard flowchart symbol per se',
    '.B': Timestamp('1965-07-01 00:00:00')},
   3: {'.T': 'protect inform process util',
    '.A': 'graham ',
    '.W': 'critic design process util permit flexibl share user inform privaci one solut problem discus',
    '.B': Timestamp('1968-05-01 00:00:00')},
   4: {'.T': 'perform system use data tra

In [156]:
d = {
    "query":"information",
    "n":10,
    "date":"1965-06-01"
}
searchInput(d)

{'reslutDictionary': {'result': {0: {'.T': 'american standard ifip icc vocabulari compar',
    '.A': 'traub ',
    '.W': 'propos american standard ifip icc vocabulari term use inform process analyz compar',
    '.B': Timestamp('1965-06-01 00:00:00')},
   1: {'.T': 'convent use symbol prepar flowchart inform process system standard work paper',
    '.A': '',
    '.W': 'paper intend outlin variou consid inform process system convent appli use appear propos american standard flowchart symbol per se',
    '.B': Timestamp('1965-07-01 00:00:00')},
   2: {'.T': 'perform system use data transmiss transfer rate inform bit asa tutori standard',
    '.A': '',
    '.W': 'thruput characterist system perform discuss discus includ pertin aspect transfer determin transfer bit trib residu error standard measur condit paper also present orderli arrang characterist paramet affect inform thruput exampl procedur determin thruput term trib conclud perform characterist involv inform rate best express trib co

In [152]:
tempFrame = pd.DataFrame()
tempFrame = tempFrame.append(processedDAta.loc[processedDAta['.I'] == 1, :])
pd.to_datetime(tempFrame.loc[0, '.B'])


Timestamp('1958-12-01 00:00:00')