## distribute data

### data info

In [None]:
CISIDATA = '../../../../CISI/CISI.ALL'
import re
IDMarker = re.compile('(\.I.)')
allMarkers = re.compile('(\.[ITABWX] )')

### queries info

In [None]:
CISIQUERY = '../../../../CISI/CISI.QRY'
CISIQRELS = '../../../../CISI/CISI.REL'
import re
queryMarkers = re.compile('(\.[ITAWB] )')

In [None]:
def getData(PATH, marker):
    """get the data from the file and split it by ID"""
    with open(PATH, 'r') as f:
        t = f.read().replace('\n', ' ')
        lines = re.split(marker, t)
        lines.pop(0)
    return lines

### converte CISI.ALL

In [None]:
cisiData = getData(CISIDATA, allMarkers)

In [None]:
import pandas as pd
dataFrame = pd.DataFrame(columns=['.I','.T','.A','.B','.W','.X'])
seriesDict:dict = {
    '.I': None,
    '.T': None,
    '.A': None,
    '.B': None,
    '.W': None,
    '.X': None
}
seriesData = seriesDict.copy()
notTheFirst = False
for i in range(0, len(cisiData), 2):
    if (notTheFirst and cisiData[i].strip() == '.I'):
        dataFrame = dataFrame.append(seriesData, ignore_index=True)
        seriesData = seriesDict.copy()
    
    seriesData[cisiData[i].strip()] = cisiData[i+1].strip()
    notTheFirst = True
dataFrame = dataFrame.append(seriesData, ignore_index=True)
dataFrame.head()

In [None]:
dataFrame.to_csv('../../cisiData/cisiCsv.csv')

### converte query.text

In [None]:
cisiQuery = getData(CISIQUERY, queryMarkers)

In [None]:
import pandas as pd
qDataFrame = pd.DataFrame(columns=['.I','.T','.A','.W','.B'])
seriesDict:dict = {
    '.I': None,
    '.T': None,
    '.A': None,
    '.W': None,
    '.B': None
}
seriesData = seriesDict.copy()
notTheFirst = False
for i in range(0, len(cisiQuery), 2):
    if (notTheFirst and cisiQuery[i].strip() == '.I'):
        qDataFrame = qDataFrame.append(seriesData, ignore_index=True)
        seriesData = seriesDict.copy()
    
    seriesData[cisiQuery[i].strip()] = cisiQuery[i+1].strip()
    notTheFirst = True
qDataFrame = qDataFrame.append(seriesData, ignore_index=True)
qDataFrame.head()

In [None]:
qDataFrame.to_csv('../../cisiData/cisiQueryCsv.csv')

### converte qrels.text

In [None]:
import pandas as pd

def getRles(path):
    with open(path, 'r') as f:
        global qrlesList
        qrlesList = f.read().split('\n')
        return qrlesList

qrelsData = getRles(CISIQRELS)
qrelsFrame = pd.DataFrame(columns=['.I', 'data'])
seriesDict:dict = {'.I':None, 'data':None}
seriesData = seriesDict.copy()
for i in qrelsData:
    try:
        element = i.split()
        seriesData['.I'] = int(element[0])
        seriesData['data'] = int(element[1])
        qrelsFrame = qrelsFrame.append(seriesData, ignore_index=True)
    except:
        pass
qrelsFrame.head()

In [None]:
qrelsFrame.to_csv('../../cisiData/cisiQRels.csv')

## clean preproccesing

### CISI

In [None]:
import pandas as pd
df:pd.DataFrame = pd.read_csv('../../cisiData/cisiCsv.csv', index_col=[0])
df.head()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
print(df.loc[:,'.T'].isnull().value_counts(), end='\n\n')
print(df.loc[:,'.W'].isnull().value_counts(), end='\n\n')
print(df.loc[:,'.A'].isnull().value_counts())

In [None]:
df.fillna('', inplace=True)

In [None]:
df.duplicated().value_counts()

In [None]:
df.to_csv('../../cisiData/cisiDataCleaned.csv')

#### methods

##### lowercase

In [None]:
def toLower(text):
    return text.lower()

##### Numbers to words

In [None]:
import inflect
p = inflect.engine()

import re
reg = r'([0-9]+)'

def isFLoat(strNum):
    try:
        float(strNum)
        return True
    except:
        return False


def converteNumbers(text):
    tempText = text.split()
    newText = []
    for word in tempText:
        tempList = re.split(reg,word)
        for miniWord in tempList:
            if miniWord.isdigit() or isFLoat(miniWord):
                temp = p.number_to_words(miniWord)
                newText.append(removePunctuation(temp))
            else:
                newText.append(miniWord)        
    tempText = ' '.join(newText)
    return tempText
    # return text

##### remove punctuation

In [None]:
import string
translator = str.maketrans(string.punctuation,' '*len(string.punctuation))
def removePunctuation(text):
    global translator
    return text.translate(translator)


##### remove whitespaces

In [None]:
def removeWhiteSpace(text):
    return " ".join(text.split())

##### remove stop words

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist



def removeStopWords(text):
    sw = set(stopwords.words("english"))
    wt = word_tokenize(text)
    filteredText = [word for word in wt if word not in sw]
    return ' '.join(filteredText)
    # return text

##### calculate Frequency

In [None]:
import numpy as np
def calcFreq(tokens):
    listOfTokens = tokens.split()
    fdist = FreqDist(word for word in listOfTokens)
    fdistKeys = np.array(list(fdist.keys()))
    freqOfWords = [fdist.freq(x) for x in fdistKeys]

    q1, q3 = np.percentile(freqOfWords, [25, 75])
    # q3 = np.percentile(freqOfWords, 75, interpolation='midpoint')
    IQR = q3 - q1
    AVG = np.mean(freqOfWords)
    AvgRelValue = round(AVG * len(listOfTokens))
    Q1RelValue = round(q1* len(listOfTokens))
    Q3RelValue = round(q3 * len(listOfTokens))
    st = ' '.join(listOfTokens)

    for i in range(0,len(freqOfWords)):
        if freqOfWords[i] < q1 - 1.5*IQR:
            wordRelValue = round(freqOfWords[i] * len(listOfTokens))
            sub = Q1RelValue - wordRelValue
            word = fdistKeys[i]
            stForAppend = word * sub
            st = st + stForAppend

        if freqOfWords[i] > q3 + 1.5*IQR:
            wordRelValue = round(freqOfWords[i] * len(listOfTokens))
            sub = wordRelValue - Q3RelValue
            word = fdistKeys[i]
            st = st.replace(word, '', sub)
    
    return removeWhiteSpace(st)



In [None]:
st = '''note pseudo mathemat relev taube recent number articl book report deal inform system e document retriev system advanc doctrin system evalu term degre percentag relev provid although seem littl agreement relev mean doubt quantifi nevertheless grow agreement fix formal relationship exist relev recal perform system thu find literatur frankli subject notion relev report individu user equat curv mathemat formul presum provid numer measur recal relev characterist inform system phenomenon shift back forth admittedli subject non mathemat term equat term give mathemat valu mathemat definit ancient parallel discus probabl one cours legisl mean term depend alic point master user term hand use singl term document cover two distinct mean especi usag design secur accept doctrin attribut mathemat valid repres seriou situat mere careless ambigu'''
qt = '''problem concern make descript titl difficulti involv automat retriev articl approxim titl usual relev content articl titl'''
calcFreq(st)

In [None]:
# import numpy as np


# def checkOutLiers(lineOfPercent):
#     try:
#         q1 = np.percentile(lineOfPercent, 25, interpolation='midpoint')
#         q3 = np.percentile(lineOfPercent, 75, interpolation='midpoint')
#         IQR = q3 - q1
#         AVG = np.mean(lineOfPercent)
#         lineAfterRemoveOutLiers = []
#         for i in lineOfPercent:
#             if i < q1 - 1.5*IQR:
#                 lineAfterRemoveOutLiers.append(AVG)
#             elif i > q3 + 1.5*IQR:
#               lineAfterRemoveOutLiers.append(AVG)
#             else:
#              lineAfterRemoveOutLiers.append(i)

#         return lineAfterRemoveOutLiers
#     except:
#         return lineOfPercent



# def checkCoverage(queryTokens) -> list:
#     covDoc = []

#     qTokens = np.array(queryTokens)
#     for doc in wordsFreq:
#         WordsList = np.array(list(doc.keys()))
#         common = np.intersect1d(qTokens, WordsList)
#         freqOfWords = [doc.freq(x) for x in common]
#         common = checkOutLiers(freqOfWords)
#         try:
#             covDoc.append(sum(common))
#         except:
#             covDoc.append(0)
    
#     sortedList = np.array(covDoc).argsort(axis=0)[::-1]
#     return sortedList

# # checkCoverage(queryTokens=['saher', 'fatima', 'man', 'hello'])

##### stemming

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()
def stemWords(text):
    global stemmer
    wt = word_tokenize(text)
    stems = []
    for word in wt:
        temp = stemmer.stem(word)
        # if not temp == word:
        #     temp = correctWords(temp)
        stems.append(temp)
    return ' '.join(stems)
    # return text

##### lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from nltk import pos_tag, defaultdict

lemmatizer = WordNetLemmatizer()

tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV




def lemmatizeWords(text):
    # wt = word_tokenize(text)
    # lemmas = [lemmatizer.lemmatize(word, pos='a') for word in wt]
    # return ' '.join(lemmas)
    # return text

    tokens = word_tokenize(text)
    lmtzr = WordNetLemmatizer()
    lemmas = [lmtzr.lemmatize(token, tag_map[tag[0]]) for token, tag in pos_tag(tokens) ]
    return ' '.join(lemmas)
# lemmatizeWords('hard')

##### correcting

In [None]:
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams
from nltk.corpus import words
correct_words = words.words()
incorrectWords = '''preliminari'''.split()
result = []
def correctWords(text):
    for word in text:
        try:
            temp = [(jaccard_distance(set(ngrams(word, 2)),
                                      set(ngrams(w, 2))),w)
                                      for w in correct_words if w[0] == word[0]]
            result.append(sorted(temp, key = lambda val:val[0])[0][1])
        except:
            pass
    return ' '.join(result)
# correctWords(incorrectWords)

#### cisi process

In [None]:
import pandas as pd
import re

def TitlePreProcesse(t):
    tempText = toLower(t)
    tempText = removePunctuation(tempText)
    tempText = converteNumbers(tempText)
    tempText = removeWhiteSpace(tempText)
    tempText = removeStopWords(tempText)
    tempText = stemWords(tempText)
    tempText = lemmatizeWords(tempText)

    return tempText

def abstractPreProcesse(a):
    tempText = toLower(a)
    tempText = removePunctuation(tempText)
    tempText = converteNumbers(tempText)
    tempText = removeWhiteSpace(tempText)
    tempText = removeStopWords(tempText)
    tempText = stemWords(tempText)
    tempText = lemmatizeWords(tempText)

    return tempText

# i didn't do it yet on cisi (converte date to timestamp)
def publicationPreProcesse(p): 
    # tempText = p.replace('cisi ','')
    # return pd.to_datetime(tempText)
    return p
    
def authorPreProcesse(a):
    tempText = toLower(a)
    lis = tempText.split(' ')
    names = ' '
    l = []
    for word in lis:
      if ',' in word:
          l.append(removePunctuation(word))
    names = ' '.join(l)
    return names


In [None]:
import pandas as pd
def preprocessedData(dataFrame:pd.DataFrame):
    pdataFrame = pd.DataFrame()
    seriesDict:dict = {} 
    for i in dataFrame.index:
        try:
            tempT = tempA = tempW = ''
            tempB = None
            if not dataFrame.loc[i, '.T'] == '':
                tempT = TitlePreProcesse(dataFrame.loc[i, '.T'])
            if not dataFrame.loc[i, '.A'] == '':
                tempA = authorPreProcesse(dataFrame.loc[i, '.A'])
            if not dataFrame.loc[i, '.B'] == '':
                tempB = publicationPreProcesse(dataFrame.loc[i, '.B'])
            if not dataFrame.loc[i, '.W'] == '':
                tempW = abstractPreProcesse(dataFrame.loc[i, '.W'])

            seriesDict['.I'] = i+1
            seriesDict['data'] = calcFreq(' '.join([tempT, tempA, tempW]))
            seriesDict['.B'] = tempB
            
            pdataFrame = pdataFrame.append(seriesDict, ignore_index=True)
            seriesDict = {}
        except:
            print(i)
            raise 
    pdataFrame.fillna('', inplace=True)
    return pdataFrame


In [None]:
import pandas as pd
data = pd.read_csv('../../cisiData/cisiDataCleaned.csv', index_col=[0])
data.fillna('', inplace=True)
data.head()

In [None]:
processedDAta = preprocessedData(data)
processedDAta.head()

In [None]:
processedDAta.to_csv('../../cisiData/cisiDataPreprocessedV2.csv')

#### query process

In [None]:
import pandas as pd
import re


def qTitlePreProcesse(t):
    tempText = toLower(t)
    tempText = removePunctuation(tempText)
    tempText = converteNumbers(tempText)
    tempText = removeWhiteSpace(tempText)
    tempText = removeStopWords(tempText)
    tempText = stemWords(tempText)
    tempText = lemmatizeWords(tempText)

    return tempText

def qAbstractPreProcesse(a):
    tempText = toLower(a)
    tempText = removePunctuation(tempText)
    tempText = converteNumbers(tempText)
    tempText = removeWhiteSpace(tempText)
    tempText = removeStopWords(tempText)
    tempText = stemWords(tempText)
    tempText = lemmatizeWords(tempText)
    return tempText

def qAuthorPreProcesse(a):
    tempText = toLower(a)
    lis = tempText.split(' ')
    names = ' '
    l = []
    for word in lis:
      if ',' in word:
          l.append(removePunctuation(word))
    names = ' '.join(l)
    return names


# i didn't do it yet on cisi (converte date to timestamp)
def qPublicationPreProcesse(p):
    # tempText = p.replace('cisi ','')
    # return pd.to_datetime(tempText)
    return p


In [None]:
import pandas as pd
def preprocesseQuery(dataFrame:pd.DataFrame):
    qdataFrame = pd.DataFrame()
    seriesDict:dict = {} 
    for i in dataFrame.index:
        try:
            tempT = tempA = tempW = ''
            tempB = None
            if not dataFrame.loc[i, '.T'] == '':
                tempT = TitlePreProcesse(dataFrame.loc[i, '.T'])
            if not dataFrame.loc[i, '.A'] == '':
                tempA = authorPreProcesse(dataFrame.loc[i, '.A'])
            if not dataFrame.loc[i, '.B'] == '':
                tempB = publicationPreProcesse(dataFrame.loc[i, '.B'])
            if not dataFrame.loc[i, '.W'] == '':
                tempW = abstractPreProcesse(dataFrame.loc[i, '.W'])

            seriesDict['.I'] = i+1
            seriesDict['data'] = calcFreq(' '.join([tempT, tempA, tempW]))
            seriesDict['.B'] = tempB
            
            qdataFrame = qdataFrame.append(seriesDict, ignore_index=True)
            seriesDict = {}
        except:
            print(i)
            raise 
    qdataFrame.fillna('', inplace=True)
    return qdataFrame


In [None]:
import pandas as pd
querydf = pd.read_csv('../../cisiData/cisiQueryCsv.csv', index_col=[0])
querydf.fillna('', inplace=True)
querydf.head()

In [None]:
preprocessedQuery = preprocesseQuery(querydf)
preprocessedQuery.head()

In [None]:
preprocessedQuery.to_csv('../../cisiData/cisiQueryPreprocessedV2.csv')

## indexing model

In [2]:
import pandas as pd
data = pd.read_csv('../../cisiData/cisiDataPreprocessedV2.csv', index_col=[0])
data.fillna('', inplace=True)
data.head()

Unnamed: 0,.I,data,.B
0,1.0,eighteen comaromi present studi decim classif ...,
1,2.0,make slater report analysi six thousand three ...,
2,3.0,two kind essay wilson relationship organ organ...,
3,4.0,system analysi final project buckland new one ...,
4,5.0,librari report research project brophy althoug...,


In [3]:
#
# this is old cell for build index model with merge coulmns into one coulmn (not work now)
#

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import  cosine_similarity, linear_kernel


# tfidf = TfidfVectorizer()
# tfidfTable = tfidf.fit_transform(data['.W'])


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline



# transformer = FeatureUnion([
#                 # ('title_tfidf', 
#                 #   Pipeline([
#                 #     ('extract_field',
#                 #               FunctionTransformer(lambda x: x['.T'], 
#                 #                                   validate=False)),
#                 #             ('tfidf', 
#                 #               TfidfVectorizer(norm='l1'))])),
#                 # ('author_tfidf', 
#                 #   Pipeline([('extract_field', 
#                 #               FunctionTransformer(lambda x: x['.A'], 
#                 #                                   validate=False)),
#                 #             ('tfidf', 
#                 #               TfidfVectorizer(norm='l1'))])),
#                 ('abstract_tfidf',
#                  Pipeline([('extract_field',
#                             FunctionTransformer(lambda x: x['.W'],
#                                                   validate=False)),
#                             ('tfidf',
#                               TfidfVectorizer(norm='l2', ngram_range=(1,4)))]))])
# tfidfTable = transformer.fit_transform(data)
# tfidfTable


vectorizer = TfidfVectorizer(norm='l1', ngram_range=(1,2))
tfidfTable = vectorizer.fit_transform(data.data)
tfidfTable

<1460x76146 sparse matrix of type '<class 'numpy.float64'>'
	with 165781 stored elements in Compressed Sparse Row format>

In [5]:
tfidfTable.toarray()[1000:20000]

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.02350596, 0.01374093, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.0141537 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [6]:
import pandas as pd
preprocessedQuery = pd.read_csv('../../cisiData/cisiQueryPreprocessedV2.csv', index_col=[0])
preprocessedQuery.fillna('', inplace=True)
preprocessedQuery.head()

Unnamed: 0,.I,data,.B
0,1.0,problem concern make descript difficulti invol...,
1,2.0,actual pertin data oppos refer entir articl re...,
2,3.0,inform scienc give definit possibl,
3,4.0,imag recognit method automat transform print t...,
4,5.0,special train ordinari research businessmen ne...,


In [7]:
import numpy as np
def search(query,n:int):
    querytfidf = vectorizer.transform(query)

    cos = cosine_similarity(querytfidf,tfidfTable).flatten()
    # mostCovered = checkCoverage(query['.W'].tolist()[0].split()) 
    # # print(mostCovered[)
    # tempResult = []
    # for i in range(0,10):
    #     cos[mostCovered[i]] = cos[mostCovered[i]] + 1 
    # # print(tempResult)
    # tempResNp = np.array(tempResult)
    resultList = cos.argsort(axis=0)[-n:][::-1]
    ids = []
    for i in resultList:
        ids.append(data.loc[i,'.I'])
        
    return ids
    return query['.W'].tolist()[0]

# ex query num 1 (too bad result)
search(preprocessedQuery.loc[preprocessedQuery.index == 0,'data'],10)

[882.0, 489.0, 785.0, 531.0, 666.0, 621.0, 175.0, 429.0, 323.0, 483.0]

In [8]:
def queryingData(qDataFrame:pd.DataFrame, n):

    result = pd.DataFrame()

    for i in qDataFrame.index:
        try:
            resultDict:dict = {}
            tempList:list = search(qDataFrame.loc[qDataFrame.index == i,'data'], n)
            for id in range(1,n+1):
                resultDict[str(id)] = tempList[id - 1]
            result = result.append(resultDict, ignore_index=True)
        except:
            print(i)
            raise
    return result


In [9]:
queriesPath = '../../cisiData/cisiQueryPreprocessedV2.csv'
queriesData = pd.read_csv(queriesPath, index_col=[0])
queriesData.fillna('', inplace=True)
queriesData.head()

Unnamed: 0,.I,data,.B
0,1.0,problem concern make descript difficulti invol...,
1,2.0,actual pertin data oppos refer entir articl re...,
2,3.0,inform scienc give definit possibl,
3,4.0,imag recognit method automat transform print t...,
4,5.0,special train ordinari research businessmen ne...,


In [10]:
queriesResult = queryingData(queriesData, 10)
queriesResult.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,882.0,489.0,785.0,531.0,666.0,621.0,175.0,429.0,323.0,483.0
1,1138.0,1327.0,483.0,451.0,175.0,1071.0,1236.0,565.0,797.0,1096.0
2,469.0,1118.0,60.0,803.0,652.0,553.0,1133.0,1179.0,1161.0,1181.0
3,565.0,175.0,890.0,1396.0,663.0,601.0,483.0,480.0,79.0,77.0
4,483.0,1282.0,1081.0,501.0,1166.0,1307.0,482.0,388.0,779.0,1092.0


## Evaluation

### precision

In [11]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score

def reSizeLists(l1:list, l2:list):
    '''resize lists to have the same len'''
    if len(l1) < len(l2):
        l2 = l2[0:len(l1)]
    while len(l1) > len(l2):
        l1 = l1[0:len(l2)]

    return l1, l2


def precWithoutOrder(l1:list,l2:list):
    ''' calculate precision witout orering'''
    try:
        return len(set(l1).intersection(set(l2))) / len(l2)
    except:
        return 0

def calcMAPrecisionAtK(resData:pd.DataFrame, qrelsData: pd.DataFrame):
    '''calcualte MAP'''
    precisionsAtK:list = []
    precisionAtK:float

    for i in resData.index:
        resArray = resData.loc[i].to_numpy()
        qresArray = qrelsData.loc[qrelsData['.I'] == i+1, 'data'].to_numpy()
        
        if len(qresArray) == 0: 
            continue

        resArray, qresArray = reSizeLists(resArray, qresArray)



        prec = precision_score(qresArray, resArray, average='micro')
        # prec = precWithoutOrder(qresArray, resArray)

        precisionsAtK.append(prec)

    precisionAtK = sum(precisionsAtK) / len(precisionsAtK)
    return precisionAtK

def calcAPrecisionAtK(resData:pd.DataFrame, qrelsData: pd.DataFrame):
    '''calcualte Average Precision'''
    precisionsAtK:list = []

    for i in resData.index:
        precisionOnQuery = []

        resArray = resData.loc[i].to_numpy()
        qresArray = qrelsData.loc[qrelsData['.I'] == i+1, 'data'].to_numpy()
        
        if len(qresArray) == 0: 
            continue
        resArray, qresArray = reSizeLists(resArray, qresArray)

        for lenI in range(0,len(qresArray)):

            tempRes:list = resArray[0:lenI+1].tolist()
            tempQRes:list = qresArray[0:lenI+1].tolist()
            precisionOnQuery.append(precision_score(tempQRes, tempRes, average='micro'))

        try:
            precisionsAtK.append(sum(precisionOnQuery) / len(precisionOnQuery))
        except ZeroDivisionError: 
            precisionsAtK.append(0)
    return precisionsAtK


In [12]:
import pandas as pd
qrelsFrame = pd.read_csv('../../cisiData/cisiQRels.csv', index_col=[0])
qrelsFrame.head()

Unnamed: 0,.I,data
0,1,28
1,1,35
2,1,38
3,1,42
4,1,43


In [13]:
# K = 10
calcMAPrecisionAtK(queriesResult, qrelsFrame)

0.002631578947368421

0.012719298245614033 l1 - qFreq - DFreq
0.014181286549707602 l1 - qFreq - DFreq - just .W
0.023830409356725146 l1 - qFreq - DFreq - just .W ngram = (1,2)

In [None]:
# K = 10
averagePrecision = calcAPrecisionAtK(queriesResult, qrelsFrame)
for i in range(0,len(averagePrecision), 2):
    print(i+1, end=' - '); print(averagePrecision[i])

### i stoped here