# CISI Word Embedding

## distribute data

In [None]:
# Location of the raw CISI document collection.
CISIDATA = '../../../../CISI/CISI.ALL'
import re
# Raw strings keep the backslash for the regex engine; a plain '\.' is an
# invalid string escape (SyntaxWarning on Python 3.12+).
# Matches '.I' followed by any single character.
IDMarker = re.compile(r'(\.I.)')
# Matches a field marker ('.I', '.T', '.A', '.B', '.W' or '.X') followed by a
# space. The capturing group makes re.split keep the marker in its output.
allMarkers = re.compile(r'(\.[ITABWX] )')

In [None]:
# Locations of the raw CISI queries and of the relevance judgments.
CISIQUERY = '../../../../CISI/CISI.QRY'
CISIQRELS = '../../../../CISI/CISI.REL'
import re
# Raw string: a plain '\.' is an invalid string escape (SyntaxWarning on
# Python 3.12+). Matches a query field marker followed by a space; the
# capturing group makes re.split keep the marker in its output.
queryMarkers = re.compile(r'(\.[ITAWB] )')

In [None]:
def getData(PATH, marker):
    """Read the file at ``PATH`` and split its text on ``marker``.

    Newlines are replaced by single spaces before splitting. Whatever
    precedes the first match of ``marker`` is dropped. When ``marker``
    contains a capturing group, the matched markers themselves are part of
    the returned list (standard ``re.split`` behaviour).
    """
    with open(PATH, 'r') as handle:
        flattened = handle.read().replace('\n', ' ')
    # re.split always yields at least one element; discard the leading one.
    _, *pieces = re.split(marker, flattened)
    return pieces

### cisi data

In [None]:
# Tokenize CISI.ALL into an alternating list: field marker, field text, ...
cisiData = getData(CISIDATA, allMarkers)

In [None]:
import pandas as pd

# Template row: one entry per CISI.ALL field marker. A field that a document
# does not contain keeps the value None.
seriesDict: dict = {
    '.I': None,
    '.T': None,
    '.A': None,
    '.B': None,
    '.W': None,
    '.X': None,
}

# cisiData alternates marker, text, marker, text, ..., so walk it in pairs.
# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
records = []
seriesData = seriesDict.copy()
notTheFirst = False
for marker, text in zip(cisiData[0::2], cisiData[1::2]):
    field = marker.strip()
    # Every '.I' after the very first token starts a new document.
    if notTheFirst and field == '.I':
        records.append(seriesData)
        seriesData = seriesDict.copy()

    # NOTE: a field repeated inside one document overwrites the earlier value.
    seriesData[field] = text.strip()
    notTheFirst = True
records.append(seriesData)  # flush the last document

dataFrame = pd.DataFrame(records)
dataFrame.head()

In [None]:
# Persist the parsed documents; the preprocessing section reloads this file.
dataFrame.to_csv('../../cisiData/cisiCsvWE.csv')

### query

In [None]:
# Tokenize CISI.QRY into an alternating list: field marker, field text, ...
cisiQuery = getData(CISIQUERY, queryMarkers)

In [None]:
import pandas as pd

# Template row: one entry per CISI.QRY field marker. A field that a query
# does not contain keeps the value None.
seriesDict: dict = {
    '.I': None,
    '.T': None,
    '.A': None,
    '.W': None,
    '.B': None,
}

# cisiQuery alternates marker, text, marker, text, ..., so walk it in pairs.
# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
queryRecords = []
seriesData = seriesDict.copy()
notTheFirst = False
for marker, text in zip(cisiQuery[0::2], cisiQuery[1::2]):
    field = marker.strip()
    # Every '.I' after the very first token starts a new query.
    if notTheFirst and field == '.I':
        queryRecords.append(seriesData)
        seriesData = seriesDict.copy()

    # NOTE: a field repeated inside one query overwrites the earlier value.
    seriesData[field] = text.strip()
    notTheFirst = True
queryRecords.append(seriesData)  # flush the last query

qDataFrame = pd.DataFrame(queryRecords)
qDataFrame.head()

In [None]:
# Persist the parsed queries; the query preprocessing section reloads this file.
qDataFrame.to_csv('../../cisiData/cisiQueryCsvWE.csv')

### query rel

In [None]:
import pandas as pd

def getRles(path):
    """Return the lines of the file at ``path`` (split on ``'\\n'``).

    Side effect: the same list is also stored in the module-level name
    ``qrlesList``. A trailing newline in the file yields a final empty string.
    """
    global qrlesList
    with open(path, 'r') as handle:
        content = handle.read()
    qrlesList = content.split('\n')
    return qrlesList



# Each usable line of CISI.REL starts with two integers: the query id and the
# id of a document judged relevant for it. Any further columns are ignored.
qrelsData = getRles(CISIQRELS)

qrelsRows = []
for line in qrelsData:
    element = line.split()
    if len(element) < 2:
        # Blank or truncated line (e.g. the empty string after the last newline).
        continue
    try:
        qrelsRows.append({'.I': int(element[0]), 'data': int(element[1])})
    except ValueError:
        # Non-numeric line: skip it. Catching only ValueError keeps real
        # errors visible; the former bare `except: pass` also hid them.
        continue

# Build the frame once; DataFrame.append was removed in pandas 2.0.
qrelsFrame = pd.DataFrame(qrelsRows, columns=['.I', 'data'])
qrelsFrame.head()

In [None]:
# Persist the (query id, relevant document id) pairs for the evaluation section.
qrelsFrame.to_csv('../../cisiData/cisiQRelsWE.csv')

## data preprocessing

### cleaning

In [None]:
import pandas as pd
# Reload the parsed documents; the first CSV column is used as the index.
df = pd.read_csv('../../cisiData/cisiCsvWE.csv', index_col=[0])
df.head()

In [None]:
# Summary statistics of the loaded columns.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Count missing (True) vs. present (False) values in the '.T', '.W' and '.A'
# columns; the first two reports are followed by a blank line.
for column in ('.T', '.W'):
    print(df.loc[:, column].isnull().value_counts(), end='\n\n')
print(df.loc[:, '.A'].isnull().value_counts())

In [None]:
# Replace missing values with empty strings, in place.
df.fillna('', inplace=True)

In [None]:
# Count fully duplicated rows (True) vs. unique rows (False).
df.duplicated().value_counts()

In [None]:
# Persist the cleaned documents for the preprocessing step.
df.to_csv('../../cisiData/cisiDataCleanedWE.csv')

### methods

#### lower case

In [None]:
def toLower(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

#### remove punctuation

In [None]:
import string

# Translation table mapping every ASCII punctuation character to a space.
_blanks = ' ' * len(string.punctuation)
translator = str.maketrans(string.punctuation, _blanks)


def removePunctuation(text):
    """Return ``text`` with each ASCII punctuation character replaced by a space.

    The length of the string is unchanged; use removeWhiteSpace afterwards to
    collapse the resulting runs of spaces.
    """
    cleaned = text.translate(translator)
    return cleaned


#### remove whitespaces

In [None]:
def removeWhiteSpace(text):
    """Collapse every run of whitespace in ``text`` to one space and trim both ends."""
    words = text.split()
    return " ".join(words)

### CISI.ALL process

In [None]:
import pandas as pd
import re

def TitlePreProcesse(t):
    """Normalize a title string.

    Lower-cases ``t``, replaces punctuation with spaces and collapses
    whitespace, in that order.
    """
    return removeWhiteSpace(removePunctuation(toLower(t)))

def abstractPreProcesse(a):
    """Normalize an abstract string.

    Applies, in order: lower-casing, punctuation-to-space replacement and
    whitespace collapsing.
    """
    normalized = a
    for step in (toLower, removePunctuation, removeWhiteSpace):
        normalized = step(normalized)
    return normalized

def publicationPreProcesse(p):
    """Parse the publication field ``p`` into a pandas timestamp.

    Returns whatever ``pd.to_datetime(p)`` produces, or ``None`` when the
    value cannot be interpreted as a date.
    """
    try:
        return pd.to_datetime(p)
    except (ValueError, TypeError, OverflowError):
        # Unparseable / out-of-range / wrong-typed value. Only parsing
        # failures are caught, so KeyboardInterrupt and similar still
        # propagate (the former bare `except:` trapped them too).
        return None
    
def authorPreProcesse(a):
    """Extract the comma-marked words from an author string.

    The text is lower-cased and split on single spaces. Only words that
    contain a comma are kept (presumably the ``Surname,`` part of
    ``Surname, Initials`` -- TODO confirm against the data). Their commas are
    removed, other punctuation is replaced by spaces, and the results are
    joined with single spaces.
    """
    words = toLower(a).split(' ')
    kept = [
        removePunctuation(word.replace(',', ''))
        for word in words
        if ',' in word
    ]
    return ' '.join(kept)


In [None]:
import pandas as pd
def preprocessedData(dataFrame:pd.DataFrame):
    """Build the preprocessed document table used for indexing.

    For every row of ``dataFrame`` (expects columns '.T', '.A', '.B', '.W'
    with '' marking a missing value) one output row is produced with:

    * '.I'   -- the row's index label plus one (the index is assumed to be
                the 0-based position of the document -- TODO confirm),
    * 'data' -- normalized title, author and abstract joined by single spaces,
    * '.B'   -- the parsed publication value, or '' when absent/unparseable.

    Rows are accumulated in a list and the frame is built once, because
    ``DataFrame.append`` was removed in pandas 2.0 and re-copied the frame on
    every call.

    Raises:
        Any exception from the per-field helpers; the offending index label
        is printed before the exception is re-raised.
    """
    records = []
    for i in dataFrame.index:
        try:
            title = dataFrame.loc[i, '.T']
            author = dataFrame.loc[i, '.A']
            publication = dataFrame.loc[i, '.B']
            abstract = dataFrame.loc[i, '.W']

            # '' marks a missing field; only non-empty fields are processed.
            tempT = TitlePreProcesse(title) if title != '' else ''
            tempA = authorPreProcesse(author) if author != '' else ''
            tempB = publicationPreProcesse(publication) if publication != '' else None
            tempW = abstractPreProcesse(abstract) if abstract != '' else ''

            records.append({
                '.I': i + 1,
                'data': ' '.join([tempT, tempA, tempW]),
                '.B': tempB,
            })
        except Exception:
            # Show which row failed, then let the error propagate.
            print(i)
            raise

    pdataFrame = pd.DataFrame(records, columns=['.I', 'data', '.B'])
    pdataFrame.fillna('', inplace=True)
    return pdataFrame


In [None]:
import pandas as pd
# Reload the cleaned documents (first CSV column is the index) and make sure
# missing values are empty strings, which preprocessedData tests for.
data = pd.read_csv('../../cisiData/cisiDataCleanedWE.csv', index_col=[0])
data.fillna('', inplace=True)
data.head()

In [None]:
# Normalize every document into the ('.I', 'data', '.B') table.
processedDAta = preprocessedData(data)
processedDAta.head()

In [None]:
# Persist the preprocessed documents; the indexing section reloads this file.
processedDAta.to_csv('../../cisiData/cisiDataPreprocessedWE.csv')

### CISI.QRY process

In [None]:
import pandas as pd
import re


def qTitlePreProcesse(t):
    """Normalize a query title: lower-case, punctuation to spaces, collapse whitespace.

    NOTE(review): preprocesseQuery in this notebook calls TitlePreProcesse,
    not this function -- confirm whether this variant is still needed.
    """
    return removeWhiteSpace(removePunctuation(toLower(t)))

def qAbstractPreProcesse(a):
    """Normalize a query body: lower-case, punctuation to spaces, collapse whitespace.

    NOTE(review): preprocesseQuery in this notebook calls abstractPreProcesse,
    not this function -- confirm whether this variant is still needed.
    """
    normalized = a
    for step in (toLower, removePunctuation, removeWhiteSpace):
        normalized = step(normalized)
    return normalized

def qAuthorPreProcesse(a):
    """Extract the comma-marked words from a query's author string.

    The text is lower-cased and split on single spaces. Only words containing
    a comma are kept; their commas are removed, remaining punctuation is
    replaced by spaces, and the results are joined with single spaces.
    """
    words = toLower(a).split(' ')
    kept = [
        removePunctuation(word.replace(',', ''))
        for word in words
        if ',' in word
    ]
    return ' '.join(kept)


# Four consecutive digits, used as a fallback to find a year.
regexPub = r'[0-9]{4}'

def qPublicationPreProcesse(p):
    """Parse a date out of the fourth comma-separated part of ``p``.

    The part is first handed to ``pd.to_datetime`` as-is. If that fails, the
    first run of four digits in it (``regexPub``) is parsed instead, which
    gives 1 January of that year.

    Returns:
        The parsed pandas timestamp, or ``None`` when ``p`` has fewer than
        four comma-separated parts or no date can be extracted.
    """
    parts = p.split(',')
    if len(parts) < 4:
        # Previously an uncaught IndexError, raised outside the try block.
        return None
    tempText = parts[3]

    try:
        return pd.to_datetime(tempText)
    except (ValueError, TypeError, OverflowError):
        # Unparseable strings raise ValueError, so the fallback must not be
        # tied to TypeError alone.
        pass

    yearMatch = re.search(regexPub, tempText)
    if yearMatch is None:
        return None
    try:
        # Use the matched digits; str() of the Match object is not a date.
        return pd.to_datetime(yearMatch.group())
    except (ValueError, TypeError, OverflowError):
        return None


In [None]:
import pandas as pd
def preprocesseQuery(dataFrame:pd.DataFrame):
    """Build the preprocessed query table.

    For every row of ``dataFrame`` (expects columns '.T', '.A', '.B', '.W'
    with '' marking a missing value) one output row is produced with:

    * '.I'   -- the row's index label plus one (the index is assumed to be
                the 0-based position of the query -- TODO confirm),
    * 'data' -- normalized title, author and body joined by single spaces,
    * '.B'   -- the parsed publication value, or '' when absent/unparseable.

    The same field helpers as for documents are used (TitlePreProcesse,
    authorPreProcesse, publicationPreProcesse, abstractPreProcesse).

    Rows are accumulated in a list and the frame is built once, because
    ``DataFrame.append`` was removed in pandas 2.0 and re-copied the frame on
    every call.

    Raises:
        Any exception from the per-field helpers; the offending index label
        is printed before the exception is re-raised.
    """
    records = []
    for i in dataFrame.index:
        try:
            title = dataFrame.loc[i, '.T']
            author = dataFrame.loc[i, '.A']
            publication = dataFrame.loc[i, '.B']
            body = dataFrame.loc[i, '.W']

            # '' marks a missing field; only non-empty fields are processed.
            tempT = TitlePreProcesse(title) if title != '' else ''
            tempA = authorPreProcesse(author) if author != '' else ''
            tempB = publicationPreProcesse(publication) if publication != '' else None
            tempW = abstractPreProcesse(body) if body != '' else ''

            records.append({
                '.I': i + 1,
                'data': ' '.join([tempT, tempA, tempW]),
                '.B': tempB,
            })
        except Exception:
            # Show which row failed, then let the error propagate.
            print(i)
            raise

    qdataFrame = pd.DataFrame(records, columns=['.I', 'data', '.B'])
    qdataFrame.fillna('', inplace=True)
    return qdataFrame


In [None]:
import pandas as pd
# Reload the parsed queries (first CSV column is the index) and make sure
# missing values are empty strings, which preprocesseQuery tests for.
querydf = pd.read_csv('../../cisiData/cisiQueryCsvWE.csv', index_col=[0])
querydf.fillna('', inplace=True)
querydf.head()

In [None]:
# Normalize every query into the ('.I', 'data', '.B') table.
preprocessedQuery = preprocesseQuery(querydf)
preprocessedQuery.head()

In [None]:
# Persist the preprocessed queries; the evaluation cells reload this file.
preprocessedQuery.to_csv('../../cisiData/cisiQueryPreprocessedWE.csv')

## indexing

In [None]:
import pandas as pd
# Load the preprocessed documents; `data` is the table that search() reads
# document ids ('.I') from.
data = pd.read_csv('../../cisiData/cisiDataPreprocessedWE.csv', index_col=[0])
data.fillna('', inplace=True)
data.head()

In [None]:
import spacy
# Load the spaCy pipeline named "en_core_web_md"; it must be installed.
nlp = spacy.load("en_core_web_md")

In [None]:
# One spaCy Doc per document, in the same order as the rows of `data`.
# nlp.pipe streams the texts through the pipeline in batches, which avoids
# the per-call overhead of invoking nlp(...) once per document.
vectors = list(nlp.pipe(data['data']))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


def search(query,n:int):
    """Return the ids of the ``n`` documents most similar to ``query``.

    The query is run through the module-level ``nlp`` pipeline and compared
    with every entry of ``vectors`` via ``Doc.similarity``. Ids are read from
    the '.I' column of ``data`` and returned best match first.

    Args:
        query: query text.
        n: number of ids to return; values <= 0 give an empty list and
           values larger than the collection give every document.

    Returns:
        list of document ids, most similar first.
    """
    if n <= 0:
        # Without this guard `[-0:]` would select the whole array.
        return []

    qvector = nlp(query)
    similarities = np.array([qvector.similarity(doc) for doc in vectors])

    # argsort is ascending: take the last n positions and reverse them.
    nearest = similarities.argsort()[-n:][::-1]

    # `nearest` holds positions within `vectors` (built in row order of
    # `data`), so look the ids up by position rather than by index label.
    return data['.I'].iloc[nearest].tolist()
# Example: top 10 ids for the first preprocessed query (row label 0).
# The original author noted that the results for this query are poor.
search(preprocessedQuery.loc[0,'data'],10)

In [None]:
def queryingData(qDataFrame:pd.DataFrame, n):
    """Run ``search`` for every query and tabulate the ranked ids.

    Args:
        qDataFrame: table with a 'data' column holding the query text.
        n: number of results requested per query.

    Returns:
        DataFrame with one row per query (in ``qDataFrame`` order, index
        0..N-1) and columns '1'..'n'; column 'k' holds the id ranked k-th.
        If a search returns fewer than ``n`` ids the remaining cells are NaN.

    Raises:
        Any exception raised by ``search``; the index label of the failing
        query is printed before the exception is re-raised.
    """
    rows = []
    for i in qDataFrame.index:
        try:
            tempList = search(qDataFrame.loc[i, 'data'], n)
        except Exception:
            # Show which query failed, then let the error propagate.
            print(i)
            raise
        rows.append({
            str(rank): docId
            for rank, docId in enumerate(tempList[:n], start=1)
        })

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows, columns=[str(rank) for rank in range(1, n + 1)])


In [None]:
# Preprocessed queries written by the query preprocessing section.
queriesPath = '../../cisiData/cisiQueryPreprocessedWE.csv'

# The first CSV column is the index; missing values become empty strings.
queriesData = pd.read_csv(queriesPath, index_col=[0])
queriesData.fillna('', inplace=True)
queriesData.head()

In [None]:
# Retrieve the 20 best-ranked document ids for every query.
queriesResult = queryingData(queriesData, 20)
queriesResult.head()

In [None]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score

def reSizeLists(l1:list, l2:list):
    '''Truncate the longer of the two sequences to the length of the shorter.

    Returns the pair ``(l1, l2)``; a sequence that needs no truncation is
    returned unchanged.
    '''
    size1, size2 = len(l1), len(l2)
    if size1 < size2:
        l2 = l2[:size1]
    elif size1 > size2:
        l1 = l1[:size2]
    return l1, l2


def precWithoutOrder(l1:list,l2:list):
    '''Return the order-independent overlap of ``l1`` and ``l2``.

    The result is the number of distinct values present in both sequences
    divided by ``len(l2)``. It is 0 when ``l2`` is empty or when either
    argument cannot be turned into a set (e.g. ``None`` or unhashable items).
    '''
    try:
        return len(set(l1) & set(l2)) / len(l2)
    except (ZeroDivisionError, TypeError):
        # Only the expected failure modes are treated as "no precision";
        # the former bare `except:` also trapped KeyboardInterrupt etc.
        return 0

def calcMAPrecisionAtK(resData:pd.DataFrame, qrelsData: pd.DataFrame):
    '''Return the mean precision@K over all queries with relevance judgments.

    Args:
        resData: one row per query; the row with index label ``i`` belongs to
            query id ``i + 1`` and holds the retrieved document ids (K is the
            number of non-missing ids in the row).
        qrelsData: relevance judgments with columns '.I' (query id) and
            'data' (id of a relevant document).

    Returns:
        The average over queries of (retrieved ids that are relevant) / K.
        Queries without judgments are skipped; 0.0 is returned when no query
        has judgments.

    The previous implementation passed the two id arrays to sklearn's
    ``precision_score(..., average='micro')``, which scores position-wise
    equality of the arrays, so a relevant document only counted when it sat
    at the same position in both lists. Precision@K is a set-membership
    measure, computed directly here.
    '''
    precisionsAtK:list = []

    for i in resData.index:
        relevant = set(qrelsData.loc[qrelsData['.I'] == i + 1, 'data'].tolist())
        if not relevant:
            # No judgments for this query: it cannot be evaluated.
            continue

        retrieved = resData.loc[i].dropna().tolist()
        if not retrieved:
            precisionsAtK.append(0.0)
            continue

        hits = sum(1 for docId in retrieved if docId in relevant)
        precisionsAtK.append(hits / len(retrieved))

    if not precisionsAtK:
        return 0.0
    return sum(precisionsAtK) / len(precisionsAtK)

def calcAPrecisionAtK(resData:pd.DataFrame, qrelsData: pd.DataFrame):
    '''Return the average precision (AP@K) of every query with judgments.

    Args:
        resData: one row per query; the row with index label ``i`` belongs to
            query id ``i + 1`` and holds the retrieved document ids in rank
            order (K is the number of non-missing ids in the row).
        qrelsData: relevance judgments with columns '.I' (query id) and
            'data' (id of a relevant document).

    Returns:
        list with one AP@K value per evaluated query, in ``resData`` order.
        Queries without judgments are skipped. For a query,
        AP@K = sum of precision@k over the ranks k that hold a relevant
        document, divided by min(K, number of relevant documents).
        NOTE(review): other normalizations exist (e.g. dividing by the total
        number of relevant documents) -- confirm this is the intended one.

    The previous implementation fed prefixes of the two id arrays to
    sklearn's ``precision_score(..., average='micro')``, which compares the
    arrays position by position instead of testing whether each retrieved id
    is relevant.
    '''
    precisionsAtK:list = []

    for i in resData.index:
        relevant = set(qrelsData.loc[qrelsData['.I'] == i + 1, 'data'].tolist())
        if not relevant:
            # No judgments for this query: it cannot be evaluated.
            continue

        retrieved = resData.loc[i].dropna().tolist()

        hits = 0
        precisionSum = 0.0
        for rank, docId in enumerate(retrieved, start=1):
            if docId in relevant:
                hits += 1
                precisionSum += hits / rank  # precision@rank at a relevant hit

        denominator = min(len(retrieved), len(relevant))
        precisionsAtK.append(precisionSum / denominator if denominator else 0)
    return precisionsAtK


In [None]:
import pandas as pd
# Reload the relevance judgments (columns '.I' = query id, 'data' = relevant
# document id); the first CSV column is the index.
qrelsFrame = pd.read_csv('../../cisiData/cisiQRelsWE.csv', index_col=[0])
qrelsFrame.head()

In [None]:
# Evaluate the retrieved results against the relevance judgments.
# NOTE(review): the original comment here said "K = 10", but queriesResult is
# built with n=20 in the cell that calls queryingData -- confirm the intended K.
calcMAPrecisionAtK(queriesResult, qrelsFrame)