# 정보검색 retrieval
### http://www.cs.virginia.edu/~hw5x/Course/IR2015/_site/lectures/
# 사용자의 need에 맞게 정보를 제공하는 것!
### 주요이슈
### 1. 쿼리표현 : 문법적 차이, 의미적 차이
### 2. 문서표현 : 효율적인 접근을 위한 자료구조
### 3. 검색모델

1. Indexer<br>
    - Crawler (Focused) => repository(collection)<br>
    - Document Analyzer => Html, Tokenizing, Normalizing<br>
                            Stemming(BPE), Ngram, MA, POS, Stopwords, RE, Pharses
                            => preprocessing
    - Features => Lexicon<br>
    - Document(Query) Representation => Bag Of Words (BOW)
      Document-Term Mat, Term-Document Mat.(핵심)
      => Inverted Document Indexing(역문헌 구조)
      
2. Relevance(Ranking)
    - Weighting(TF-IDF), Similarity(Cosine:0-1, Euclidean)<br>
    - Sorting
3. Results<br>
    - 끝(Top k)

# Lexicon

In [38]:
%timeit getLexicon1()

149 ms ± 18.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [39]:
%timeit getLexicon2()

137 ms ± 9.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
lexicon = getLexicon2()

In [20]:
len(lexicon)

2638

### documentRepresentation 성능 향상 방법

In [21]:
#      1, 2, 3 .. 2638 => Bag Of Words
# doc1 0  0  0         => doc개수 * 2638
# doc2 1  1  1
def documentRepresentation1():
    documentList = list()
    for document in [kobill.open(idx).read() for idx in kobill.fileids()]:
        bow = list(0 for _ in range(len(lexicon)))
        for term in document.split():
            bow[lexicon.index(term)] = 1
        documentList.append(bow)
    return documentList

In [26]:
temp = documentRepresentation1()
len(temp[0])

2638

### 리스트로 표현했지만 파이썬은 dict로 sparse한 문제점을 해결할 수 있음!

In [53]:
def documentRepresentation2():
    documentList = list()
    for document in [kobill.open(idx).read() for idx in kobill.fileids()]:
        bow = dict()
        for term in document.split():
            bow[lexicon.index(term)] = 1
        documentList.append(bow)
    return documentList

# 있는지 없는지만 검사하니까 index를 굳이 뒤질 필요가 없음
def documentRepresentation3():
    documentList = list()
    for document in [kobill.open(idx).read() for idx in kobill.fileids()]:
        bow = dict()
        for term in document.split():
            bow[term] = 1
        documentList.append(bow)
    return documentList

In [29]:
temp = documentRepresentation2()
len(temp[0].keys())

541

In [31]:
temp = documentRepresentation3()
len(temp[0].keys())

541

In [35]:
%timeit documentRepresentation1()

314 ms ± 48.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [36]:
%timeit documentRepresentation2()

324 ms ± 46.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [37]:
%timeit documentRepresentation3()

8.79 ms ± 774 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### 빈도수에 대한 가중치가 없다

In [54]:
from collections import defaultdict
def documentRepresentation4():
    documentList = list()
    for document in [kobill.open(idx).read() for idx in kobill.fileids()]:
        bow = defaultdict(int)
        for term in document.split():
            bow[term] += 1
        documentList.append(bow)
    return documentList

In [None]:
doc = documentRepresentation4()
doc[0]

In [57]:
%timeit documentRepresentation3()

8.43 ms ± 748 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [58]:
%timeit documentRepresentation4()

9.55 ms ± 502 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [64]:
def documentRepresentation5():
    documentList = defaultdict(lambda: defaultdict(int))
    for idx in kobill.fileids():
        for term in kobill.open(idx).read().split():
            documentList[idx][term] += 1
    return documentList

In [65]:
docList = documentRepresentation5()

In [None]:
docList.keys()
docList['1809891.txt']

## doc term M => term doc M

In [92]:
# Boolean 검색 => 집합론
# DTM
query = "국회 의원 국민"
result = list()

def booleanResult1():
    for term in query.split():
        searchResult = list()
        for idx, termList in docList.items():
            if term in termList.keys():
                searchResult.append(idx)
        result.append(searchResult)
    one = result.pop()
    while result:
        temp = result.pop()
        one = list(set(one).intersection(temp))
    return one

In [101]:
# TDM    doc1 doc2 doc3
# term1 :  3    1    9 
# term2 :  1    2    5 
# query문의 term을 기준으로 볼 수 있으니까
# key만 가져오면 lexicon
# 보통 lexicon을 hash로 두고 doc vector를 posting으로 둠 pointer를 스위치하는 게 보통 


TDM = defaultdict(lambda:defaultdict(int))
for idx, termList in docList.items():
    for term, freq in termList.items():
        TDM[term][idx] = freq
        
        
def booleanResult2():
    result = list()

    for term in query.split():
        result.append(list(TDM[term].keys()))
        
    one = result.pop()
    while result:
        temp = result.pop()
        one = list(set(one).intersection(temp))
    return one

In [102]:
%timeit booleanResult1()

11 µs ± 1.66 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [103]:
%timeit booleanResult2()

4.52 µs ± 416 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
