In [2]:
# -*- coding: utf-8 -*-
import csv
from ckiptagger import WS
from opencc import OpenCC
import re
import gensim

# Simplified -> Traditional (Taiwan standard) converter.
# NOTE(review): `cc` is not used by any of the cells visible here.
cc = OpenCC('s2tw')
# CKIP word segmenter; loads its model files from ./data.
ws = WS("./data")

# Chinese stop words, one per line. Blank lines are skipped so that no empty
# string ends up in the list, and the comprehension avoids leaking a loop
# variable into the notebook namespace.
with open('stopWords.txt', 'r', encoding='UTF-8') as file:
    chi_stopWords = [word for word in (line.strip() for line in file) if word]

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [3]:
# Patterns used by preprocess(); compiled once at definition time.
_CHI_CHAR_RE = re.compile(r'[\u4e00-\u9fa5]')
_ENG_WORD_RE = re.compile(r'[\u0061-\u007a]+')


def preprocess(text):
    """Turn mixed Chinese/English text into a flat list of tokens.

    Chinese part: every character in U+4E00..U+9FA5 is extracted and the
    characters are concatenated (anything between them is dropped, so
    characters that were separated by punctuation or Latin text become
    adjacent), segmented with the module-level `ws` segmenter, and filtered
    against `chi_stopWords`.

    English part: the text is lower-cased, runs of a-z are extracted and
    joined with spaces, gensim's stop words are removed and the remainder is
    tokenized with gensim.

    Args:
        text: the raw document or query string. ``None`` or an empty string
            (e.g. a missing CSV field) yields an empty token list instead of
            raising.

    Returns:
        list of str: Chinese tokens followed by English tokens.
    """
    if not text:
        # csv.DictReader fills missing fields with None; treat as "no text".
        return []

    chi_tokens = []
    chi_text = "".join(_CHI_CHAR_RE.findall(text))
    if chi_text:
        # Set membership instead of scanning the stop-word list per token.
        stop_words = set(chi_stopWords)
        chi_tokens = [token for token in ws([chi_text])[0]
                      if token not in stop_words]

    eng_tokens = []
    eng_text = " ".join(_ENG_WORD_RE.findall(text.lower()))
    if eng_text:
        eng_text = gensim.parsing.remove_stopwords(eng_text)
        eng_tokens = list(gensim.utils.tokenize(eng_text))

    return chi_tokens + eng_tokens

In [4]:
class BookmarkFullTextCorpus:
    """Streamed bag-of-words corpus over the ``plain_text`` column of a CSV file.

    The CSV at `archive_path` is re-read on every pass instead of being held
    in memory. Constructing the object makes one full pass to build
    `self.dictionary`; iterating the object yields one bag-of-words vector
    per CSV row, in file order.
    """

    def __init__(self, archive_path):
        """Remember the archive path and build the token dictionary from it."""
        self.path = archive_path
        self.dictionary = gensim.corpora.Dictionary(self.iter_texts())

    def iter_texts(self):
        """Yield the token list of each row's ``plain_text`` field."""
        # newline='' lets the csv module do its own newline handling, as the
        # csv documentation recommends for files passed to a reader.
        with open(self.path, encoding='utf-8', newline='') as f:
            for row in csv.DictReader(f):
                yield preprocess(row["plain_text"])

    def __iter__(self):
        """Yield each document as a bag-of-words vector from `self.dictionary`."""
        for document in self.iter_texts():
            yield self.dictionary.doc2bow(document)

    def __len__(self):
        """Return the number of documents the dictionary was built from."""
        return self.dictionary.num_docs

    def get_original(self, key):
        """Return the raw CSV row (a dict) at zero-based position `key`.

        The row is read from `self.path`, the same archive the corpus was
        built from, so positions line up with the order of iteration.
        Returns ``None`` when no row has that position.
        """
        with open(self.path, encoding='utf-8', newline='') as f:
            for i, row in enumerate(csv.DictReader(f)):
                if i == key:
                    return row
        return None

In [5]:
# Stream the CSV archive as a bag-of-words corpus; constructing it builds
# the token dictionary.
content = BookmarkFullTextCorpus('data_sample.csv')

# The first argument of Similarity is the prefix of the shard files the index
# writes to disk, not an input path. Give this index its own prefix so its
# shards are not named after the data file and cannot be overwritten by
# another index that uses the data file's name as its prefix.
index = gensim.similarities.Similarity('data_sample.bow_index',
                                       content,
                                       num_features=len(content.dictionary),
                                       num_best=15)

In [6]:
def search(query):
    """Print the bookmarks that best match `query` in the bag-of-words index.

    The query is tokenized with `preprocess`, converted to a bag-of-words
    vector with the corpus dictionary and looked up in `index`. Each hit is
    printed as ``score => title (url)``.
    """
    tokens = preprocess(query)
    hits = index[content.dictionary.doc2bow(tokens)]

    for position, score in hits:
        # Map the hit's position in the corpus back to its CSV row.
        row = content.get_original(position)
        print("{:.5f} => {} ({})".format(score, row['title'], row['url']))

In [7]:
# Example query against the bag-of-words index; prints one
# "score => title (url)" line per hit.
search('深度學習')

0.41304 => 英雄集結：深度學習的魔法使們 (https://ithelp.ithome.com.tw/users/20112540/ironman/2064?sc=iThelpR)
0.41304 => 超簡單用Python預測股價 | FinLab 量化實驗室 (https://www.finlab.tw/%E8%B6%85%E7%B0%A1%E5%96%AE-Machine-Learning-%E9%A0%90%E6%B8%AC%E8%82%A1%E5%83%B9/)
0.06483 => [電腦視覺] 如何應用 Cloud AutoML Vision 辨識怪盜基德、工藤新一、服部平次 – 沒一村生活點滴 (https://noootown.wordpress.com/2018/07/25/apply-cloud-automl-vision-distinguish-konan/)
0.01116 => 語言支援  |  Cloud Speech-to-Text API  |  Google Cloud (https://cloud.google.com/speech-to-text/docs/languages?hl=zh-TW)


In [8]:
# Fit an LSI model on the bag-of-words corpus, using the corpus dictionary
# to map token ids back to words.
lsi = gensim.models.LsiModel(
    corpus=content,
    id2word=content.dictionary,
    num_topics=100,
    power_iters=10,
)

In [9]:
# Similarity index over the corpus projected into LSI topic space.
# The first argument is the prefix of the shard files written to disk, not an
# input path; use a prefix of its own so these shards are not named after the
# data file and cannot clash with shards of any other index using that name.
lsi_index = gensim.similarities.Similarity('data_sample.lsi_index',
                                           lsi[content],
                                           num_features=lsi.num_topics,
                                           num_best=15)

In [10]:
def lsi_search(query):
    """Print the bookmarks that best match `query` in LSI topic space.

    The query is tokenized with `preprocess`, converted to a bag-of-words
    vector, projected through the `lsi` model and looked up in `lsi_index`.
    Each hit is printed as ``score => title (url)``.
    """
    tokens = preprocess(query)
    query_topics = lsi[content.dictionary.doc2bow(tokens)]

    for position, score in lsi_index[query_topics]:
        # Map the hit's position in the corpus back to its CSV row.
        row = content.get_original(position)
        print("{:.5f} => {} ({})".format(score, row['title'], row['url']))

In [11]:
# Same example query, this time matched in LSI topic space.
lsi_search('深度學習')

0.99641 => 英雄集結：深度學習的魔法使們 (https://ithelp.ithome.com.tw/users/20112540/ironman/2064?sc=iThelpR)
0.99641 => 超簡單用Python預測股價 | FinLab 量化實驗室 (https://www.finlab.tw/%E8%B6%85%E7%B0%A1%E5%96%AE-Machine-Learning-%E9%A0%90%E6%B8%AC%E8%82%A1%E5%83%B9/)
0.15640 => [電腦視覺] 如何應用 Cloud AutoML Vision 辨識怪盜基德、工藤新一、服部平次 – 沒一村生活點滴 (https://noootown.wordpress.com/2018/07/25/apply-cloud-automl-vision-distinguish-konan/)
0.02692 => 語言支援  |  Cloud Speech-to-Text API  |  Google Cloud (https://cloud.google.com/speech-to-text/docs/languages?hl=zh-TW)
0.00000 => A Game-Engine-Based Learning Environment Framework for Artificial General Intelligence | SpringerLink (https://link.springer.com/chapter/10.1007/978-3-319-46687-3_39)
0.00000 => Whole brain connectomic architecture to develop general artificial intelligence - ScienceDirect (https://www.sciencedirect.com/science/article/pii/S1877050918300498)
0.00000 => ShareCourse 學聯網 (https://www.sharecourse.net/sharecourse/course/content/homepage/1700)
0.00000 => 不到 2 