In [1]:
# Some practice for word2vec
# Get contents from an article and process them into a bag of words (continuous bag-of-words model, CBOW)
# Read sentences based on "，", not " " or "\n"

import scipy
import numpy
import gensim
import re
import jieba
import zhon
import csv

from gensim import corpora
from gensim import models
from timeit import default_timer as timer


# Receive a list containing lines of an article
# Remove unwanted characters
# Return a list containing sentences of an article
def sentence_check(a):
    """Split every line of an article into punctuation-free fragments.

    Each string in ``a`` is cut at the ASCII and full-width punctuation marks
    (and the space character) listed in the pattern below. Empty fragments
    and fragments that consist solely of a newline are discarded.

    Args:
        a: Iterable of strings, one per line of the article.

    Returns:
        A flat list of the remaining fragments in their original order.
    """
    print("run sentence check")
    pattern = r',|\.|/|;|\'|`|\[|\]|<|>|\?|:|"|\{|\}|\~|!|@|#|\$|%|\^|&|\(|\)|-|=|\_|\+|，|。|、|；|‘|’|【|】|·|！| |…|（|）'

    fragments = []
    for line in a:
        # Adjacent delimiters leave '' behind, and a bare '\n' carries no
        # text, so neither is kept.
        fragments.extend(
            piece for piece in re.split(pattern, line) if piece not in ('', '\n')
        )
    return fragments


# Read sentences of an article and process them as a bag of words
# Use jieba module to recognize chinese words
# Need a dictionary of common traditional Chinese words
# Need a dictionary of stop words
# Return a list of lists of words (still keep the sentence information)
def word(a):
    """Segment sentences into words with jieba and drop stop words.

    The jieba dictionary is loaded from ``dict.txt`` and the stop words from
    ``stop.txt`` (read as UTF-8, one entry per line); both paths are relative
    to the current working directory.

    Args:
        a: Iterable of sentence strings.

    Returns:
        A list holding one list of words per input sentence, so sentence
        boundaries are preserved. A sentence made up only of stop words
        yields an empty inner list.

    Raises:
        OSError: If ``stop.txt`` cannot be opened.
    """
    print("Run word: extract words from sentences by JieBa")
    jieba.set_dictionary('dict.txt')

    # The context manager closes the file even if reading fails; a set makes
    # the per-token stop-word check O(1) instead of scanning a list.
    with open('stop.txt', 'r', encoding='utf8') as stops:
        stopword = set(stops.read().split('\n'))

    # Treat a bare newline token as a stop word too.
    stopword.add('\n')

    new_list = [
        [t for t in jieba.cut(sentence) if t not in stopword]
        for sentence in a
    ]

    print("Jieba result: ", new_list)
    return new_list


# Read a list of lists of words
# Merge all lists
# Return a list of words of an article (a bag of word)
def bag_of_words(a):
    """Flatten per-sentence word lists into a single bag of words.

    Args:
        a: List of lists of words.

    Returns:
        One flat list containing every word of every inner list, in order.
    """
    print("Run bag_of_words")
    return [token for sentence in a for token in sentence]


# Select sentences containing interested words
def selector(a, b):
    """Select the sentences that contain a given word.

    Args:
        a: List of sentences, each a list of words.
        b: The word to look for. An empty string disables filtering.

    Returns:
        ``a`` itself (the same object) when ``b`` is the empty string;
        otherwise a new list with every sentence of ``a`` that contains
        ``b``, in original order. Each matching sentence appears once, even
        when the word occurs several times in it.
    """
    print("Run sentence_selector")
    if b == "":
        return a

    # any() stops at the first hit, so a sentence that repeats the word is
    # still selected only once.
    return [sentence for sentence in a if any(token == b for token in sentence)]

    
# Receive a bag of words
# Count frequency of a word in a bag of words
# Return processed corpus
def word_count(a):
    """Return the words that occur more than once in a bag of words.

    Args:
        a: Iterable of words (hashable items).

    Returns:
        A list of the distinct words whose count is greater than one, in the
        order in which the counter yields them.
    """
    print("Run word_count")
    from collections import Counter

    tally = Counter(a)
    # Words that appear only once are dropped.
    return [token for token, occurrences in tally.items() if occurrences > 1]
    
    
def main():
    """Run the interactive pipeline: read titles, tokenize, fit TF-IDF, query.

    Steps:
        1. Read the first column of ``Okinawa_Travel.csv`` and strip the
           ``[遊記]`` tag from each title.
        2. Split the titles into fragments and segment them into words.
        3. Prompt for a word of interest and run ``selector`` on it.
        4. Keep only words that occur more than once across all sentences.
        5. Build a gensim dictionary and bag-of-words corpus, and fit a
           TF-IDF model on it.
        6. Prompt for a query string and print its TF-IDF representation.

    Raises:
        OSError: If the CSV file cannot be opened.
    """
    print("Run main")

    # The context manager closes the file on any exit path. The encoding is
    # given explicitly, matching how stop.txt is read, so the result does not
    # depend on the platform default. Blank CSV rows come back as [] and are
    # skipped instead of failing on row[0].
    with open("Okinawa_Travel.csv", newline='', encoding='utf8') as csvfile:
        title_list = [
            row[0].replace('[遊記]', '')
            for row in csv.reader(csvfile, delimiter=",")
            if row
        ]

    sentence_list = sentence_check(title_list)
    word_list = word(sentence_list)

    # The selection result is not used further below; the call is kept for
    # its prompt and console output.
    selected_words = selector(word_list, input("Enter one interested word: "))

    bag_word = bag_of_words(word_list)  # Pool all words in one list
    # Count frequency of each word, return words appearing more than once
    processed_word_of_bag = word_count(bag_word)
    print("words used for traing: ", processed_word_of_bag)

    # Set membership replaces scanning the whole frequent-word list for
    # every token of every sentence.
    frequent_words = set(processed_word_of_bag)
    processed_corpus = []
    for sentence in word_list:
        kept = [token for token in sentence if token in frequent_words]
        # Sentences left without any frequent word are not added to the corpus.
        if kept:
            processed_corpus.append(kept)

    dictionary = corpora.Dictionary(processed_corpus)
    bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

    print("train models")
    tfidf = models.TfidfModel(bow_corpus)

    test_string = input("Enter your string: ")
    # NOTE(review): the query is split on whitespace rather than segmented
    # with jieba, so unspaced Chinese input becomes a single token.
    a = tfidf[dictionary.doc2bow(test_string.lower().split())]
    print(test_string, "", a)
    
    
    
    
# Script entry point: when the file is executed directly (not imported),
# print a start-up message and run main().
if __name__ == "__main__":
    print("initialize")
    main() 

Building prefix dict from /Users/hsieh/Documents/ptt/dict.txt ...
Loading model from cache /var/folders/54/w85qh3vd4xv5v6qz8w4t70940000gn/T/jieba.ubaa545e936adb4bd7ef290058a1bf422.cache


initialize
Run main
run sentence check
Run word: extract words from sentences by JieBa


Loading model cost 0.746 seconds.
Prefix dict has been built succesfully.


Jieba result:  [['TITLE'], ['親子', '跨年', '沖繩', '樂桃', '機票'], ['OTS', '租車', '費用'], ['沖繩', 'kkday', '租車', '心得'], ['沖繩'], ['豬肉', '蛋', '飯糰', '那霸', '機場', '店', '苦瓜', '天麩', '羅'], ['沖繩', '特色', '伴手禮', '介紹'], ['沖繩', 'Birthday', '嬰幼', '用品', '名護', '店', '親子', '購物'], ['沖繩', '自助', '．', 'Orix', '租車', '分享', '親子', '車型', '推薦'], ['沖繩', '沖繩', '樂遊', '美麗', '海', '套票', '景點', '套票', '分享'], ['沖繩', '景點', '沖繩', '世界', '文化', '王國'], ['玉泉', '洞'], ['沖繩', '美麗', '海', '水族館'], ['海洋', '博', '公園', '海豚秀'], ['沖繩', '親子', '遊'], ['親愛的'], ['首里', '城變', '好玩'], ['沖繩', '親子', '景點', '沖繩', '自然', '動植物', '公園'], ['沖繩', 'LCC', '航廈', '體驗'], ['唯一', '一次', '體驗'], ['沖繩', '名護', '鳳梨', '公園', '親子'], ['雨天', '旅遊', '推薦'], ['沖繩', '春節', '四日', '吃喝', '血拼', '小旅行', '心得', '前言'], ['沖繩', '國際', '通', '購物', '札幌', '藥妝', '2019', '上半', '購物', '清單', '分享'], ['沖繩', '自駕', '自撞', '車禍', '分享'], ['奇遇'], ['沖繩', '浜', '之家', '田芋', '恩怨'], ['歌山'], ['沖繩', '．', 'mont'], ['bell', '購物', '記錄'], ['聖誕', '沖繩', '搭車遊'], ['下集', '南部'], ['沖繩'], ['日航', '櫻花', '貴賓室', '沖繩', '那霸', '國內', '航廈'], ['沖繩', '自由行

Enter one interested word: 
Run sentence_selector
Run bag_of_words
Run word_count
words used for traing:  ['親子', '沖繩', '樂桃', 'OTS', '租車', '心得', '那霸', '機場', '店', '名護', '購物', '自助', '．', '分享', '推薦', '美麗', '海', '套票', '景點', '世界', '文化', '王國', '玉泉', '洞', '水族館', '海洋', '博', '公園', '海豚秀', '遊', '首里', '自然', '動植物', '航廈', '體驗', '旅遊', '四日', '血拼', '小旅行', '國際', '通', '札幌', '自駕', '車禍', '之家', '搭車遊', '南部', '日航', '貴賓室', '自由行', '一個', '人', '旅行', '六天', '五夜', '行程', '浮島', '美食', '四天', '三夜', '爸爸', '出遊', '牛排', '燒肉', '聖誕節', '中北部', 'KKday', '一日', '美國村', '傑克', '再', '地圖', '北部', '巴士', '✈', '2018', '重點', '經驗', 'ORIX', '離島', '伊江島', '浮潛', '單車', '攻略', '美景', '天', '夜', '日本', '麗星', '郵輪', '之旅', '五天', '四夜', '瀨', '長島', '還車', 'ご', '家族', '八天七夜', '總', 'Apple', '魚市場', '搭', '初訪', '訪', '月', '中', '高雄', '虎航', '回', '腳踏車', '遊記', '逛街', '單軌', '電車', '教學', '懶人包', '古宇利島', '蝦', '飯店', '條列式', '沙灘', '小', '記', '青洞', '兒童', '玩', '精簡', '行', '古宇利', '海灘', '21', 'Vlog', '五日', '公主號', '可到', '整理', '篇', '工廠', '大橋', '13', '家庭號', '推', '規劃', '12', '超', '萬座', '毛',