In [1]:
# Some practice for word2vec
# Get contents from an article and process them into a bag of words (continuous bag-of-words model, CBOW)
# Read sentences based on "，", not " " or "\n"

import scipy
import numpy
import gensim
import re
import jieba
import zhon
import csv

from gensim import corpora
from gensim import models
from timeit import default_timer as timer


# Receive a list containing lines of an article
# Remove unwanted characters
# Return a list containing sentences of an article
def sentence_check(a):
    """Split every line of an article into punctuation-free fragments.

    Each string in ``a`` is cut at any of the ASCII or Chinese punctuation
    marks (and the space character) listed in the delimiter pattern.
    Empty fragments and fragments that are exactly a newline are dropped.
    The surviving fragments are returned as one flat list, in input order.
    """
    print("run sentence check")
    delimiters = re.compile(r',|\.|/|;|\'|`|\[|\]|<|>|\?|:|"|\{|\}|\~|!|@|#|\$|%|\^|&|\(|\)|-|=|\_|\+|，|。|、|；|‘|’|【|】|·|！| |…|（|）')

    fragments = []
    for line in a:
        # Splitting leaves '' between adjacent delimiters, and a bare '\n'
        # when a line holds nothing else; neither is a sentence.
        fragments.extend(
            piece for piece in delimiters.split(line) if piece not in ('', '\n')
        )
    return fragments


# Read sentences of an article and process them as a bag of words
# Use jieba module to recognize chinese words
# Need a dictionary of common Traditional Chinese words
# Need a dictionary of stop words
# Return a list of lists of words (still keep the sentence information)
def word(a):
    """Tokenise each sentence with jieba and drop stop words.

    Args:
        a: Sequence of sentence strings.

    Returns:
        A list with one entry per input sentence; each entry is the list of
        tokens produced by ``jieba.cut`` for that sentence, minus any token
        found in the stop-word list. Sentence boundaries are preserved.

    Reads two files from the current working directory: ``dict.txt`` (passed
    to ``jieba.set_dictionary``) and ``stop.txt`` (UTF-8, one stop word per
    line).
    """
    print("Run word, powered by JieBa")
    jieba.set_dictionary('dict.txt')

    # The context manager closes the file even if reading fails.
    with open('stop.txt', 'r', encoding='utf8') as stops:
        # A set gives O(1) membership tests for every token checked below.
        stopwords = set(stops.read().split('\n'))

    # A bare newline is treated as a stop word too.
    stopwords.add('\n')

    return [
        [token for token in jieba.cut(sentence) if token not in stopwords]
        for sentence in a
    ]


# Read a list of lists of words
# Merge all lists
# Return a list of words of an article (a bag of words)
def bag_of_words(a):
    """Flatten a list of token lists into a single list of tokens.

    The tokens keep their original order: all tokens of the first inner
    list, then all tokens of the second, and so on.
    """
    print("Run bag_of_words")
    return [token for sentence in a for token in sentence]


# Select sentences containing the word of interest
def selector(a, b):
    """Return the sentences that contain a given word.

    Args:
        a: List of sentences, each sentence being a list of tokens.
        b: The word to look for. An empty string means "no filter".

    Returns:
        ``a`` itself when ``b`` is the empty string; otherwise a new list
        holding every sentence of ``a`` in which at least one token equals
        ``b``. Each matching sentence appears once, in input order, even
        when the word occurs in it several times.
    """
    print("Run sentence_selector")
    if b == "":
        return a

    # `any` stops at the first hit, so a sentence containing the word more
    # than once is still selected only once.
    return [
        sentence
        for sentence in a
        if any(token == b for token in sentence)
    ]

    
# Receive a bag of words
# Count the frequency of each word in the bag
# Return the words that appear more than once
def word_count(a):
    """Return the distinct words that occur more than once in ``a``.

    Words are counted with ``collections.Counter``; those seen exactly once
    are discarded. The result lists each remaining word once, in the order
    of its first appearance in ``a``.
    """
    print("Run word_count")
    from collections import Counter

    return [token for token, occurrences in Counter(a).items() if occurrences > 1]
    
    
def main():
    """Run the whole pipeline on ``Okinawa_Travel.csv`` and print a TF-IDF query.

    Steps:
      1. Read the first column of every CSV row as a title, stripping the
         ``[遊記]`` tag.
      2. Split titles into fragments, tokenise them, and drop stop words.
      3. Prompt for a word and select the sentences containing it.
      4. Keep only tokens that occur more than once across the corpus and
         discard sentences left empty.
      5. Build a gensim dictionary and bag-of-words corpus, fit a TF-IDF
         model, and print the weights of a fixed test query.
    """
    print("Run main")

    title_list = []
    # Explicit UTF-8 matches how stop.txt is read and avoids depending on the
    # platform default; the context manager closes the file on error.
    with open("Okinawa_Travel.csv", newline='', encoding='utf8') as csvfile:
        for row in csv.reader(csvfile, delimiter=","):
            if not row:
                # csv.reader yields [] for blank lines; row[0] would raise.
                continue
            title_list.append(row[0].replace('[遊記]', ''))

    sentence_list = sentence_check(title_list)
    word_list = word(sentence_list)

    # NOTE(review): the selection result is not used further down; the call is
    # kept so the interactive prompt still appears.
    selected_words = selector(word_list, input("Enter one interested word: "))

    bag_word = bag_of_words(word_list)  # Pool all words in one list
    # Words appearing more than once; a set makes the filter below O(1) per token.
    frequent_words = set(word_count(bag_word))

    processed_corpus = []
    for sentence in word_list:
        kept = [token for token in sentence if token in frequent_words]
        if kept:
            processed_corpus.append(kept)

    dictionary = corpora.Dictionary(processed_corpus)
    bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

    print("train models")
    tfidf = models.TfidfModel(bow_corpus)

    test_string = "購物 樂桃 香草 潛水 OIST 九州 象鼻岩 自駕"
    # NOTE(review): the query is lower-cased but corpus tokens are not, so an
    # upper-case token such as "OIST" presumably never matches — confirm intent.
    a = tfidf[dictionary.doc2bow(test_string.lower().split())]
    print(test_string, "", a)
    
    
    
    
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    print("initialize")
    main() 

Building prefix dict from /Users/hsieh/Documents/ptt/dict.txt ...
Loading model from cache /var/folders/54/w85qh3vd4xv5v6qz8w4t70940000gn/T/jieba.ubaa545e936adb4bd7ef290058a1bf422.cache


initialize
Run main
run sentence check
Run word, powered by JieBa


Loading model cost 0.744 seconds.
Prefix dict has been built succesfully.


Enter one interested word: 
Run sentence_selector
Run bag_of_words
Run word_count
train models
購物 樂桃 香草 潛水 OIST 九州 象鼻岩 自駕  [(0, 0.5708070844632387), (10, 0.45083390741665147), (42, 0.29330600939894064), (169, 0.6204028087464465)]
