# LDA


In [27]:
import scipy.stats as stats
import scipy.special as special
from sklearn.feature_extraction.text import CountVectorizer
import dill

class LDA(object):
    PRINT_EVERYITER = 1
    
    def __init__(self, n_iter, n_topic, max_df=0.3, min_df=0.01, lang='ja'):
        self.n_iter = n_iter
        self.n_topic = n_topic
        self.max_df = max_df
        self.min_df = min_df
        self.topics = [i for i in range(self.n_topic)]
        self.alpha = None
        self.beta = None
        self.lang = lang
        
    def __sampling_theta(self, alpha):
        return stats.dirichlet.rvs(alpha=alpha)[0]
    
    def __sampling_phi(self, beta):
        return stats.dirichlet.rvs(alpha=beta)[0]
    
    def __sampling_z(self, d, v):
        """
        @param d int 文章番号
        @param v int 文章dのi番目の単語の単語辞書での番号
        
        @return k int 単語vのトピック
        """
        theta_d = self.theta[d]
        phi_v = self.phi[:, v]
        sum_weight = np.dot(theta_d, phi_v.reshape(-1, 1))
        weight = theta_d*phi_v / sum_weight
        
        return np.random.choice(self.topics, size=1, p=weight)
    
    def __init_dirichlet_param(self):
        self.alpha = np.array([10 / self.n_topic
                               for _ in range(self.n_topic)])
        self.beta = np.array([100 / self.vocab_size
                              for _ in range(self.vocab_size)])
    
    def __update_dirichlet_param(self, ave_n_d_k):
        """
        update alpha
        
        @param ave_n_d_k ndarray n_d_kのサンプル平均
        """
        alpha_sum = np.sum(self.alpha)
        
        # alphaのupdate
        for k in range(self.n_topic):
            alpha_k = self.alpha[k]
            
            a_k_d = [(special.psi(ave_n_d_k[d, k] + alpha_k)\
                      - special.psi(alpha_k)) * alpha_k
                     for d in range(self.n_docs)]
            
            b_d = [special.psi(np.sum(ave_n_d_k[:,k]) + alpha_sum)\
                   - special.psi(alpha_sum)
                   for _ in range(self.n_docs)]
            
            self.alpha[k] = np.sum(a_k_d) / np.sum(b_d)
    
    def __make_vocab(self, X):
        """
        文書集合から単語辞書を作成し、文書をbag-of-wordsに変換
        
        @param X list 文書集合 要素は、各文章を単語に分割したもの
        
        @return vocab dict 文書内の単語辞書
        @return X_bow list 文書をbag-of-wordsで表現した文書集合
        """
        if self.lang == 'ja':
            # 文書は既に単語の集合に変換されているので、splitする必要がない。
            analyzer = lambda words: words
            self.vect = CountVectorizer(min_df=self.min_df,
                                        max_df=self.max_df,
                                        analyzer=analyzer)
        else:
            self.vect = CountVectorizer(max_features=10000,
                                        max_df=self.max_df)
        X_bow = self.vect.fit_transform(X).toarray()
        return self.vect.vocabulary_, X_bow
    
    def fit(self, X):
        """
        self.vocabを求める関数未実装
        
        @param X list 文書集合
        @return self 学習済みモデル
        """
        # 辞書作成と文書をbag-of-wordsに変換
        self.vocab, X_bow = self.__make_vocab(X)
        print('finished making vocab')
        
        self.n_docs = len(X_bow)
        self.vocab_size = len(self.vocab)
        
        self.__init_dirichlet_param()
        
        # θとφの初期化
        # theta ndarray theta[d,k] 文章dにトピックkが出現する確率を保持
        self.theta = np.array([self.__sampling_theta(alpha=self.alpha)
                               for _ in range(self.n_docs)])
        
        # phi ndarray phi[k,v] トピックkに単語vが含まれている確率を保持
        self.phi = np.array([self.__sampling_phi(beta=self.beta)
                             for _ in range(self.n_topic)])
        
        # n_d_k ndarray n_d_k[d,k] 文章d出現したトピックkの数
        self.n_d_k = np.zeros((self.n_docs, self.n_topic))
        
        # n_k_v ndarray n_k_b[k,v] トピックがkの単語vの個数
        self.n_k_v = np.zeros((self.n_topic, self.vocab_size))
        
        # n_d_kのこれまでのサンプル合計
        sum_n_d_k = np.zeros((self.n_docs, self.n_topic))
        
        print('start iterating')
        for s in range(self.n_iter):
            # 初期化
            self.n_d_k.fill(0.0)
            self.n_k_v.fill(0.0)
            
            for d in range(self.n_docs):
                for v, n_word in enumerate(X_bow[d]):
                    if n_word ==0:
                        continue
                    
                    # 単語d,i=vのトピック
                    k = self.__sampling_z(d, v)
                    
                    # それぞれの個数をプラス1
                    self.n_k_v[k, v] += n_word
                    self.n_d_k[d, k] += n_word
                
                # 更新されたn_d_kから、θ_dをサンプリング
                params = self.n_d_k[d] + self.alpha
                theta_d = self.__sampling_theta(alpha=params)
                # サンプリングされた値で更新
                self.theta[d] = theta_d
            
            # 更新されたn_k_vから、φ_kをサンプリング
            for k in range(self.n_topic):
                params = self.n_k_v[k] + self.beta
                phi_k = self.__sampling_phi(beta=params)
                # サンプリングされた値で更新
                self.phi[k] = phi_k
            
            # n_d_kのサンプル合計数をプラス
            sum_n_d_k += self.n_d_k
            # n_d_kのサンプル平均を計算
            ave_n_d_k = sum_n_d_k / (s+1)
            self.__update_dirichlet_param(ave_n_d_k)
            
            if (s+1) % self.PRINT_EVERYITER == 0:
                print('{} iter finished !'.format(s+1))
        return self
    
    def __predict_pdf(self, test_lda, d, v):
        pdf = 0.0
        
        for k in range(self.n_topic):
            pdf += (test_lda.theta[d,k] * self.phi[k,v])
            
        return pdf
                    
        
    def perplexity(self, test1, test2):
        """
        test1を用いてテスト用パラメータ学習
        test2を用いてperplexityを計算
        
        @param test1 list perplexityの計算に用いるΘを学習するための文書集合
        @param test2 list テストデータの集合
        
        @return perplexity float 
        """
        # perplexityを計算するためにΘ^(test)を学習
        test_lda = LDA(self.n_iter, self.n_topic)
        test_lda.fit(test1)
        
        likelihood = 0.0
        n_words = 0
        for d, doc in enumerate(test2):
            for word in doc:
                # 単語を番号に変化
                v = self.vocab.get(word, False)
                # 単語が辞書になかったら飛ばす
                if not v:
                    continue
                likelihood += np.log(self.__predict_pdf(test_lda, d, v))
                n_words += 1
        return np.exp(-likelihood / n_words)
    
    def print_topn_pertopic(self, n=5):
        index_to_words = {v: k for k, v in self.vocab.items()}
        for k in range(self.n_topic):
            print('-----topic {}-----'.format(k))
            index_phi_k = self.phi[k].argsort()[::-1]
            for print_num, v in enumerate(index_phi_k):
                if print_num >= n:
                    break
                
                print('{}, pdf:{}'.format(index_to_words[v],
                                          self.phi[k, v]))
                
            


def load_train_test():
    """
    @return train list 学習用の文書集合
    @return test list テスト用の文書集合
    """
    read_dir = './data/ldcourpas/'
    train_doc_name = 'train_doclist.list'
    test_doc_name = 'test_doclist.list'
    
    with open(read_dir + train_doc_name, mode='rb') as f:
        train = pickle.load(f)
    with open(read_dir + test_doc_name, mode='rb') as f:
        test = pickle.load(f)
    
    return train, test


def split_testdata(test, ratio):
    """
    @param ratio float testデータを ratio : (1 - ratio) に分割する
    
    @return test1 list perplexity計算の際にΘを求めるデータ
    @return test2 list perplexity計算用のデータ
    """
    test1 = []
    test2 = []
    
    for doc in test:
        n_words = len(doc)
        # 小数点以下、四捨五入
        index = int(round(n_words*ratio, 0))
        test1.append(doc[:index])
        test2.append(doc[index:])
    
    return test1, test2
        

def main():
    train, test = load_train_test()
    test1, test2 = split_testdata(test, ratio=.5)
    
    lda = LDA(n_iter=100, n_topic=10)
    lda.fit(train)
    with open('./lda.model', mode='wb') as f:
        dill.dump(lda, f)
    lda.perplexity(test1, test2)
    lda.print_topn_pertopic(n=10)
    
    
main()
        

finished making vocab
start iterating
1 iter finished !
2 iter finished !
3 iter finished !
4 iter finished !
5 iter finished !
6 iter finished !
7 iter finished !
8 iter finished !
9 iter finished !
10 iter finished !
11 iter finished !
12 iter finished !
13 iter finished !
14 iter finished !
15 iter finished !
16 iter finished !
17 iter finished !
18 iter finished !
19 iter finished !
20 iter finished !
21 iter finished !
22 iter finished !
23 iter finished !
24 iter finished !
25 iter finished !
26 iter finished !
27 iter finished !
28 iter finished !
29 iter finished !
30 iter finished !
31 iter finished !
32 iter finished !
33 iter finished !
34 iter finished !
35 iter finished !
36 iter finished !
37 iter finished !
38 iter finished !
39 iter finished !
40 iter finished !
41 iter finished !
42 iter finished !
43 iter finished !
44 iter finished !
45 iter finished !
46 iter finished !
47 iter finished !
48 iter finished !
49 iter finished !
50 iter finished !
51 iter finished !
52

## gensimのLDA


In [26]:
import gensim
from gensim import corpora, models
import pickle

model_dir = './model/gensim/'
with open('./data/ldcourpas/train_doclist.list', mode='rb') as f:
    docs = pickle.load(f)
dictionary = corpora.Dictionary(docs)
dictionary.filter_extremes(no_below=20, no_above=0.3)
#dictionary.save_as_text(model_dir + 'dict.txt')

corpus = [dictionary.doc2bow(doc) for doc in docs]
#corpora.MmCorpus.serialize(model_dir + 'cop.mm', corpus)

n_topics = 20
lda = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                      num_topics=n_topics,
                                      id2word=dictionary)

for i in range(n_topics):
    print('--------topic {}--------'.format(i))
    print(lda.print_topic(i))


--------topic 0--------
0.014*"映画" + 0.010*"作品" + 0.009*"公開" + 0.009*"月日" + 0.007*"ライブ" + 0.006*"今夏" + 0.006*"女性" + 0.006*"たち" + 0.005*"回" + 0.005*"歳"
--------topic 1--------
0.035*"充電" + 0.027*"更新" + 0.017*"事象" + 0.010*"端子" + 0.007*"S" + 0.007*"対応" + 0.006*"ACアダプタ" + 0.006*"円" + 0.006*"ケーブル" + 0.006*"台"
--------topic 2--------
0.011*"D" + 0.007*"撮影" + 0.006*"日本" + 0.006*"つとむ" + 0.006*"機能" + 0.005*"映画" + 0.005*"時間" + 0.004*"今夏" + 0.004*"アプリ" + 0.004*"ソフトウェア"
--------topic 3--------
0.019*"smartphone" + 0.015*"対応" + 0.014*"D" + 0.014*"発表" + 0.013*"MAX" + 0.013*"S" + 0.013*"NTTドコモ" + 0.012*"円" + 0.010*"利用" + 0.010*"向け"
--------topic 4--------
0.034*"独女" + 0.014*"歳" + 0.013*"アプリ" + 0.012*"友達" + 0.011*"さん" + 0.009*"とき" + 0.008*"彼" + 0.007*"男" + 0.006*"女性" + 0.006*"紹介"
--------topic 5--------
0.025*"更新" + 0.016*"搭載" + 0.014*"S" + 0.013*"smartphone" + 0.012*"対応" + 0.012*"画面" + 0.010*"表示" + 0.010*"機能" + 0.008*"利用" + 0.008*"ソフトウェア"
--------topic 6--------
0.029*"更新" + 0.017*"ソフトウェア" + 0.013*"機

## livedoorニュースコーパスをLDA用に変換

データセットは[ここ](http://www.rondhuit.com/download.html#ldcc)

### stopwordのリスト作成&読み込み


In [20]:
import pickle

IN_DIR = './data/'
TXT_FILE_NAME = 'Japanese.txt'
OUT_DIR = './model/stopword'
OUT_NAME = 'stopwords.list'

def make_stop_wordslist():
    stop_words = []
    with open(IN_DIR + TXT_FILE_NAME, 'r') as f:
        for line in f:
            line = line.strip()
            if line != '':
                stop_words.append(line)
    print(stop_words)
    with open(OUT_DIR + OUT_NAME, 'wb') as f:
        pickle.dump(stop_words, f)
    
def get_stop_words():
    with open(OUT_DIR + OUT_NAME, 'rb') as f:
        stop_words = pickle.load(f)
    return stop_words

make_stop_wordslist()

['それ', 'さん', 'くん', 'そう', 'うん', 'あそこ', 'あたり', 'あちら', 'あっち', 'あと', 'あな', 'あなた', 'あれ', 'いくつ', 'いつ', 'いま', 'いや', 'いろいろ', 'うち', 'おおまか', 'おまえ', 'おれ', 'がい', 'かく', 'かたち', 'かやの', 'から', 'がら', 'きた', 'くせ', 'ここ', 'こっち', 'こと', 'ごと', 'こちら', 'ごっちゃ', 'これ', 'これら', 'ごろ', 'さまざま', 'さらい', 'さん', 'しかた', 'しよう', 'すか', 'ずつ', 'すね', 'すべて', 'ぜんぶ', 'そう', 'そこ', 'そちら', 'そっち', 'そで', 'それ', 'それぞれ', 'それなり', 'たくさん', 'たち', 'たび', 'ため', 'だめ', 'ちゃ', 'ちゃん', 'てん', 'とおり', 'とき', 'どこ', 'どこか', 'ところ', 'どちら', 'どっか', 'どっち', 'どれ', 'なか', 'なかば', 'なに', 'など', 'なん', 'はじめ', 'はず', 'はるか', 'ひと', 'ひとつ', 'ふく', 'ぶり', 'べつ', 'へん', 'ぺん', 'ほう', 'ほか', 'まさ', 'まし', 'まとも', 'まま', 'みたい', 'みつ', 'みなさん', 'みんな', 'もと', 'もの', 'もん', 'やつ', 'よう', 'よそ', 'わけ', 'わたし', 'ハイ', '上', '中', '下', '字', '年', '月', '日', '時', '分', '秒', '週', '火', '水', '木', '金', '土', '国', '都', '道', '府', '県', '市', '区', '町', '村', '各', '第', '方', '何', '的', '度', '文', '者', '性', '体', '人', '他', '今', '部', '課', '係', '外', '類', '達', '気', '室', '口', '誰', '用', '界', '会', '首', '男', '女', '別', '話', '私', '屋', '店', '家', '

### 文書を単語に分割する分割器


In [13]:
import MeCab

class Extractor(object):
    INDEX_POS = 0
    INDEX_BASE_FORM = 6
    TARGET_POS = ["名詞", " 動詞",  "形容詞", "感動詞"]
    def __init__(self):
        neologd_path = '/usr/local/lib/mecab/dic/mecab-ipadic-neologd'
        self.tagger = MeCab.Tagger('-o chasen -d ' + neologd_path)
        self.stop_words = get_stop_words()
        
    def extract_words(self, sentence):
        """
        日本語の文書を単語集合に変換（その際、単語は原型に直される）。
        
        @param text str 文書
        @return words list 文書からターゲットの品詞の単語のみを抜き出したもの
        """
        
        if not sentence:
            return []
        
        words = []
        
        node = self.tagger.parseToNode(sentence)
        while node:
            features = node.feature.split(',')
            # 品詞がターゲットの品詞だった場合に処理
            if features[self.INDEX_POS] in self.TARGET_POS:
                # 原型がなかったら、表層系を保持
                if features[self.INDEX_BASE_FORM] == '*':
                    word = node.surface
                else:
                    word = features[self.INDEX_BASE_FORM].replace('。', '')
                # 数字のみかどうか
                only_digit = re.match(r'^[0-9]+$', word)
                # ひらがな１文字のみかどうか
                len_one = re.match(r'^[あ-ん]$', word)
                if word not in self.stop_words and not only_digit and not len_one:
                    words.append(word)
            
            node = node.next

        return words


In [25]:
import os
import re
import pickle
import mojimoji
import codecs

class CourpasReader(object):
    ROOT_PATH = './data/text/'
    DIR_NAMES = ['dokujo-tsushin',
                 'kaden-channel',
                 'movie-enter']
    
    DIR_NAMES = ['movie-enter',
                 'it-life-hack',
                 'kaden-channel',
                 'topic-news',
                 'livedoor-homme',
                 'peachy',
                 'sports-watch',
                 'dokujo-tsushin',
                 'smax']
    
    REMOVE_FILENAMES = 'LICENSE.txt'
    
    OUT_DIR = './data/ldcourpas'
    NAME_TRAIN_DOC = 'train_doclist.list'
    NAME_TRAIN_TITLE = 'train_titlelist.list'
    NAME_TEST_DOC = 'test_doclist.list'
    NAME_TEST_TITLE = 'test_titlelist.list'
    
    TRAIN_SIZE_PER_GENRE = 300
    TEST_SIZE_PER_GENRE = 200
    
    def __init__(self):
        self.extractor = Extractor()
    
    def __digit_zen_to_han(self, text):
        """
        テキスト内の全角数字を半角数字に変換
        
        @param text str
        @return text_transformed str text内の全角数字を半角数字に変換したもの
        """
        text_transfomed = text
        match = re.findall(r'[０-９]+', text_transfomed)
        for zen in match:
            han = mojimoji.zen_to_han(zen)
            text_transfomed = text_transfomed.replace(zen, han, 1)
        return text_transfomed
    
    def __create_doc(self, path):
        """
        @param path str 読み込む記事のパス
        
        @return doc list 記事を単語単位に分割したもの。
        """
        
        doc = []
        
        with codecs.open(path, 'r', 'utf-8') as f:
            for i, line in enumerate(f):
                # 最初の2行はurlと時刻なのでカット
                if i > 1:
                    """
                    # 改行文字を「。」に置き換え
                    line = re.sub(r'[\n]', '。', line)
                    # 「。」が2個以上連続してたら１つにする
                    line = re.sub(r'[。]+', '。', line)
                    # 全角特殊文字
                    line = re.sub(r'[！？”“％＆＄＃（）［］]', '', line)
                    """
                    line = line.encode('utf-8').decode('utf-8')
                    line = line.strip()
                    line = self.__digit_zen_to_han(line)
                    line = re.sub(r'[\s!"#$%&()=^~{}\[\]+*;:@]', '', line)
                    if line != '':
                        words = self.extractor.extract_words(line)
                        doc.extend(words)
        return doc
    
    def __print_doc(self, path):
        doc = ''
        
        with open(path, 'r') as f:
            for i, line in enumerate(f):
                print(line)
                
                
    def __save_obj(self, path, obj):
        with open(path, mode='wb') as f:
            pickle.dump(obj, f)
        print('model saved')
    
    def extract_doclist(self):
        """
        各ディレクトリから、文書名と文書を抜き出し学習データとテストデータを作成
        """
        train_doclist = []
        train_titlelist = []
        test_doclist = []
        test_titlelist = []
        
        for dir_name in self.DIR_NAMES:
            dir_path = self.ROOT_PATH + dir_name
            titles = os.listdir(dir_path)
            titles.remove(self.REMOVE_FILENAMES)
            print(len(titles))
            end = self.TRAIN_SIZE_PER_GENRE + self.TEST_SIZE_PER_GENRE
            for i, title in enumerate(sorted(titles[0:end])):
                doc = self.__create_doc(dir_path + '/' + title)
                
                if i == 10:
                    print('------------------{}------------------'\
                          .format(title))
                    print(doc)
                    
                
                if i < self.TRAIN_SIZE_PER_GENRE:
                    train_doclist.append(doc)
                    train_titlelist.append(title)
                else:
                    test_doclist.append(doc)
                    test_titlelist.append(title)
        
        self.__save_obj(path=self.OUT_DIR + self.NAME_TRAIN_DOC,
                        obj=train_doclist)
        
        self.__save_obj(path=self.OUT_DIR + self.NAME_TRAIN_TITLE,
                        obj=train_titlelist)
        
        self.__save_obj(path=self.OUT_DIR + self.NAME_TEST_DOC,
                        obj=test_doclist)
        
        self.__save_obj(path=self.OUT_DIR + self.NAME_TEST_TITLE,
                        obj=test_titlelist)
        print('--------FINISH!!!--------')

        
def main():
    reader = CourpasReader()
    reader.extract_doclist()
    

main()
    

870
------------------movie-enter-5847102.txt------------------
['照英', 'カウボーイ&エイリアン', '宣伝', 'ナビゲーター', '就任', '涙', '感動', '10月22日', '公開', '映画', 'カウボーイ&エイリアン', '宣伝', 'ナビゲーター', 'タレント', '照英', '就任', '本作', '豪華', 'スタッフ', 'キャスト', 'SF', 'アクション', '大作', '物語', '舞台', '1873年', 'アリゾナ', '西部', '未知', '敵', '人々', '敵', '誰か', '記憶', '1人', 'カウボーイ', '巨大', '敵', '夜空', '腕輪', '青い', '閃光', '一体', 'カウボーイ', '何者', '巨大', '敵', '目的', '本作', 'アメリカ', '伝説', 'ビジュアルブック', 'COWBOY', 'ALIENS', '魅了', 'スティーヴン・スピルバーグ', '映画化', '実現', 'ロン・ハワード', 'アイアンマン', 'ジョン・ファブロー', '集結', '驚愕', '映像', '世界', '再現', 'キャスト', 'ダニエル・クレイグ', 'ハリソン・フォード', '豪華', '顔ぶれ', '勢揃い', '照英', '本作', '宣伝', 'ナビゲーター', '就任', '決定', 'インディ・ジョーンズ', 'E.T.', 'スピルバーグ', '映画', '僕', '涙', '興奮', '熱意', '最高', '本作', '魅力', '熱い', '宣伝', 'ナビゲーター', 'コメント', '今後', 'ない', 'smartphone', 'AR', '拡張現実', '活用', '本作', '魅力', '熱い', '予定', '現在', '一部', 'インターネット上', '照英', '画像', 'フレーズ', '絶大', '人気', '照英', '本作', 'ナビゲーター', '話題', '映画', 'カウボーイ&エイリアン', '10月22日', '丸の内ピカデリー', '全国ロードショー', 'カウボーイ&エイリアン', '公式サイト', 'カウボーイ&エイリアン', 

## 映画レビューのデータセット

IMDbWebサイトの映画レビューデータセット([ここ](http://ai.stanford.edu/~amaas/data/sentiment/)から入手できる)を利用

In [9]:
from sklearn.datasets import load_files
import pickle

reviews_train = load_files('data/aclImdb/train/')
text_train, y_train = reviews_train.data, reviews_train.target
text_train = [doc.replace(b'<br />', b'') for doc in text_train]

reviews_test = load_files('data/aclImdb/test/')
text_test, y_test = reviews_test.data, reviews_test.target
text_test = [doc.replace(b'<br />', b'') for doc in text_test]

imdb_dir = './data/imdb/'
with open(imdb_dir + 'text_test.list', mode='wb') as f:
    pickle.dump(text_test, f)
with open(imdb_dir + 'y_test.list', mode='wb') as f:
    pickle.dump(y_test, f)
with open(imdb_dir + 'text_train', mode='wb') as f:
    pickle.dump(text_train, f)
with open(imdb_dir + 'y_train', mode='wb') as f:
    pickle.dump(y_train, f)

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickle


imdb_dir = './data/imdb/'
with open(imdb_dir + 'text_test.list', mode='rb') as f:
    text_test = pickle.load(f)
with open(imdb_dir + 'y_test.list', mode='rb') as f:
    y_test = pickle.load(f)
with open(imdb_dir + 'text_train', mode='rb') as f:
    text_train = pickle.load(f)
with open(imdb_dir + 'y_train', mode='rb') as f:
    y_train = pickle.load(f)
    

vect = CountVectorizer(max_features=10000, max_df=.15)
X = vect.fit_transform(text_train[0:2000])

lda_sklearn = LatentDirichletAllocation(n_components=10, learning_method='batch',
                                max_iter=25, random_state=0)
document_topics = lda_sklearn.fit_transform(X)


In [44]:
sorting = np.argsort(lda_sklearn.components_, axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names())
mglearn.tools.print_topics(topics=range(10), feature_names=feature_names,
                          sorting=sorting, topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
scenes        performance   show          book          play          
worst         new           star          nothing       through       
lot           fact          episode       didn          big           
another       while         luke          give          must          
look          actor         family        enough        cast          
video         old           original      ve            school        
minutes       find          own           original      woman         
although      role          part          read          world         
pretty        always        scenes        work          dominick      
same          young         look          through       awful         


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
matc

In [34]:
lda = LDA(n_iter=250, n_topic=10, max_df=.15, lang='en')
lda.fit(text_train[0:2000])
lda.print_topn_pertopic(n=10)

finished making vocab
start iterating
1 iter finished !
2 iter finished !
3 iter finished !
4 iter finished !
5 iter finished !
6 iter finished !
7 iter finished !
8 iter finished !
9 iter finished !
10 iter finished !
11 iter finished !
12 iter finished !
13 iter finished !
14 iter finished !
15 iter finished !
16 iter finished !
17 iter finished !
18 iter finished !
19 iter finished !
20 iter finished !
21 iter finished !
22 iter finished !
23 iter finished !
24 iter finished !
25 iter finished !
26 iter finished !
27 iter finished !
28 iter finished !
29 iter finished !
30 iter finished !
31 iter finished !
32 iter finished !
33 iter finished !
34 iter finished !
35 iter finished !
36 iter finished !
37 iter finished !
38 iter finished !
39 iter finished !
40 iter finished !
41 iter finished !
42 iter finished !
43 iter finished !
44 iter finished !
45 iter finished !
46 iter finished !
47 iter finished !
48 iter finished !
49 iter finished !
50 iter finished !
51 iter finished !
52

In [7]:
import re
import mojimoji

text = 'Forever２１、来店者数４００万人突破のファストファッションチェーン'\
        'Forever２１本社米国ロサンゼルス２日日本直営１号店原宿店２０１０年1月３０日'\
        '昨年4月２９日延べ来店客数４００万人突破'
match = re.findall(r'[０-９]+', text)
print(match)
for zen in match:
    han = mojimoji.zen_to_han(zen)
    text = text.replace(zen, han, 1)

print(text)

['２１', '４００', '２１', '２', '１', '２０１０', '３０', '２９', '４００']
Forever21、来店者数400万人突破のファストファッションチェーンForever21本社米国ロサンゼルス2日日本直営1号店原宿店2010年1月30日昨年4月29日延べ来店客数400万人突破


In [11]:
!wget http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt

--2017-11-17 12:46:11--  http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt
Resolving svn.sourceforge.jp... 202.221.179.25
Connecting to svn.sourceforge.jp|202.221.179.25|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2202 (2.2K) [text/plain]
Saving to: ‘Japanese.txt’


2017-11-17 12:46:11 (131 MB/s) - ‘Japanese.txt’ saved [2202/2202]



LDA.ipynb            [34mdata[m[m                 [34mmodel[m[m
[34mcache[m[m                ldcc-20140209.tar.gz sampling_test.ipynb
