## Tensorflowで"Semi-supervised Clustering for Short Text via Deep Representation Learning"の実装¶
http://aclweb.org/anthology/K16-1004

In [186]:
import tensorflow as tf
import numpy as np
import sys
import random as rd
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm
from gensim.models.word2vec import Word2Vec
import MeCab
import subprocess
import itertools
import string
import sqlite3
from keras.preprocessing import sequence
# Shared NumPy random state with a fixed seed; Dense and LinearDense draw their
# initial weights from it.
rng = np.random.RandomState(1234)

# 1.データの準備

In [11]:
#
# Configure the path of each input file here.
#

# Training data (opened with sqlite3 by cur())
dbpath = "../input/category_walkerplus.db"

# word2vec model (loaded with gensim's Word2Vec.load)
model_path="../input/word2vec.gensim.model"

# Dictionary directory passed to MeCab via the -d option
dic_path="/usr/local/lib/mecab/dic/mecab-ipadic-neologd"

In [12]:
# Word embedding model; getVector() looks tokens up in model.wv.
model = Word2Vec.load(model_path)
# MeCab tagger producing ChaSen-style output (-Ochasen) with the dictionary at dic_path.
tagger = MeCab.Tagger("-Ochasen -d {0}".format(dic_path))

In [13]:
def cur(sourcedbname):
    """Return a cursor positioned on every row of the ``events`` table.

    Args:
        sourcedbname: path of the SQLite database file to open.
    Returns:
        A ``sqlite3.Cursor`` on which ``select * from events`` has already
        been executed; iterate over it (or call ``fetchall``) to read rows.

    The connection is not closed here; the returned cursor keeps it alive.
    """
    connection = sqlite3.connect(sourcedbname)
    cursor = connection.cursor()
    cursor.execute("select * from events")
    return cursor

In [176]:
# Read every event row and split it into a category label and its text.
Cur = cur(dbpath)

labels = []
texts = []
for row in Cur:
    # Column 0 is a bracketed, comma-separated category string; strip the
    # brackets and keep the second entry with spaces removed.
    category_fields = row[0].replace("[", "").replace("]", "").split(",")
    labels.append(category_fields[1].replace(" ", ""))  # Big Category
    # Column 1 is the text that will be tokenized and embedded.
    texts.append(row[1])

## 分散表現の獲得
Data：入力の文章を分かち書きしw2vで埋め込み後padding shape=(データ数,maxlen,埋め込み次元)<br/>
Labels:正解ラベル

#### 分かち書き

In [17]:
def _tokenize(text):
    sentence = []
    node = tagger.parse(text)
    #print node
    node = node.split("\n")
    for i in range(len(node)):
        feature = node[i].split("\t")
        if feature[0] == "EOS":
            break
        hinshi = feature[3].split("-")[0]
        if "名詞" in hinshi:
            #sentence.append(feature[2].decode('utf-8'))
            sentence.append(feature[2])
        elif "形容詞" in hinshi:
            #sentence.append(feature[2].decode('utf-8'))
            sentence.append(feature[2])
        elif "動詞" in hinshi:
            #sentence.append(feature[2].decode('utf-8'))
            sentence.append(feature[2])
        elif "形容動詞" in hinshi:
            #sentence.append(feature[2].decode('utf-8'))
            sentence.append(feature[2])
        elif "連体詞" in hinshi:
            #sentence.append(feature[2].decode('utf-8'))
            sentence.append(feature[2])           
        elif "助詞" in hinshi:
            #sentence.append(feature[2].decode('utf-8'))
            sentence.append(feature[2])
            
    return sentence

### 分散表現の獲得

In [18]:
def getVector(text):
    """Embed ``text`` as a matrix with one word2vec row per known token.

    Tokens come from ``_tokenize``; tokens missing from ``model.wv`` are
    skipped.

    Args:
        text: sentence to embed.
    Returns:
        numpy.ndarray of shape (n_tokens, embedding_dim). A sentence with a
        single known token yields shape (1, embedding_dim). When no token is
        known, an empty array of shape (0,) is returned.
    """
    # Collect rows in a list and stack once. Comparing an ndarray against []
    # is ambiguous, so the accumulator must not change type midway.
    vectors = [model.wv[t] for t in _tokenize(text) if t in model.wv]
    if not vectors:
        return np.array([])
    return np.vstack(vectors)

In [33]:
# Embed every text. The per-text matrices have different numbers of rows, so
# they are kept in a plain list: building an ndarray from ragged rows raises on
# recent NumPy releases, and pad_sequences accepts a list of sequences.
Data = [getVector(text) for text in texts]



In [45]:
# Pad at the end ("post") so every sample has the same number of rows; no maxlen
# is given, so the target length is presumably the longest sequence -- confirm
# against the Keras version in use.
Data = sequence.pad_sequences(Data, padding="post", truncating="post",dtype="float32")  #padding  

In [60]:
# label (name) -> Label (integer id); ids follow the order returned by np.unique.
label_names = np.unique(labels)
label2Label = dict(zip(label_names, range(len(label_names))))
# Integer id of every sample, aligned with `labels`.
Labels = np.array([label2Label[name] for name in labels])

# 2.教師データの選択

In [180]:
def load_data(Data,Labels,training_percent=0.9, supervise_percent=0.1, seed=None):
    """Split the data per class into train/test and pick the supervised subset.

    Args:
        Data: padded embedded sentences, indexed as
            (n_samples, maxlen, embedding_dim); a trailing channel axis of
            size 1 is appended to the returned X arrays.
        Labels: class id of every sample, shape (n_samples,). Any set of ids
            is accepted; they need not be 0..K-1.
        training_percent: fraction of each class placed in the training set.
        supervise_percent: fraction of each class's *training* samples whose
            label is revealed to the model.
        seed: optional seed for a private ``random.Random``. With the default
            ``None`` the module-level ``random`` state is used, as before.
    Returns:
        train_X: training inputs, shape (n_train, maxlen, embedding_dim, 1)
        train_y: training labels
        test_X: test inputs, shape (n_test, maxlen, embedding_dim, 1)
        test_y: test labels
        supervised: list of indices into train_X/train_y that are supervised
    """
    sampler = rd if seed is None else rd.Random(seed)
    Labels = np.asarray(Labels)

    # Iterate over the label values actually present instead of assuming they
    # are exactly 0..K-1; np.unique returns them sorted, so the sampling order
    # is unchanged for contiguous ids.
    class_ids = np.unique(Labels)

    # Stratified split: sample the same fraction from every class.
    train_index = []
    for class_id in class_ids:
        members = np.where(Labels == class_id)[0]
        k = int(len(members) * training_percent)
        train_index.extend(sampler.sample(list(members), k))
    train_index = np.array(train_index, dtype=int)

    # Append the channel axis expected by the text CNN up front.
    train_X = Data[train_index][:,:,:,np.newaxis]
    train_y = Labels[train_index]
    test_X = np.delete(Data,train_index,0)[:,:,:,np.newaxis]
    test_y = np.delete(Labels,train_index,0)

    # Per class, choose which training samples keep their label.
    supervised = []
    for class_id in class_ids:
        members = np.where(train_y == class_id)[0]
        k = int(len(members) * supervise_percent)
        supervised.extend(sampler.sample(list(members), k))

    return train_X,train_y,test_X,test_y,supervised

In [175]:
# Split with load_data's defaults (training_percent=0.9, supervise_percent=0.1).
train_X,train_y,test_X,test_y, supervised = load_data(Data,Labels)

# 3.CNN for Text Classificationの実装

### HyperParams

In [238]:
# Convolution window widths
filter_sizes = [3,5,7]
# Dimensionality of the word embeddings
vector_length = train_X.shape[2]
# Maximum sequence length
sequence_length = train_X.shape[1]
# Number of filters per window width
num_filters = 16
# Hidden layer size
hid_dim=100
# Output dimensionality
output_dim=2
# Number of clusters (one per distinct label in train_y)
n_cluster=len(np.unique(train_y))


In [239]:
class Conv:
    """Multi-width convolution + max-over-time pooling block of a text CNN.

    Args:
        sequence_length: number of rows (tokens) of every input sample.
        embedding_size: number of columns (embedding dimensions) per token.
        filter_sizes: iterable of window heights, one conv branch per entry.
        num_filters: number of filters in every branch.
    """
    def __init__(self, sequence_length,embedding_size,filter_sizes, num_filters):
        self.sequence_length=sequence_length
        self.embedding_size=embedding_size
        self.filter_sizes=filter_sizes
        self.num_filters=num_filters

    def f_prop(self,x):
        """Apply every conv branch to ``x`` and return the concatenated features.

        New weight variables are created on each call, so call this once when
        building the graph.

        Args:
            x: input tensor; the filters are built for input of shape
               (batch, sequence_length, embedding_size, 1).
        Returns:
            Tensor reshaped to (-1, num_filters * len(filter_sizes)).
        """
        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for filter_size in self.filter_sizes:
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution layer: the filter spans the full embedding width.
                filter_shape = [filter_size, self.embedding_size, 1, self.num_filters]

                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[self.num_filters]), name="b")
                conv = tf.nn.conv2d(
                    x,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")

                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Max-pool over every valid window position of this branch.
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, self.sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features. The width must come from this
        # instance's own filter count, not from a module-level variable.
        num_filters_total = self.num_filters * len(self.filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
        return self.h_pool_flat

class Dense:
    """Fully connected layer ``function(x @ W + b)``.

    Args:
        in_dim: number of input features.
        out_dim: number of output features.
        function: activation applied to the affine output (identity by default).
    """
    def __init__(self, in_dim, out_dim, function=lambda x: x):
        # Xavier (Glorot) uniform initialisation: U(-bound, +bound),
        # drawn from the module-level `rng`.
        bound = np.sqrt(6/(in_dim + out_dim))
        initial_weights = rng.uniform(low=-bound, high=bound, size=(in_dim, out_dim))
        self.W = tf.Variable(initial_weights.astype('float32'), name='W')
        # Bias starts at zero.
        self.b = tf.Variable(np.zeros([out_dim]).astype('float32'))
        self.function = function

    def f_prop(self, x):
        """Return the activated affine transform of ``x``."""
        pre_activation = tf.matmul(x, self.W) + self.b
        return self.function(pre_activation)
        
class LinearDense:
    """Fully connected layer without activation: ``x @ W + b``.

    Args:
        in_dim: number of input features.
        out_dim: number of output features.
    """
    def __init__(self, in_dim, out_dim):
        # Xavier (Glorot) uniform initialisation: U(-bound, +bound),
        # drawn from the module-level `rng`.
        bound = np.sqrt(6/(in_dim + out_dim))
        initial_weights = rng.uniform(low=-bound, high=bound, size=(in_dim, out_dim))
        self.W = tf.Variable(initial_weights.astype('float32'), name='W')
        # Bias starts at zero.
        self.b = tf.Variable(np.zeros([out_dim]).astype('float32'))

    def f_prop(self, x):
        """Return the affine transform of ``x``."""
        projected = tf.matmul(x, self.W)
        return projected + self.b

### グラフの構築

In [240]:
# Network input: a batch of shape (batch, sequence_length, vector_length, 1).
input_x = tf.placeholder(tf.float32, [None, sequence_length,vector_length,1], name="input_x")
# NOTE(review): input_t is not referenced anywhere else in the visible code, and
# its graph name is "input_y" -- confirm whether it is still needed.
input_t = tf.placeholder(tf.float32, [None, output_dim], name="input_y")

# Layer stack: multi-width conv + max-pool -> tanh hidden layer -> linear output.
layers = [
    Conv(sequence_length=sequence_length,
         embedding_size=vector_length,
         filter_sizes=filter_sizes,
         num_filters=num_filters),
    Dense(num_filters * len(filter_sizes),hid_dim , tf.nn.tanh),
    LinearDense(hid_dim, output_dim)
]

def f_props(layers, x):
    """Feed ``x`` through ``layers`` in order and return the final output.

    Args:
        layers: iterable of objects exposing ``f_prop(x)``.
        x: input handed to the first layer.
    Returns:
        Output of the last layer, or ``x`` itself when ``layers`` is empty.
    """
    output = x
    for layer in layers:
        output = layer.f_prop(output)
    return output

# Network output for input_x: the layers applied in list order.
pred_y = f_props(layers, input_x)


### 誤差関数の設計
<img src='../img/jsemi.png'>

In [241]:
#目的関数の定義
def _cost(pred_y, centers, neighbor_index, sup_index, mask ,alpha=0.01,l=0):
    """
    Args:
        pred_y: text-cnn の出力
        centers:各ラベルごとの重心 shape=(cluster_num, output_dim)
        neighbor_index:train_Xがどの重心に最も近いか shape=(data_num,)
        sup_index:教師データとして用いるtrain_Xのindex shape=(sup_num,)
        mask:教師データとして用いるindexに1、それ以外に0が入ったmask
    """
    
    term1= tf.reduce_sum(tf.square(pred_y - tf.gather(centers, neighbor_index)))    
    term1_1 = alpha*tf.cast(term1, tf.float32)
    
    
    term2 = tf.reduce_sum(mask * tf.square(pred_y - tf.gather(centers, sup_index) ))
    term2_1 = (1-alpha)*tf.cast(term2, tf.float32)

    cost = tf.add(term1_1, term2_1)
    
    for i in range(n_cluster):
        i_index = i * tf.ones_like(sup_index, dtype='int32')
        
        x1 = tf.reduce_sum(tf.square(pred_y - tf.gather(centers, sup_index)),1)#正解の重心との距離
        x2 = tf.reduce_sum(tf.square(pred_y - tf.gather(centers, i_index)),1)     #i番目の重心との距離
                
        term2_2 = l+x1-x2
        
        condition = tf.greater(term2_2, 0)
        term2_3 = tf.reduce_sum(mask * tf.where(condition, term2_2, tf.zeros_like(term2_2)))
        term2_3 = tf.cast(term2_3, tf.float32)

        cost=tf.add(cost, (1-alpha)*term2_3)
        
    return cost

In [242]:
#  centers:各ラベルごとの重心 shape=(cluster_num, output_dim) 
centers= tf.placeholder(tf.float32, [None, output_dim], name="centers")

#  neighbor_index:train_Xがどの重心に最も近いか shape=(data_num,)
neighbor_index= tf.placeholder(tf.int32, [None], name="neighbor_index")

#  sup_index:教師データとして用いるtrain_Xのindex shape=(sup_num,)
sup_index=tf.placeholder(tf.int32, [None], name="supervised_index")

#  mask:教師データとして用いるindexに1、それ以外に0が入ったmask
mask=tf.placeholder(tf.float32, [None,1], name="mask")

#自作の誤差関数
cost = _cost(pred_y, centers, neighbor_index, sup_index, mask)

# 最小化にはAdamを用いる
train = tf.train.AdamOptimizer().minimize(cost)

# 4.Iteration
<img src='../img/iter.png'>

In [247]:
def assign_to_nearest(samples, centroids):
    """Assign samples to centroids and match supervised labels to cluster ids.

    Reads the module-level ``train_y`` and ``supervised``.

    Args:
        samples: text-CNN outputs for the training data,
            shape (data_num, output_dim).
        centroids: current centroid of each cluster,
            shape (cluster_num, output_dim).
    Returns:
        nearest: id of the closest centroid for every sample, shape (data_num,).
        label2id: dict mapping each matched supervised label to a cluster id.
            When there are more labels than centroids, only the matched labels
            appear. Empty when there are no supervised samples.
    """
    # Kept function-local, as in the original cell.
    from scipy.optimize import linear_sum_assignment

    samples = np.asarray(samples)
    centroids = np.asarray(centroids)

    # 1-1. Attach every sample to its closest centroid (1-nearest-neighbour
    #      over the centroids, each labelled with its own row index).
    neigh = KNeighborsClassifier(n_neighbors=1)
    neigh.fit(centroids, np.arange(len(centroids)))
    nearest = neigh.predict(samples)

    # 1-2. Match labelled data to centroids with the Hungarian algorithm.
    sup_samples = samples[supervised]
    sup_labels = train_y[supervised]
    hglabel = np.unique(sup_labels)
    if len(hglabel) == 0:
        return nearest, {}

    # Mean point of the supervised samples of every label.
    hgx = np.array([sup_samples[sup_labels == label].mean(axis=0) for label in hglabel])

    # Distance between every label mean (rows) and every centroid (columns).
    DistanceMatrix = np.linalg.norm(hgx[:,np.newaxis,:]-centroids[np.newaxis,:,:],axis=2)

    # One-to-one matching with the smallest total distance.
    row_ind, col_ind = linear_sum_assignment(DistanceMatrix)

    # Pair label and cluster id through row_ind: for a rectangular matrix the
    # matched rows are not simply 0..n-1.
    label2id = {hglabel[row]: col for row, col in zip(row_ind, col_ind)}

    return nearest ,label2id

In [246]:
def estimate_centroids(samples, nearest_indices, sup_cent,centroids,label2id, use_supervised=True, alpha=0.01):
    """Re-estimate the cluster centroids.

    Simplified update rule: each centroid becomes the weighted mean of
      * the samples currently assigned to it (weight ``alpha`` each), and
      * the supervised samples whose label maps to it (weight ``1 - alpha``
        each, only when ``use_supervised`` is true).
    A cluster that receives no weight keeps its previous centroid.

    Reads the module-level ``train_y`` and ``supervised``.

    Args:
        samples: text-CNN outputs for the training data,
            shape (data_num, output_dim).
        nearest_indices: id of the closest cluster for every sample,
            shape (data_num,).
        sup_cent: unused; kept so existing call sites stay valid.
        centroids: current centroids, shape (cluster_num, output_dim).
            Not modified; an updated copy is returned.
        label2id: dict mapping supervised labels to cluster ids.
        use_supervised: include the supervised samples in the update.
        alpha: weight of the unsupervised part; should match the ``alpha``
            passed to ``_cost``.
    Returns:
        centroids: updated centroids, shape (cluster_num, output_dim).
        sup_cent: for every supervised sample, the updated centroid of its
            label's cluster, shape (sup_num, output_dim).
    Raises:
        KeyError: if a supervised sample's label is missing from ``label2id``.
    """
    samples = np.asarray(samples)
    nearest_indices = np.asarray(nearest_indices)
    new_centroids = np.array(centroids, dtype=float)  # work on a copy

    sup_pred = samples[supervised]
    # Cluster id of every supervised sample according to the label matching.
    sup_cluster = np.array([label2id[t] for t in train_y[supervised]], dtype=int)

    for i in range(len(new_centroids)):
        assigned = samples[nearest_indices == i]
        weight = alpha * len(assigned)
        weighted_sum = alpha * np.sum(assigned, axis=0)

        if use_supervised:
            labelled = sup_pred[sup_cluster == i]
            weight += (1 - alpha) * len(labelled)
            weighted_sum = weighted_sum + (1 - alpha) * np.sum(labelled, axis=0)

        # An empty cluster has zero weight; leave its centroid where it was
        # instead of dividing by zero.
        if weight > 0:
            new_centroids[i] = weighted_sum / weight

    # 3. Target centroid of every supervised sample, used when updating the
    #    network parameters.
    sup_cent = new_centroids[sup_cluster]

    return new_centroids , sup_cent



array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5])