In [1]:
# coding: utf-8
import pickle
import re
from collections import Counter
from itertools import product

import gensim
import networkx as nx
import numpy as np
import pulp
from scipy.sparse import dok_matrix, save_npz
from scipy.spatial.distance import euclidean
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from label_propagation import LGC, HMN, PARW, OMNI, CAMLP

# Other models to compare against:
# https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.community.asyn_lpa.asyn_lpa_communities.html#networkx.algorithms.community.asyn_lpa.asyn_lpa_communities

# Load the pickled tweet records.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at a trusted file.
with open("data/tweets.txt", 'rb') as tweets_file:
    data = pickle.load(tweets_file)

# Load the word vectors used later for the word mover's distance.
word2vec = gensim.models.KeyedVectors.load_word2vec_format('word2vec.txt')

Using TensorFlow backend.


In [12]:
def _clean_body(raw_text):
    """Return the cleaned tweet text, or None when it should be dropped.

    Cleaning removes every ``http...`` run up to the next whitespace, keeps
    only whitespace-separated tokens that are purely alphabetic, joins them
    with single spaces and lower-cases the result.

    Returns None when fewer than two tokens survive. A text with two or more
    alphabetic tokens can never be empty after joining, so no separate
    empty-string check is needed.
    """
    without_urls = re.sub(r'http\S+', '', raw_text)
    words = [w for w in without_urls.split() if w.isalpha()]
    if len(words) < 2:
        # Ignore texts with one word or fewer.
        return None
    return " ".join(words).lower()


# NOTE: this cell rebinds `data` from a list of dict records to a list of
# (index, body, label) tuples, so it cannot be re-run without first
# re-running the cell that loads `data`.
# The records' 'sentence_embedding' field is not carried over: nothing
# downstream of this cell reads it.
kept = []
for d in data:
    cleaned = _clean_body(d['body'])
    if cleaned is not None:
        kept.append((cleaned, d['label']))

# The index is assigned after filtering, so data[k][0] == k for every k.
data = [(i, body, label) for i, (body, label) in enumerate(kept)]
del kept

In [2]:
def tokens_to_fracdict(tokens):
    """Map each distinct token to its relative frequency within ``tokens``.

    Args:
        tokens: iterable of hashable tokens.

    Returns:
        dict of token -> count / total count. An empty input gives an
        empty dict.
    """
    counts = Counter(tokens)
    total = sum(counts.values())
    fractions = {}
    for token, count in counts.items():
        fractions[token] = float(count) / total
    return fractions

# use PuLP
def word_mover_distance_probspec(first_sent_tokens, second_sent_tokens, wvmodel):
    """Build and solve the word mover's distance linear program with PuLP.

    Each sentence is turned into a normalised bag of words. The LP finds a
    non-negative flow ``T[token1, token2]`` from first-sentence tokens to
    second-sentence tokens that moves exactly each token's mass while
    minimising the flow-weighted Euclidean distance between word vectors.

    Args:
        first_sent_tokens: list of tokens of the first sentence.
        second_sent_tokens: list of tokens of the second sentence.
        wvmodel: mapping-like object; ``wvmodel[token]`` must return the
            vector of every token in both lists.

    Returns:
        The solved ``pulp.LpProblem``; its objective holds the distance.

    Note:
        If either token list is empty, an empty flow sum is required to equal
        a positive mass, so the problem has no feasible solution. Callers
        should avoid passing empty lists.
    """
    first_sent_buckets = tokens_to_fracdict(first_sent_tokens)
    second_sent_buckets = tokens_to_fracdict(second_sent_tokens)

    wordvecs = {token: wvmodel[token]
                for token in set(first_sent_buckets) | set(second_sent_buckets)}

    # Only (first-sentence, second-sentence) pairs appear in the constraints.
    # Any other pair would be an unconstrained, non-negative-cost variable that
    # is zero at the optimum, so leaving those out keeps the same optimal
    # objective with far fewer variables and distance evaluations.
    flow_pairs = list(product(first_sent_buckets, second_sent_buckets))
    T = pulp.LpVariable.dicts('T_matrix', flow_pairs, lowBound=0)

    prob = pulp.LpProblem('WMD', sense=pulp.LpMinimize)
    # Objective: total transport cost.
    prob += pulp.lpSum([T[token1, token2]*euclidean(wordvecs[token1], wordvecs[token2])
                        for token1, token2 in flow_pairs])
    # Mass arriving at each second-sentence token equals that token's share.
    for token2 in second_sent_buckets:
        prob += pulp.lpSum([T[token1, token2] for token1 in first_sent_buckets])==second_sent_buckets[token2]
    # Mass leaving each first-sentence token equals that token's share.
    for token1 in first_sent_buckets:
        prob += pulp.lpSum([T[token1, token2] for token2 in second_sent_buckets])==first_sent_buckets[token1]

    prob.solve()
    return prob

def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel):
    """Return the objective value of the solved WMD problem for two token lists.

    Delegates to ``word_mover_distance_probspec`` and evaluates the objective
    of the problem it returns with ``pulp.value``.
    """
    solved_problem = word_mover_distance_probspec(
        first_sent_tokens, second_sent_tokens, wvmodel)
    objective_value = pulp.value(solved_problem.objective)
    return objective_value

In [21]:
# Two tweets are linked when they share MORE than this many distinct words.
SHARED_WORD_THRESHOLD = 4

# Split every body into its word set once, instead of re-splitting the second
# body for every pair inside the nested loop.
word_sets = [set(body.split()) for _, body, _ in data]

# Node ids are positions in `data`; this relies on data[k][0] == k, which
# holds because the ids were assigned with enumerate().
# Tweets that end up with no edge are never added to G as nodes.
G = nx.Graph()
num_tweets = len(word_sets)
for i in range(num_tweets):
    words_i = word_sets[i]
    for j in range(i + 1, num_tweets):
        if len(words_i & word_sets[j]) > SHARED_WORD_THRESHOLD:
            G.add_edge(i, j)

In [22]:
num_data = len(data)
adjacency = dok_matrix((num_data, num_data), dtype=np.float32)

print(num_data)
print(len(G.adj))

# In-vocabulary tokens of every node that has at least one edge, computed
# once per node instead of once per (node, neighbour) visit.
tokens_by_node = {
    node: [w for w in data[node][1].split() if w in word2vec]
    for node in G.adj
}

# NOTE(review): the stored weight is a *distance* (larger = less similar).
# Confirm the label-propagation models are meant to receive distances rather
# than similarities. Also, a distance of exactly 0 is presumably not stored
# by the sparse matrix, which would make identical texts look unconnected.
visited = set()
for idx, i in enumerate(G.adj):
    if idx % 100 == 0:
        print(idx)
    visited.add(i)
    token_i = tokens_by_node[i]
    for j in G.adj[i]:
        if j in visited:
            # Edge (j, i) was already solved when j was the outer node; the
            # matrix is filled symmetrically, so do not solve the LP twice.
            continue
        token_j = tokens_by_node[j]
        if not token_i or not token_j:
            # With no in-vocabulary tokens on one side the transport problem
            # has no feasible solution, so there is no distance to store.
            continue
        distance = word_mover_distance(token_i, token_j, word2vec)
        adjacency[i, j] = distance
        adjacency[j, i] = distance

adjacency = adjacency.tocsr()

5600
1747
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700


In [23]:
# `adjacency` is a SciPy CSR matrix. np.savez would wrap it in a pickled
# object array; save_npz writes the sparse components directly, and the file
# can be read back with scipy.sparse.load_npz.
save_npz("adjacency.npz", adjacency)

In [23]:
labels = {'related': 1, 'not related': 0}

# NOTE(review): the index ranges below assume `data` is ordered with the
# 'related' tweets in [0, NOT_RELATED_START) and the 'not related' tweets from
# NOT_RELATED_START onwards - confirm against the input file.
RELATED_TRAIN_END = 135      # first 135 'related' tweets are used for training
NOT_RELATED_START = 1345     # first index of the 'not related' tweets
train_num = 135              # number of 'not related' tweets used for training

# Training indices: related part followed by not-related part.
train_idx = list(range(RELATED_TRAIN_END))
train_idx.extend(range(NOT_RELATED_START, NOT_RELATED_START + train_num))

# Test indices: the remainder of each part.
test_idx = list(range(RELATED_TRAIN_END, NOT_RELATED_START))
test_idx.extend(range(NOT_RELATED_START + train_num, num_data))

X_train = np.array(train_idx)
X_test = np.array(test_idx)

# Labels are looked up through the same index lists, so the X and y arrays
# cannot drift apart when a boundary is changed.
y_train = np.array([labels[data[i][2]] for i in train_idx])
y_test = np.array([labels[data[i][2]] for i in test_idx])

In [27]:
# (name, estimator, parameter grid) for every label-propagation model to tune.
# The graph is passed through the grid because it is an estimator parameter.
methods = [('HMN', HMN(), {'graph': [adjacency]}),
           ('LGC', LGC(), {'graph': [adjacency], 'alpha':[
            0.1, 0.25, 0.5, 0.75]}),
           # ('PARW', PARW(), {'graph':[adjacency], 'lamb':[0.01, 0.05, 0.01, 0.5, 0.99]}),
           ('OMNI', OMNI(), {'graph': [adjacency], 'lamb':[
            0.001, 0.025, 0.005, 0.01]}),
           ('CAMLP', CAMLP(), {'graph': [adjacency], 'beta':[
            0.01, 0.025, 0.05, 0.1], 'H':[np.array([[1, 0], [0, 1]])]})]

models = []
for name, clf, params in methods:
    print("=========================")
    print(name)
    gs = GridSearchCV(clf, params, cv=10, n_jobs=10)
    gs.fit(X_train, y_train)

    # GridSearchCV refits the best configuration on the whole training set by
    # default (refit=True), so best_estimator_ is already fitted and needs no
    # second fit call.
    model = gs.best_estimator_
    print(gs.best_params_)
    models.append(model)

    # `predicted` is overwritten on every iteration; after the loop it holds
    # the predictions of the last model in `methods`.
    predicted = model.predict(X_test)
    print("Accuracy: %f" % accuracy_score(y_test, predicted))
    print("f1 score: %f" % f1_score(y_test, predicted))

HMN
{'graph': <6345x6345 sparse matrix of type '<class 'numpy.float32'>'
	with 52296 stored elements in Compressed Sparse Row format>}
Accuracy: 0.755556
f1 score: 0.249621
LGC
{'alpha': 0.5, 'graph': <6345x6345 sparse matrix of type '<class 'numpy.float32'>'
	with 52296 stored elements in Compressed Sparse Row format>}
Accuracy: 0.798848
f1 score: 0.247537
OMNI
{'graph': <6345x6345 sparse matrix of type '<class 'numpy.float32'>'
	with 52296 stored elements in Compressed Sparse Row format>, 'lamb': 0.001}
Accuracy: 0.741564
f1 score: 0.259434
CAMLP
{'H': array([[1, 0],
       [0, 1]]), 'beta': 0.025, 'graph': <6345x6345 sparse matrix of type '<class 'numpy.float32'>'
	with 52296 stored elements in Compressed Sparse Row format>}
Accuracy: 0.821070
f1 score: 0.243563


In [12]:
# How many predictions are class 0, followed by the total number of predictions.
num_predicted_zero = int(np.count_nonzero(predicted == 0))
print(num_predicted_zero)
print(len(predicted))

5635
6075
