In [9]:
# coding: utf-8
from label_propagation import LGC, HMN, PARW, OMNI, CAMLP
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from scipy.sparse import dok_matrix
import numpy as np
import networkx as nx
from collections import Counter
import pickle
import re
import gensim
from itertools import product
from scipy.spatial.distance import euclidean
import pulp

# 다른 비교 모델
# https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.community.asyn_lpa.asyn_lpa_communities.html#networkx.algorithms.community.asyn_lpa.asyn_lpa_communities

with open("data/tweets.txt", 'rb') as f:
    data = pickle.load(f)
    
word2vec = gensim.models.KeyedVectors.load_word2vec_format('word2vec.txt')

In [2]:
data_ = []
for d in data:
    tmp = {}
    tmp['label'] = d['label']
    text = d['body']
    text = re.sub(r'http\S+', '', text)
    text = [w for w in text.split() if w.isalpha()]
    if len(text) < 2:
        # 단어 한개 이하는 무시
        continue
        
    text = " ".join(text).strip()
    text = text.lower()

    if text == '':
        # 빈 문자열 무시
        continue

    tmp['body'] = text
    tmp['sentence_embedding'] = d['sentence_embedding']
    data_.append(tmp)

data = data_
del data_

data = [(i, d['body'], d['label']) for i, d in enumerate(data)]

In [3]:
def tokens_to_fracdict(tokens):
    cntdict = Counter(tokens)
    totalcnt = sum(cntdict.values())
    return {token: float(cnt)/totalcnt for token, cnt in cntdict.items()}

# use PuLP
def word_mover_distance_probspec(first_sent_tokens, second_sent_tokens, wvmodel):
    all_tokens = list(set(first_sent_tokens+second_sent_tokens))
    wordvecs = {token: wvmodel[token] for token in all_tokens}

    first_sent_buckets = tokens_to_fracdict(first_sent_tokens)
    second_sent_buckets = tokens_to_fracdict(second_sent_tokens)

    T = pulp.LpVariable.dicts('T_matrix', list(product(all_tokens, all_tokens)), lowBound=0)

    prob = pulp.LpProblem('WMD', sense=pulp.LpMinimize)
    prob += pulp.lpSum([T[token1, token2]*euclidean(wordvecs[token1], wordvecs[token2])
                        for token1, token2 in product(all_tokens, all_tokens)])
    for token2 in second_sent_buckets:
        prob += pulp.lpSum([T[token1, token2] for token1 in first_sent_buckets])==second_sent_buckets[token2]
    for token1 in first_sent_buckets:
        prob += pulp.lpSum([T[token1, token2] for token2 in second_sent_buckets])==first_sent_buckets[token1]

    prob.solve()
    return prob

def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel):
    prob = word_mover_distance_probspec(first_sent_tokens, second_sent_tokens, wvmodel)
    return pulp.value(prob.objective)

In [4]:
G = nx.Graph()
for i, body, label in data:
    words_i = set(body.split())
    for j, body2, label2 in data[i+1:]:
        words_j = set(body2.split())
        
        if len(words_i.intersection(words_j)) > 4:
            G.add_edge(i, j)

In [5]:
num_data = len(data)
adjacency = dok_matrix((num_data, num_data), dtype=np.float32)

print(num_data)
print(len(G.adj))
for idx, i in enumerate(G.adj):
    if idx % 100 == 0:
        print(idx)
    token_i = [w for w in data[i][1].split() if w in word2vec]
    for j in G.adj[i]:
        token_j = [w for w in data[j][1].split() if w in word2vec]
        distance = word_mover_distance(token_i, token_j, word2vec)
        adjacency[i, j] = distance
        adjacency[j, i] = distance

adjacency = adjacency.tocsr()

5600
1747
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700


In [6]:
np.savez("adjacency.npz", adjacency)

In [7]:
from random import shuffle
labels = {'related': 1, 'not related': 0}
related_data_indexes = [d[0] for d in data if d[2] == 'related']
nonrelated_data_indexes = [d[0] for d in data if d[2] == 'not related']

shuffle(related_data_indexes)
shuffle(nonrelated_data_indexes)

train_related_indexes = related_data_indexes[:int(len(related_data_indexes)*0.1)]
train_related_indexes = sorted(train_related_indexes)
test_related_indexes = related_data_indexes[int(len(related_data_indexes)*0.1):]
test_related_indexes = sorted(test_related_indexes)
train_nonrelated_indexes = nonrelated_data_indexes[:int(len(related_data_indexes)*0.1)]
train_nonrelated_indexes = sorted(train_nonrelated_indexes)
test_nonrelated_indexes = nonrelated_data_indexes[int(len(related_data_indexes)*0.1):]
test_nonrelated_indexes = sorted(test_nonrelated_indexes)

# related
X_train = [i for i in train_related_indexes]
# not related
X_train.extend([i for i in train_nonrelated_indexes])
X_train = np.array(X_train)

# related
X_test = [i for i in test_related_indexes]
# not related
X_test.extend([i for i in test_nonrelated_indexes])
X_test = np.array(X_test)

# related
y_train = [labels[data[i][2]] for i in train_related_indexes]
# not related
y_train.extend([labels[data[i][2]] for i in train_nonrelated_indexes])
y_train = np.array(y_train)

# related
y_test = [labels[data[i][2]] for i in test_related_indexes]
# not related
y_test.extend([labels[data[i][2]] for i in test_nonrelated_indexes])
y_test = np.array(y_test)

In [10]:
methods = [('HMN', HMN(), {'graph': [adjacency]}),
           ('LGC', LGC(), {'graph': [adjacency], 'alpha':[
            0.001, 0.1, 0.5, 0.75]}),
           # ('PARW', PARW(), {'graph':[adjacency], 'lamb':[0.01, 0.05, 0.01, 0.5, 0.99]}),
           ('OMNI', OMNI(), {'graph': [adjacency], 'lamb':[
            0.001, 0.025, 0.005, 0.01]}),
           ('CAMLP', CAMLP(), {'graph': [adjacency], 'beta':[
            0.01, 0.025, 0.05, 0.1], 'H':[np.array([[1, 0], [0, 1]]), np.array([[0, 1], [1, 0]])]})]

models = []
for name, clf, params in methods:
    print("=========================")
    print(name)
    gs = GridSearchCV(clf, params, cv=10, n_jobs=10)
    gs.fit(X_train, y_train)

    model = gs.best_estimator_
    print(gs.best_params_)
    model.fit(X_train, y_train)
    models.append(model)

    predicted = model.predict(X_test)
    print("Accuracy: %f" % accuracy_score(y_test, predicted))
    print("Recall: %f" % recall_score(y_test, predicted))
    print("Precision: %f" % precision_score(y_test, predicted))
    print("f1 score: %f" % f1_score(y_test, predicted))

HMN
{'graph': <5600x5600 sparse matrix of type '<class 'numpy.float32'>'
	with 22644 stored elements in Compressed Sparse Row format>}
Accuracy: 0.853468
Recall: 0.343604
Precision: 0.815965
f1 score: 0.483574
LGC
{'alpha': 0.75, 'graph': <5600x5600 sparse matrix of type '<class 'numpy.float32'>'
	with 22644 stored elements in Compressed Sparse Row format>}
Accuracy: 0.848434
Recall: 0.355742
Precision: 0.755952
f1 score: 0.483810
OMNI
{'graph': <5600x5600 sparse matrix of type '<class 'numpy.float32'>'
	with 22644 stored elements in Compressed Sparse Row format>, 'lamb': 0.025}
Accuracy: 0.849925
Recall: 0.319328
Precision: 0.818182
f1 score: 0.459369
CAMLP
{'H': array([[1, 0],
       [0, 1]]), 'beta': 0.05, 'graph': <5600x5600 sparse matrix of type '<class 'numpy.float32'>'
	with 22644 stored elements in Compressed Sparse Row format>}
Accuracy: 0.824385
Recall: 0.362278
Precision: 0.599691
f1 score: 0.451688
