In [76]:
# coding: utf-8
from label_propagation import LGC, HMN, PARW, OMNI, CAMLP
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from scipy.sparse import dok_matrix
import numpy as np
import networkx as nx
import pickle
import re

# 다른 비교 모델
# https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.community.asyn_lpa.asyn_lpa_communities.html#networkx.algorithms.community.asyn_lpa.asyn_lpa_communities

with open("data/tweets.txt", 'rb') as f:
    data = pickle.load(f)

In [84]:
data_ = []
for d in data:
    tmp = {}
    tmp['label'] = d['label']
    text = d['body']
    text = re.sub(r'http\S+', '', text)
    text = [w for w in text.split() if w.isalpha()]
    if len(text) < 2:
        # 단어 한개 이하는 무시
        continue
        
    text = " ".join(text).strip()
    text = text.lower()

    if text == '':
        # 빈 문자열 무시
        continue

    tmp['body'] = text
    tmp['sentence_embedding'] = d['sentence_embedding']
    data_.append(tmp)

data = data_
del data_

data = [(i, d['body'], d['label'], d['sentence_embedding']) for i, d in enumerate(data)]

In [85]:
G = nx.Graph()
for i, body, label, embedding in data:
    words_i = set(body.split())
    for j, body2, label2, embedding in data[i+1:]:
        words_j = set(body2.split())
        
        if len(words_i.intersection(words_j)) > 4:
            G.add_edge(i, j)

In [86]:
num_data = len(data)
adjacency = dok_matrix((num_data, num_data), dtype=np.float32)

print(num_data)
print(len(G.adj))
for idx, i in enumerate(G.adj):
    sentence_emb_i = data[i][3]
    sentence_emb_i /= np.linalg.norm(sentence_emb_i)
    for j in G.adj[i]:
        sentence_emb_j = data[j][3]
        sentence_emb_j /= np.linalg.norm(sentence_emb_j)
        distance = 1 - sentence_emb_i.dot(sentence_emb_j)
        adjacency[i, j] = distance
        adjacency[j, i] = distance

adjacency = adjacency.tocsr()

5600
1747


In [87]:
from random import shuffle
labels = {'related': 1, 'not related': 0}
related_data_indexes = [d[0] for d in data if d[2] == 'related']
nonrelated_data_indexes = [d[0] for d in data if d[2] == 'not related']

shuffle(related_data_indexes)
shuffle(nonrelated_data_indexes)

train_related_indexes = related_data_indexes[:int(len(related_data_indexes)*0.1)]
train_related_indexes = sorted(train_related_indexes)
test_related_indexes = related_data_indexes[int(len(related_data_indexes)*0.1):]
test_related_indexes = sorted(test_related_indexes)
train_nonrelated_indexes = nonrelated_data_indexes[:int(len(related_data_indexes)*0.1)]
train_nonrelated_indexes = sorted(train_nonrelated_indexes)
test_nonrelated_indexes = nonrelated_data_indexes[int(len(related_data_indexes)*0.1):]
test_nonrelated_indexes = sorted(test_nonrelated_indexes)

# related
X_train = [i for i in train_related_indexes]
# not related
X_train.extend([i for i in train_nonrelated_indexes])
X_train = np.array(X_train)

# related
X_test = [i for i in test_related_indexes]
# not related
X_test.extend([i for i in test_nonrelated_indexes])
X_test = np.array(X_test)

# related
y_train = [labels[data[i][2]] for i in train_related_indexes]
# not related
y_train.extend([labels[data[i][2]] for i in train_nonrelated_indexes])
y_train = np.array(y_train)

# related
y_test = [labels[data[i][2]] for i in test_related_indexes]
# not related
y_test.extend([labels[data[i][2]] for i in test_nonrelated_indexes])
y_test = np.array(y_test)

In [88]:
methods = [('HMN', HMN(), {'graph': [adjacency]}),
           ('LGC', LGC(), {'graph': [adjacency], 'alpha':[
            0.001, 0.1, 0.5, 0.75]}),
           # ('PARW', PARW(), {'graph':[adjacency], 'lamb':[0.01, 0.05, 0.01, 0.5, 0.99]}),
           ('OMNI', OMNI(), {'graph': [adjacency], 'lamb':[
            0.001, 0.025, 0.005, 0.01]}),
           ('CAMLP', CAMLP(), {'graph': [adjacency], 'beta':[
            0.01, 0.025, 0.05, 0.1], 'H':[np.array([[1, 0], [0, 1]]), np.array([[0, 1], [1, 0]])]})]

models = []
for name, clf, params in methods:
    print("=========================")
    print(name)
    gs = GridSearchCV(clf, params, cv=10, n_jobs=10)
    gs.fit(X_train, y_train)

    model = gs.best_estimator_
    print(gs.best_params_)
    model.fit(X_train, y_train)
    models.append(model)

    predicted = model.predict(X_test)
    print("Accuracy: %f" % accuracy_score(y_test, predicted))
    print("Recall: %f" % recall_score(y_test, predicted))
    print("Precision: %f" % precision_score(y_test, predicted))
    print("f1 score: %f" % f1_score(y_test, predicted))

HMN
{'graph': <5600x5600 sparse matrix of type '<class 'numpy.float32'>'
	with 30828 stored elements in Compressed Sparse Row format>}
Accuracy: 0.867077
Recall: 0.342670
Precision: 0.976064
f1 score: 0.507256
LGC


  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)
  result = op(self.data)


{'alpha': 0.75, 'graph': <5600x5600 sparse matrix of type '<class 'numpy.float32'>'
	with 30828 stored elements in Compressed Sparse Row format>}
Accuracy: 0.859433
Recall: 0.392157
Precision: 0.803059
f1 score: 0.526976
OMNI


  result = op(self.data)


{'graph': <5600x5600 sparse matrix of type '<class 'numpy.float32'>'
	with 30828 stored elements in Compressed Sparse Row format>, 'lamb': 0.001}
Accuracy: 0.860738
Recall: 0.306256
Precision: 0.987952
f1 score: 0.467569
CAMLP
{'H': array([[1, 0],
       [0, 1]]), 'beta': 0.1, 'graph': <5600x5600 sparse matrix of type '<class 'numpy.float32'>'
	with 30828 stored elements in Compressed Sparse Row format>}
Accuracy: 0.858128
Recall: 0.381886
Precision: 0.805118
f1 score: 0.518049


In [73]:
print(len(predicted[predicted==0]))
print(len(predicted))

4962
5364


In [80]:
predicted = models[1].predict(X_test)