In [2]:
import json
import numpy as np
import os
from pprint import pprint
from Queue import Queue

def _read_json(path):
    """Parse the JSON document stored at ``path`` and return the result."""
    with open(path, "r") as handle:
        return json.load(handle)


# Network data: the paper records, addressed by integer position further down.
p_data = _read_json("data/Paper_2014_clean_2.json")

# Label dictionary: stringified label index -> label name.
idx_label = _read_json("data/idx_label.json")

# Source labels: stringified paper index -> list of [label index, weight] entries.
source = _read_json("data/source.json")

# Phrase dictionary: stringified phrase index -> phrase text.
index_phrase = _read_json("data/index_phrase.json")

# One row per paper, one column per label; filled in by the cells below.
topics = np.zeros((len(p_data), len(idx_label)), dtype=float)

# Size of the label vector.
print(len(idx_label))

IOError: [Errno 2] No such file or directory: 'data/idx_label.json'

In [64]:
# Indices of all source papers.
source_idx = []
# Every node index seen so far; prevents a node from being added twice.
check = set()

# Citation mode is either 'all_cite' or the regular 'cited_by'.
# 'all_cite' treats every citation as an undirected connection, whereas
# 'cited_by' is directed. This cell walks the 'all_cite' edges.

# Register the sources and copy their known label weights into `topics`.
for key in source:
    src = int(key)
    source_idx.append(src)
    check.add(src)
    for entry in source[key]:
        topics[src, entry[0]] = entry[1]


def _next_layer(frontier):
    """Return the unvisited neighbours of ``frontier`` and mark them as visited."""
    layer = []
    for node in frontier:
        for neighbour in p_data[node]['all_cite']:
            neighbour = int(neighbour)
            if neighbour not in check:
                check.add(neighbour)
                layer.append(neighbour)
    return layer


# Breadth-first search for a valid update order. ordering[d] holds the nodes
# that are d + 1 hops away from the nearest source; sources themselves are
# not part of `ordering`.
depth_1 = _next_layer(source_idx)
ordering = [depth_1]

# Keep expanding the last layer until it reaches no new node.
depth = 0
while True:
    new_layer = _next_layer(ordering[depth])
    if not new_layer:
        break
    ordering.append(new_layer)
    depth += 1

# Number of nodes in the network (sources included).
print(len(check))
# Depth of the tree.
print(len(ordering))

387
9


In [65]:
# Build normalised edge weights. Each paper's similarity scores are divided by
# their sum plus a fixed `originality` term, so the weights of a paper add up
# to `prop_ratio` = 1 - originality / z rather than to 1.
originality = 0.05
weights = []

for paper in p_data:
    sims = paper['all_cite_sim']
    norm = float(sum(sims)) + originality
    paper['prop_ratio'] = 1 - (originality / norm)
    weights.append(np.array(sims) / norm)

In [66]:
from random import shuffle

# Convert each paper's citation ids (strings) into a list of integer indices.
for paper in p_data:
    paper['edge_set'] = [int(cid) for cid in paper['all_cite']]

# Build and initialise the topics matrix: all zeros except for the source rows,
# which receive their known label weights.
topics = np.zeros((len(p_data), len(idx_label)), dtype=float)
for key in source:
    row = int(key)
    for entry in source[key]:
        topics[row, entry[0]] = entry[1]

# Propagation and mixing: sweep the BFS layers repeatedly; each node becomes
# the weighted mix of its neighbours' current topic rows. Updates are applied
# in place, so later nodes in a sweep already see the refreshed rows.
iter_num = 300
for sweep in range(iter_num):
    for layer in ordering:
        for node in layer:
            neighbours = p_data[node]['edge_set']
            topics[node, :] = np.dot(weights[node], topics[neighbours, :])

# Total topic mass that actually reached each paper.
for paper, topic_row in zip(p_data, topics):
    paper['actual_ratio'] = np.sum(topic_row)

# Keep only the non-zero ratios for the summary statistics below.
actual_ratios = [p['actual_ratio'] for p in p_data if p['actual_ratio'] != 0.0]
prop_ratios = [p['prop_ratio'] for p in p_data if p['prop_ratio'] != 0.0]

# prop_ratios covers the papers with a non-zero prop_ratio (per the original
# note: nodes that have at least one undirected connection).
print(len(prop_ratios))
print(sum(prop_ratios) / len(prop_ratios))

# actual_ratios contains no zeros.
print(len(actual_ratios))
# Mean of the non-zero actual ratios.
print(sum(actual_ratios) / len(actual_ratios))

522
0.359067022867
386
0.105726548266


In [67]:
# Simple Naive Bayes classifier that deals with multiple labels.
# Counts start from additive smoothing: every (label, phrase) cell starts at
# alpha and every label total starts at alpha * (d + 1), where d is the
# vocabulary size.
# NOTE(review): the extra "+ 1" in the total presumably reserves probability
# mass for an unseen phrase -- confirm.
alpha = 1.0
n_labels = len(idx_label)
n_phrases = len(index_phrase)

# Explicit float arrays: with plain Python ints the later division could end
# up as integer division (or a casting error) when no phrase is ever counted.
word_count = alpha * np.ones((n_labels, n_phrases), dtype=float)
topic_count = alpha * (n_phrases + 1) * np.ones(n_labels, dtype=float)

for i in check:
    vec = p_data[i]['phrases']
    vec_sum = p_data[i]['phrases_size']
    # Soft counts: each occurrence is split across labels in proportion to
    # the paper's propagated topic row (actual count * topic proportion).
    topic_count += vec_sum * topics[i]
    for k in vec:
        word_count[:, int(k)] += vec[k] * topics[i]

# Learning stage: p(word | topic), one row per label.
word_proba = word_count / topic_count[:, np.newaxis]

In [68]:
# predicting stage (this stage could be rewritten as fast numpy code)
# log-likelihood of a word appearing 3.5 times = log(p^3.5) = 3.5 * log p
from math import log

def _log_likelihoods(phrase_counts):
    """Return, for every label, the sum of count * log p(phrase | label)."""
    scores = []
    for label in range(len(idx_label)):
        total = 0
        for phrase_id in phrase_counts:
            total += log(word_proba[label][int(phrase_id)]) * phrase_counts[phrase_id]
        scores.append(total)
    return scores


# Prediction stage 1: unnormalised log-likelihood of each paper under each label.
prediction = [_log_likelihoods(p['phrases']) for p in p_data]

# Prediction stage 2 (numpy): renormalise each row into probabilities.
# Subtracting the row maximum before exponentiating keeps the largest term at
# exp(0) = 1. Very small values (< 1e-12) could be filtered out to avoid
# underflow, but leaving them in does no harm.
pred_num = np.array(prediction)

for row in pred_num:
    row -= np.amax(row)
    row[:] = np.exp(row)
    row /= np.sum(row)

In [69]:
# Combine the propagation result with the classification result.
final_precision = 1e-3
combine_result = []

for idx, paper in enumerate(p_data):
    ratio = paper['actual_ratio']
    # NOTE(review): topics[idx] already sums to `ratio` (actual_ratio is
    # np.sum(topics[i])), so scaling it by `ratio` again weights the
    # propagation part by ratio**2 before renormalisation -- confirm intended.
    mixed = topics[idx] * ratio + pred_num[idx] * (1 - ratio)
    mixed = mixed / np.sum(mixed)
    # Drop negligible entries, then renormalise what is left.
    mixed[mixed < final_precision] = 0.0
    mixed = mixed / np.sum(mixed)
    combine_result.append(mixed)

In [70]:
# Attach to every paper the names of the labels whose combined score exceeds
# the presentation threshold, and count how often each label is assigned.
present_threshold = 0.1
output_count = [0] * len(idx_label)

for paper, scores in zip(p_data, combine_result):
    picked = [j for j in range(len(idx_label)) if scores[j] > present_threshold]
    for j in picked:
        output_count[j] += 1
    paper['topic_set'] = [idx_label[str(j)] for j in picked]

# Sanity check: per the original note, paper 790 is a source node ...
print(combine_result[790])
# ... and paper 522 is a non-source node.
print(combine_result[522])
print(' ')

# Per label: name, smoothed training mass, number of papers it was assigned to.
for i in range(len(idx_label)):
    print(idx_label[str(i)])
    print(topic_count[i])
    print(output_count[i])
    print(' ')

[ 0.  1.  0.]
[ 0.99683123  0.          0.00316877]
 
preference_analysis
4280.47326785
210
 
algorithm_probabilistic_approach
6951.97470051
949
 
product_system_design
5259.0992124
727
 


In [71]:
# Dump a human-readable report: one entry per paper with its index, title,
# ratio, assigned topic labels and abstract.
path="data/topics_output.txt"
# Opening with 'w' already truncates; the explicit removal is kept so the
# report is always written to a freshly created file.
if os.path.isfile(path):
    os.remove(path)
with open(path, 'w') as f:
    for p in p_data:
        head = p['index'] + ' ' + p['title'] + '\n'
        f.write(head.encode('utf-8'))
        # NOTE(review): the line is labelled "prop_ratio" but the value
        # written is p['actual_ratio'] -- confirm which one is intended.
        ratio = 'prop_ratio: ' + str(p['actual_ratio']) + '\n'
        f.write(ratio.encode('utf-8'))
        f.write('[')
        for label in p['topic_set']:
            # Encode explicitly like every other write here; writing the
            # unicode label directly relies on implicit ASCII encoding and
            # fails on any non-ASCII label name.
            f.write(label.encode('utf-8'))
            f.write(',  ')
        f.write(']\n\n')
        f.write(p['abstract'].encode('utf-8'))
        f.write('\n\n\n')

In [72]:
# For every label, collect the phrases with the highest p(word | topic).
top_k = 50
top_phrases_col = []
for i in range(len(idx_label)):
    # argsort is ascending, so reverse it; slicing (instead of indexing
    # 0..top_k-1) stays safe when the vocabulary has fewer than top_k phrases.
    hot_index = np.argsort(word_proba[i], 0)[::-1][:top_k]
    top_phrases_col.append([index_phrase[str(idx)] for idx in hot_index])

# Show each label name followed by its top phrases.
for i in range(len(idx_label)):
    print(idx_label[str(i)])
    print(top_phrases_col[i])
    print(' ')

preference_analysis
[u'consumer', u'preferences', u'choice', u'sparse', u'predict', u'learning', u'car', u'purchase', u'data', u'consumer preferences', u'market', u'set', u'passenger', u'high dimensional', u'conjoint', u'network', u'accuracy', u'choice model', u'logit', u'association', u'original', u'representation', u'interaction', u'transformation', u'high', u'model', u'complement', u'negligible', u'ann', u'machine', u'survey', u'product', u'composite', u'dimensional', u'features', u'binary', u'existing', u'questions', u'behavior', u'improve', u'profile', u'restricted', u'neural', u'neural network', u'misleading', u'code', u'goal', u'community', u'actual', u'generation']
 
algorithm_probabilistic_approach
[u'uncertainty', u'reliability', u'probabilistic', u'reliability assessment', u'optimization', u'experimental', u'loop', u'algorithm', u'sora', u'efficient', u'robust', u'sequential', u'assessment', u'ERROR', u'epistemic', u'random', u'methodology', u'parameters', u'interval', u'mod