# CSI: A hybrid deep model for fake news detection

## Temporal event modeling based on LSTM

    - Event(article) : a series of tweets(messages) which can be Fake/Real
    - Message : Temporal behavior on a certain event. (aka tweet)
        - Each message arrives at a certain timestamp.
        - Each message has auxiliary information such as text, user_info , etc.
    - Dataset
        - Weibo : 4,664 events, (2,313/2,351 Rumor/NonRumor), 3,805,656 posts, 2,746,818 users
                Avg. # of posts/event : 816
                Max # of posts/event : 59,318
                Min # of posts/event : 10

### Description
    - input: dictionary with structure below
        event_name: {
                timestamps : sorted array of timestamps,
                text: array of tweets' texts sorted by time,
                uid: user ids involved in event,
                label: 0(real)/1(fake)
                }
    - output: label
   
### Attention: This notebook is for weibo dataset. you can modify the dictionary part for other datasets. 


In [2]:
#!pip install --user jieba

In [3]:
#!pip install --user gensim

In [25]:
from __future__ import division
import re
import collections
import pickle
import numpy as np
import os
from tqdm import tqdm_notebook as tqdm

import jieba, re

from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

import json

## creating input dictionary
modify this part to test model on your own dataset

In [57]:
import json

#loading events labels
f = open('../Data/Weibo.txt', "r")
lines = f.readlines()
f.close()

events = {}
for line in lines:
    line = line.replace('\t',' ')
    line = line.split(' ')
    line.remove('\n')

    label = line[1][-1]
    eid = line[0][4:]
    events[eid] = label
    

path = '../Data/Weibo'
train_dict_ = {}
test_dict_ = {}


# defining train, test , validation datasets
splits = json.load(open('../Data/Weiboeids','r'))
eid_train = splits['train']
eid_test = splits['test']
eid_val = splits['validation']

# all eids should be string
#making the input dictionary based on what is defined above

for event in tqdm(eid_train+eid_test):
    event_str = str(event)
    tweets = json.load(open(os.path.join(path,event_str+'.json'),"r",encoding='utf-8'))
    timestamps = []
    uid = []
    text = []
    n = len(tweets)
    index = 100
    if event in eid_test:
        index = 10
    tweets = tweets[:index]
    for tweet in tweets:
        timestamps.append(tweet['t'])
        uid.append(tweet['uid'])
        text.append(tweet['text'])
    messages = {'timestamps':timestamps,'uid':uid,'text':text,'label':events[event_str]}
    if event in eid_train:
        train_dict_[event_str] = messages
    else:
        test_dict_[event_str] = messages
        
# saving the dictionary
# ft = open('dict_.json','w')
# json.dump(dict_,ft)
    

HBox(children=(IntProgress(value=0, max=4232), HTML(value='')))

## Load preprocessed data

In [58]:
from sklearn.preprocessing import MinMaxScaler

from utils import *


def get_stats(dict_):
    nb_messages = []
    user_set = set()
    list_lengths = []
    for eid, messages in dict_.items():
        nb_messages.append(len(messages['timestamps']))
        user_set.update(messages['uid'])
        ts = np.array(messages['timestamps'], dtype=np.float32)
        list_lengths.append(ts[-1]-ts[0])
        
    return nb_messages, user_set, list_lengths

    
# Get statistics
# twitter eid is 'E741' or 'TM1211'
# weibo eid is , 3906982031327232 (use as string)
nb_messages_train, user_set_train, list_lengths_train = get_stats(train_dict_) 
nb_messages_test, user_set_test, list_lengths_test = get_stats(test_dict_) 

print("#train events : {}".format(len(eid_train)))
print("#train users : {}".format(len(user_set_train)))
print("#train messages : {}".format(np.sum(nb_messages_train)))
print()
print("#test events : {}".format(len(eid_test)))
print("#test users : {}".format(len(user_set_test)))
print("#test messages : {}".format(np.sum(nb_messages_test)))

#train events : 3777
#train users : 306494
#train messages : 347529

#test events : 455
#test users : 4335
#test messages : 4550


## Get two sets of user features
- `u_sample` : list of tuples (user_ID, # appearances). Top-K users are sampled. 
- `u_pop` : list of tuples (user_ID, # appearances). All users are sampled. 
- `user_feature` ($U$$\Sigma$) : comes from user x event matrix, $M = U$ $\Sigma$$V^T$.
    - $M[i,j]$ : 1 if i-th user from `u_pop` interacts with j-th event. Otherwise, 0.
- `user_feature_sub` ($U'$$\Sigma'$) : comes from user x user matrix, $M' = U'$ $\Sigma'$$V'^T$.
    - $M' = PP^T$
    - $P[i,j]$ : 1 if i-th user from `u_sample` interacts with j-th event. Otherwise, 0.

In [59]:
from collections import Counter

def get_Usample(dict_, most_common=50):
    '''Get U_sample who are most_common.'''
    u_sample = []
    cnt = Counter()
    for ii, (eid, value) in enumerate(dict_.items()):
        users = value['uid']
        cnt.update(users)
    return cnt.most_common(most_common)    # [(user_id, #occur in all events), ...]

def get_user_in_event(dict_, eid, u_sample):
    '''Get users who acts on a given event, eid'''
    value = dict_[eid]
    cnt = Counter(value['uid'])
    users = set(value['uid'])
    user_in_event = []
    for uid, nb_occur in u_sample:
        if uid in users:
            user_in_event.append((uid, cnt[uid]))
    return user_in_event

threshold = 20000
u_sample_train = get_Usample(train_dict_, most_common=threshold)
u_pop_train = get_Usample(train_dict_, most_common=len(user_set_train))
print("train u_sample for most common {} users is obtained.".format(threshold))
print("train u_pop for all {} users is obtained.".format(len(user_set_train)))

u_sample_test = get_Usample(test_dict_, most_common=threshold)
u_pop_test = get_Usample(test_dict_, most_common=len(user_set_test))
print("test u_sample for most common {} users is obtained.".format(threshold))
print("test u_pop for all {} users is obtained.".format(len(user_set_test)))


'''
Here are Two user-event matrices.
    1) matrix_main : (all user - all event) relation
        It has #occurrences of a user(row) in an event(col)
        It is very sparse.
        It is decomposed with smaller K.
    2) matrix_sub : (u_sample - eid_sample) relation
        It has #occurrences of a user(row) in an event(col)
        It is denser.
        It is decomposed with larger K. (usually)
'''
#train
print('train:')
user_sample2ind_train = {}
for ii, (uid, nb_occur) in enumerate(u_sample_train):
    user_sample2ind_train[uid] = ii
print("# users in u_sample : {}".format(len(user_sample2ind_train)))
user2ind_train = {}
for ii, uid in enumerate(user_set_train):
    user2ind_train[uid] = ii
print("# users : {}".format(len(user2ind_train)))
eid2ind_train = {}
for ii, eid in enumerate(eid_train):
    eid2ind_train[eid] = ii
print("# events : {}".format(len(eid2ind_train)))

#test
print('\ntest:')
user_sample2ind_test = {}
for ii, (uid, nb_occur) in enumerate(u_sample_test):
    user_sample2ind_test[uid] = ii
print("# users in u_sample : {}".format(len(user_sample2ind_test)))
user2ind_test = {}
for ii, uid in enumerate(user_set_test):
    user2ind_test[uid] = ii
print("# users : {}".format(len(user2ind_test)))
eid2ind_test = {}
for ii, eid in enumerate(eid_test):
    eid2ind_test[eid] = ii
print("# events : {}".format(len(eid2ind_test)))


'''
User Features
    Generate userid-eid matrix and Decompose it
    Truncated SVD
'''
from scipy.sparse import csr_matrix

def get_user_event_matrix(dict_, u_sample, user2ind, binary=False):
    '''Get (user,event) matrix.
    This matrix will be decomposed by TruncatedSVD (or else?)
    Only users in u_sample are considered.
    '''
    row = []
    col = []
    data = []
    jj = 0
    eid2ind = {}
    for ii, (eid, value) in tqdm(enumerate(dict_.items())):
        user_in_event = get_user_in_event(dict_, eid, u_sample)
        if len(user_in_event)==0:
            # No user in u_sample appears in this eid event.
            continue
        else:
            eid2ind[eid] = jj
#             eind = eid2ind[eid]
        for uid, nb_occur in user_in_event:
            uind = user2ind[uid]
            col.append(jj)
            row.append(uind)
            if binary:
                data.append(1)    # Binary matrix
            else:
                data.append(nb_occur)
        jj+=1
    print("{} events have at least one user in u_sample".format(jj))
    print("{} events have no user in u_sample".format(len(dict_)-jj))
    return csr_matrix((data, (row, col)), shape=(len(user2ind), len(eid2ind))), eid2ind


#train
matrix_sub_train, eid_sample2ind_train = get_user_event_matrix(train_dict_, u_sample_train, user_sample2ind_train, binary=True)
matrix_main_train, eid_main2ind_train = get_user_event_matrix(train_dict_, u_pop_train, user2ind_train, binary=True)
matrix_main_cnt_train, eid_main_cnt2ind_train = get_user_event_matrix(train_dict_, u_pop_train, user2ind_train, binary=False)
print('train:')
print("matrix_sub shape : {}".format(matrix_sub_train.shape))
print("Sparsity : {}".format(matrix_sub_train.count_nonzero()/(matrix_sub_train.shape[0]*matrix_sub_train.shape[1])))
print("matrix_main shape : {}".format(matrix_main_train.shape))
print("Sparsity : {}".format(matrix_main_train.count_nonzero()/(matrix_main_train.shape[0]*matrix_main_train.shape[1])))


#test
matrix_sub_test, eid_sample2ind_test = get_user_event_matrix(test_dict_, u_sample_test, user_sample2ind_test, binary=True)
matrix_main_test, eid_main2ind_test = get_user_event_matrix(test_dict_, u_pop_test, user2ind_test, binary=True)
matrix_main_cnt_test, eid_main_cnt2ind_test = get_user_event_matrix(test_dict_, u_pop_test, user2ind_test, binary=False)
print('test:')
print("matrix_sub shape : {}".format(matrix_sub_test.shape))
print("Sparsity : {}".format(matrix_sub_test.count_nonzero()/(matrix_sub_test.shape[0]*matrix_sub_test.shape[1])))
print("matrix_main shape : {}".format(matrix_main_test.shape))
print("Sparsity : {}".format(matrix_main_test.count_nonzero()/(matrix_main_test.shape[0]*matrix_main_test.shape[1])))


test u_sample for most common 20000 users is obtained.
test u_pop for all 4335 users is obtained.

test:
# users in u_sample : 4335
# users : 4335
# events : 455


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

455 events have at least one user in u_sample
0 events have no user in u_sample


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

455 events have at least one user in u_sample
0 events have no user in u_sample


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

455 events have at least one user in u_sample
0 events have no user in u_sample
test:
matrix_sub shape : (4335, 455)
Sparsity : 0.0022743577068836582
matrix_main shape : (4335, 455)
Sparsity : 0.0022743577068836582


In [60]:
from sklearn.utils.extmath import randomized_svd

RELOAD = False

if RELOAD:
    ### Load matrix_main
    nb_feature_main = 10     # 10 for weibo, 20 for tweet
    
    #train
    u_main_train = np.load(open('matrix/weibo_u_main_train.npy','rb'))
    sigma_main_train = np.load(open('matrix/weibo_sigma_main_train.npy','rb'))
    vt_main_train = np.load(open('matrix/weibo_vt_main_train.npy','rb'))
    user_feature_train = u_main_train.dot(np.diag(sigma_main_train))
    print("train user_feature shape : {}".format(user_feature_train.shape))
    
    
    #test
    u_main_test = np.load(open('matrix/weibo_u_main_test.npy','rb'))
    sigma_main_test = np.load(open('matrix/weibo_sigma_main_test.npy','rb'))
    vt_main_test = np.load(open('matrix/weibo_vt_main_test.npy','rb'))
    
    user_feature_test = u_main_test.dot(np.diag(sigma_main_test))
    print("test user_feature shape : {}".format(user_feature_test.shape))

    ### Load matrix_sub
    
    #train
    u_sub_train = np.load(open('matrix/weibo_u_sub_train.npy','rb'))
    sigma_sub_train = np.load(open('matrix/weibo_sigma_sub_train.npy','rb'))
    vt_sib_train = np.load(open('matrix/weibo_vt_sub_train.npy','rb'))
    user_feature_sub_train = u_sub_train.dot(np.diag(sigma_sub_train))
    
    #test
    u_sub_test = np.load(open('matrix/weibo_u_sub_test.npy','rb'))
    sigma_sub_test = np.load(open('matrix/weibo_sigma_sub_test.npy','rb'))
    vt_sib_test = np.load(open('matrix/weibo_vt_sub_test.npy','rb'))
    user_feature_sub_test = u_sub_test.dot(np.diag(sigma_sub_test))
    
    nb_feature_sub = 50
    print("train user_feature_sub shape : {}".format(user_feature_sub_train.shape))
    print("test user_feature_sub shape : {}".format(user_feature_sub_test.shape))
    print("Loading is Done.")
else:
    nb_feature_main = 10     # 10 for weibo, 20 for tweet
    n_iter = 15    # 15 for weibo, 7 for tweet
    
    #train
    u_main_train, sigma_main_train, vt_main_train = randomized_svd(matrix_main_train, n_components=100,
                                                 n_iter=n_iter, random_state=42)
    user_feature_train = u_main_train.dot(np.diag(sigma_main_train))
    print("train user_feature shape : {}".format(user_feature_train.shape))
    
    
    #test
    u_main_test, sigma_main_test, vt_main_test = randomized_svd(matrix_main_test, n_components=100,
                                                 n_iter=n_iter, random_state=42)
    user_feature_test = u_main_test.dot(np.diag(sigma_main_test))
    print("test user_feature shape : {}".format(user_feature_test.shape))

    nb_feature_sub = 50
    
    #train
    matrix_sub_train = matrix_sub_train.dot(matrix_sub_train.transpose())
    matrix_sub_array_train = matrix_sub_train.toarray()
    u_sub_train, sigma_sub_train, vt_sub_train = randomized_svd(matrix_sub_train, n_components=100,
                                              n_iter=n_iter, random_state=42)  # random_state=42
    user_feature_sub_train = u_sub_train.dot(np.diag(sigma_sub_train))
    
    #test
    matrix_sub_test = matrix_sub_test.dot(matrix_sub_test.transpose())
    matrix_sub_array_test = matrix_sub_test.toarray()
    u_sub_test, sigma_sub_test, vt_sub_test = randomized_svd(matrix_sub_test, n_components=100,
                                              n_iter=n_iter, random_state=42)  # random_state=42
    user_feature_sub_test = u_sub_test.dot(np.diag(sigma_sub_test))
    
    
    print("train user_feature_sub shape : {}".format(user_feature_sub_train.shape))
    print("test user_feature_sub shape : {}".format(user_feature_sub_test.shape))
    print("SVD is done")

test user_feature shape : (4335, 100)
train user_feature_sub shape : (20000, 100)
test user_feature_sub shape : (4335, 100)
SVD is done


In [36]:
# saving matrices
if not RELOAD:
    #train
    np.save('matrix/weibo_u_main_train.npy',u_main_train)
    np.save('matrix/weibo_sigma_main_train.npy',sigma_main_train)
    np.save('matrix/weibo_vt_main_train.npy',vt_main_train)
    
    np.save('matrix/weibo_u_sub_train.npy',u_sub_train)
    np.save('matrix/weibo_sigma_sub_train.npy',sigma_sub_train)
    np.save('matrix/weibo_vt_sub_train.npy',vt_sub_train)
    
    #test
    np.save('matrix/weibo_u_main_test.npy',u_main_test)
    np.save('matrix/weibo_sigma_main_test.npy',sigma_main_test)
    np.save('matrix/weibo_vt_main_test.npy',vt_main_test)
    
    np.save('matrix/weibo_u_sub_test.npy',u_sub_test)
    np.save('matrix/weibo_sigma_sub_test.npy',sigma_sub_test)
    np.save('matrix/weibo_vt_sub_test.npy',vt_sub_test)

## Get Doc2Vec model

In [47]:
import jieba, re

from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

threshold = 90*24
resolution = 'hour'
from matplotlib import pyplot as plt

chinese_stopwords = '、 。 〃 〄 々 〆 〇 〈〉 《 》 「 」 『 』 【】 〒 〓 〔 〕 〖 〗 〘〙 〚 〛 〛 〜 〝 〞 〟，'
rx = '[' + re.escape(''.join(chinese_stopwords.split())) + ']'


def get_sentences(dict_,eid_list):
    sentences = []
    for eid in tqdm(eid_list):
        eid = str(eid)
        messages = dict_[eid]
        ts = np.array(messages['timestamps'], dtype=np.int32)
        text_seq = np.array(messages['text'])

        if resolution=='day':
            binsize = 3600*24
        elif resolution=='hour':
            binsize = 3600
        elif resolution=='minute':
            binsize = 60
        ts2 = sorted(ts)
        cnt, bins = np.histogram(ts2, bins=range(ts2[0],ts2[0]+threshold*binsize,binsize))


        nonzero_bins_ind = np.nonzero(cnt)[0]
        nonzero_bins = bins[nonzero_bins_ind]
        hist = cnt[nonzero_bins_ind]
        inv = nonzero_bins_ind[1:]-nonzero_bins_ind[:-1]
        intervals = np.insert(inv,0,0)

        for bid, bin_left in enumerate(nonzero_bins):
            bin_right = bin_left + binsize
            try:
                del doc
            except:
                pass
            # Collecting text to make doc
            for tid, t in enumerate(ts):
                if t<bin_left:
                    continue
                elif t>=bin_right:
                    break
                else:
                    pass
                string = text_seq[tid]
                string = re.sub(r"http\S+", "", string)
                string = re.sub("[?!.,:;()'@#$%^&*-=+/\[\[\]\]]", ' ', string) # !.,:;()'@#$%^&*-_{}=+/\"
                try:
                    doc += string
                except:
                    doc = string
            if isinstance(eid, int):
                eid_str = str(eid)
            else:
                eid_str = eid
            sentences.append(TaggedDocument(words=jieba.lcut(doc), tags=[eid_str+'_%s' % bid]))
                
    return sentences
    
train_sentences = get_sentences(train_dict_,eid_train)

print("train length of sentences : {}".format(len(train_sentences)))

HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train length of sentences : 24401
test length of sentences : 3281


In [43]:
from gensim.models import Doc2Vec

reload = False
if reload:
    doc_vectorizer = Doc2Vec.load('weibo_docTovec.model')
    print("doc_vectorizer is loaded.")
else:
    doc_vectorizer = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=8)
    doc_vectorizer.build_vocab(train_sentences)
    print("build_vocab is done.")
    doc_vectorizer.train(train_sentences,total_examples=doc_vectorizer.corpus_count,epochs=10)
    print("doc2vec training is done.")
    doc_vectorizer.save('weibo_docTovec_train.model')



build_vocab is done.


In [45]:
doc_vectorizer.wv.vectors.shape

(115577, 100)

## Create dataset

In [61]:
from sklearn.model_selection import train_test_split


def get_user_feature_in_event(dict_, eid, u_sample, user_feature_sub, user_sample2ind):
    '''Get user_feature_sub matrix for event eid'''
    user_in_event = get_user_in_event(dict_, eid, u_sample)
    nb_feature = user_feature_sub.shape[1]
    
    for uid, nb_occur in user_in_event:
        uind = user_sample2ind[uid]
        feature_vec = user_feature_sub[uind,:].reshape(1,-1)
        try:
            ret_matrix = np.concatenate((ret_matrix, feature_vec), axis=0)
        except:
            ret_matrix = feature_vec
    try:
        return ret_matrix
    except:
        ### if user_in_event is empty
        return np.zeros((1,nb_feature))

def create_dataset(dict_, eid, threshold=90, resolution='day',
                   read_text=False, embeddings_index=None, stopwords=None,
                   doc2vec_model=None, user_feature=None, user2ind=None, read_user=False, task='regression',
                   cutoff=50, return_useridx=True):
    messages = dict_[eid]
    ts = np.array(messages['timestamps'], dtype=np.int32)
    try:
        user_list = messages['uid'].tolist()
    except:
        user_list = messages['uid']
    if read_text:
        text_seq = np.array(messages['text'])
    else:
        text_seq = None
    if read_user:
        XX, XX_uidx = get_features(dict_,eid, ts, threshold=threshold, resolution=resolution, read_text=read_text,
                              text_seq=text_seq, embeddings_index=embeddings_index, stopwords=stopwords,
                              doc2vec_model=doc2vec_model, read_user=read_user,
                              user_feature=user_feature, user2ind=user2ind, user_list=user_list,
                              cutoff=cutoff, return_useridx=return_useridx)
    else:
        XX = get_features(dict_,eid, ts, threshold=threshold, resolution=resolution, read_text=read_text,
                              text_seq=text_seq, embeddings_index=embeddings_index, stopwords=stopwords,
                              doc2vec_model=doc2vec_model, read_user=read_user,
                              user_feature=user_feature, user2ind=user2ind, user_list=user_list,
                              cutoff=cutoff, return_useridx=return_useridx)

    if task=="regression":
        X = XX[:-1,:]   # (nb_sample, 2+)
        y = XX[1:,:2]
#         y = XX[1:,1]
        if len(y.shape)==1:
            return X, y.reshape(-1,1)
        elif len(y.shape)==2:
            return X, y
    elif task=="classification":
        X = XX   # (nb_sample, 2+)
        y = int(messages['label'])
        if return_useridx:
            return X, XX_uidx, y
        else:
            return X, y


def get_features(dict_,eid, timestamps, threshold=90, resolution='day', sep=False, read_text=False,
                 text_seq=None, embeddings_index=None, stopwords=None, read_user=False,
                 doc2vec_model=None, user_feature=None, user2ind=None, user_list=None,
                 cutoff=50, return_useridx=True):
    '''
    timestamps
        : relative timestamps since the first tweet
        : it should be sorted.
        : unit = second
    unit of threshold and resolution should be matched.
    '''
    ts = timestamps
    if resolution=='day':
        binsize = 3600*24
    elif resolution=='hour':
        binsize = 3600
    elif resolution=='minute':
        binsize = 60
    ts2 = sorted(ts)
    cnt, bins = np.histogram(ts2, bins=range(ts2[0],ts2[0]+threshold*binsize,binsize))
                             
    nonzero_bins_ind = np.nonzero(cnt)[0]
    nonzero_bins = bins[nonzero_bins_ind]
    
    hist = cnt[nonzero_bins_ind]
    inv = nonzero_bins_ind[1:]-nonzero_bins_ind[:-1]
    intervals = np.insert(inv,0,0)
    ### Cutoff sequence
#     cutoff = 50
    if len(hist)>cutoff:
        hist = hist[:cutoff]
        intervals = intervals[:cutoff]
        nonzero_bins = nonzero_bins[:cutoff]

    ### user feature   
    if read_user:
        X_useridx = []
        for bid, bin_left in enumerate(nonzero_bins):
            bin_userlist = []
            bin_right = bin_left + binsize
            try:
                del temp
            except:
                pass
            # Collecting text to make doc
            for tid, t in enumerate(ts):
                if t<bin_left:
                    continue
                elif t>=bin_right:
                    break
                else:
                    pass
                uid = user2ind[user_list[tid]]
                bin_userlist.append(user_list[tid])
                coef = user_feature[uid,:].reshape(1,-1)   # (1,n_components)
                try:
                    temp = np.concatenate((temp, coef), axis=0)
                except:
                    temp = coef

            X_user_bin = np.mean(temp, axis=0).reshape(1,-1)

            try:
                X_user = np.concatenate((X_user, X_user_bin), axis=0)
            except:
                X_user = X_user_bin
            X_useridx.append(bin_userlist)

    ### text feature
    if read_text:
        text_matrix = get_doc2vec(dict_,doc2vec_model, eid, nonzero_bins)
    
    if sep:
        if read_text:
            return hist, intervals, X_user, text_matrix
        else:
            return hist, intervals, X_user
    else:
        if read_text and read_user:
            if return_useridx:
                return np.hstack([hist.reshape(-1,1), intervals.reshape(-1,1), X_user, text_matrix]), X_useridx
            else:
                return np.hstack([hist.reshape(-1,1), intervals.reshape(-1,1), X_user, text_matrix])
        elif read_text or read_user:
            if read_text:
                return np.hstack([hist.reshape(-1,1), intervals.reshape(-1,1), text_matrix])
            elif read_user:
                if return_useridx:
                    return np.hstack([hist.reshape(-1,1), intervals.reshape(-1,1), X_user]), X_useridx
                else:
                    return np.hstack([hist.reshape(-1,1), intervals.reshape(-1,1), X_user])
        else:
            return np.hstack([hist.reshape(-1,1), intervals.reshape(-1,1)])
    
def get_doc2vec(dict_,doc2vec_model, eid, nonzero_bins):
    X_text = np.array([])
    if isinstance(eid, int):
            eid_str = str(eid)
    else:
        eid_str = eid
    ts = np.array(dict_[eid_str]['timestamps'], dtype=np.int32)
    text_seq= np.array(dict_[eid_str]['text'])
    for bid, bin_left in enumerate(nonzero_bins):
#         tag = eid_str+'_'+str(bid)
#         temp = doc2vec_model.docvecs[tag] # (300,)
        bin_right = bin_left + binsize
        doc = ''
        for tid, t in enumerate(ts):
            if t<bin_left:
                continue
            elif t>=bin_right:
                break
            else:
                pass
            string = text_seq[tid]
            string = re.sub(r"http\S+", "", string)
            string = re.sub("[?!.,:;()'@#$%^&*-=+/\[\[\]\]]", ' ', string) # !.,:;()'@#$%^&*-_{}=+/\"
            doc += string
        temp = doc2vec_model.infer_vector(jieba.lcut(doc))
        temp = temp.reshape(1,-1)
        if X_text.shape[0] != 0:
            X_text = np.concatenate((X_text, temp), axis=0)
        else:
            X_text = temp
    return X_text
    
### Building Model
LOAD_MODEL = False
task = "classification"  #"classification"

#train
scaler_dict_train = {}
noerr_eid_list_train = set()
X_dict_train = {}
X_uidx_dict_train = {}
subX_dict_train = {}
y_dict_train = {}

#test
scaler_dict_test = {}
noerr_eid_list_test = set()
X_dict_test = {}
X_uidx_dict_test = {}
subX_dict_test = {}
y_dict_test = {}


nb_rumor = 0
burnin = 5 if task=="regression" else 0
read_text = True
read_user = True

In [62]:
#train
for eid in tqdm(eid_train):
    eid = str(eid)
    if read_user:
        X, X_uidx, y = create_dataset(train_dict_, eid, threshold=90*24, resolution='hour',
                                 read_text=read_text, embeddings_index=None, stopwords=None,
                                 doc2vec_model=doc_vectorizer, user_feature=user_feature_train[:,:nb_feature_main], 
                                 user2ind=user2ind_train, read_user=read_user, task=task, cutoff=50,
                                 return_useridx=True)
    else:
        X, y = create_dataset(train_dict_, eid, threshold=90*24, resolution='hour',
                                 read_text=read_text, embeddings_index=None, stopwords=None,
                                 doc2vec_model=doc_vectorizer, user_feature=user_feature_train[:,:nb_feature_main], 
                                 user2ind=user2ind_train, read_user=read_user, task=task, cutoff=50,
                                 return_useridx=False)

    X = X.astype(np.float32)
    if X.shape[0]<=2*burnin:  # ignore length<=1 sequence
        continue

    X_dict_train[eid] = X
    if read_user:
        X_uidx_dict_train[eid] = X_uidx
    subX_dict_train[eid] = get_user_feature_in_event(train_dict_, eid, u_sample_train, 
                                               user_feature_sub_train[:,:nb_feature_sub], user_sample2ind_train)
    y_dict_train[eid] = y

    try:
        scaler_dict_train[eid]
    except:
        scaler_hist = MinMaxScaler(feature_range=(0,1))
        scaler_hist.fit(X[:,0].reshape(-1,1))
        scaler_interval = MinMaxScaler(feature_range=(0,1))
        scaler_interval.fit(X[:,1].reshape(-1,1))
        scaler_dict_train[eid] = (scaler_hist, scaler_interval)
        
        
#test
for eid in tqdm(eid_test):
    eid = str(eid)
    if read_user:
        X, X_uidx, y = create_dataset(test_dict_, eid, threshold=90*24, resolution='hour',
                                 read_text=read_text, embeddings_index=None, stopwords=None,
                                 doc2vec_model=doc_vectorizer, user_feature=user_feature_test[:,:nb_feature_main], 
                                 user2ind=user2ind_test, read_user=read_user, task=task, cutoff=50,
                                 return_useridx=True)
    else:
        X, y = create_dataset(test_dict_, eid, threshold=90*24, resolution='hour',
                                 read_text=read_text, embeddings_index=None, stopwords=None,
                                 doc2vec_model=doc_vectorizer, user_feature=user_feature_test[:,:nb_feature_main], 
                                 user2ind=user2ind_test, read_user=read_user, task=task, cutoff=50,
                                 return_useridx=False)

    X = X.astype(np.float32)
    if X.shape[0]<=2*burnin:  # ignore length<=1 sequence
        continue

    X_dict_test[eid] = X
    if read_user:
        X_uidx_dict_test[eid] = X_uidx
    subX_dict_test[eid] = get_user_feature_in_event(test_dict_, eid, u_sample_test, 
                                               user_feature_sub_test[:,:nb_feature_sub], user_sample2ind_test)
    y_dict_test[eid] = y

    try:
        scaler_dict_test[eid]
    except:
        scaler_hist = MinMaxScaler(feature_range=(0,1))
        scaler_hist.fit(X[:,0].reshape(-1,1))
        scaler_interval = MinMaxScaler(feature_range=(0,1))
        scaler_interval.fit(X[:,1].reshape(-1,1))
        scaler_dict_test[eid] = (scaler_hist, scaler_interval)

HBox(children=(IntProgress(value=0, max=455), HTML(value='')))

## CSI model
- Python : 2.7.x
- Keras : 1.2.1
- Theano : 0.9.0b1

In [64]:
'''
matrix_main is used for LSTM input.
matrix_sub is used for the scoring module.
'''

from keras.models import load_model
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.layers import Dense, Input, Dropout, Lambda, LSTM, Embedding, Conv1D, TimeDistributed, Add
from keras import regularizers
from keras.optimizers import Adam
from keras import backend as K

import tensorflow as tf
import keras
               
#configuration
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
keras.backend.set_session(session)

acc=0

nb_features = 2+10+100    # (#temporal, #user, #doc)
dim_hidden = 50

##### Main part #####
inputs = Input(shape=(None, nb_features))
emb_out = TimeDistributed(Dense(100, activation='tanh'))(inputs)    # W_e
emb_out = Dropout(0.2)(emb_out)
rnn_out = LSTM(dim_hidden, activation='tanh', return_sequences=False)(emb_out)    #(None, dim_hidden)
rnn_out = Dense(100, activation='tanh')(rnn_out)     # (None, 100) W_r
rnn_out = Dropout(0.2)(rnn_out)


##### Sub part #####
nb_score = 1
nb_expand = 100
sub_input = Input(shape=(None, nb_feature_sub))
user_vec = TimeDistributed(Dense(nb_expand, activation='tanh',
                                 W_regularizer=regularizers.l2(0.01)))(sub_input)   # (None, None, nb_expand)
sub_h = TimeDistributed(Dense(nb_score, activation='sigmoid'))(user_vec)    # (None, None, nb_score)
z = Lambda(lambda x: K.mean(x, axis=1), output_shape=lambda s: (s[0], s[2]))(sub_h)    #(None, nb_score)

##### Concatenate #####
out1 = Dense(1, activation='sigmoid')(rnn_out)
concat_out = Add()([out1, z])
# concat_out = merge([rnn_out, z], mode='concat', concat_axis=1)
# concat_out = concatenate([rnn_out, z], axis=1)

##### Classifier #####
# outputs = Dense(1, activation='sigmoid')(concat_out)
# outputs = Dense(1, activation='sigmoid')(concat_out)
outputs = concat_out


##### Model #####
hvector = Model(input=[inputs, sub_input], output=concat_out)
zscore = Model(input=sub_input, output=sub_h)
model = Model(input=[inputs, sub_input], output=outputs)
uvector = Model(input=sub_input, output=user_vec)
# model = Model(input=inputs, output=outputs)


##### Compile #####
adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
if task=="regression":
    model.compile(optimizer=adam,
                  loss='mean_squared_error')
elif task=="classification":
    model.compile(optimizer=adam,
                  loss='binary_crossentropy')
print("Model is compiled.")

Model is compiled.




In [65]:
len(X_dict_train.keys()) , len(X_dict_test.keys())

(3777, 455)

## Training

In [66]:
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from keras.models import load_model

tf.set_random_seed(0)
def sigmoid_array(x):                                        
    return 1 / (1 + np.exp(-x))

### Training... ###
# acc = 0
nb_epoch = 40
y_val = [int(dict_[str(eid)]['label']) for eid in eid_val]
y_val = np.array(y_val) > 0.5
train_acc = 0
val_acc = 0
    
for ep in tqdm(range(nb_epoch)):
    ##### Looping for eid_train #####
    train_losses = []
    train_preds = []
    y_train = []
    for eid in tqdm(eid_train):
        if X.shape[0]<=2*burnin:  # ignore length<=1 sequence
            continue
        eid = str(eid)
        X = X_dict_train[eid]
        X = X.astype(np.float32)
        y = y_dict_train[eid]

        label = int(train_dict_[eid]['label'])
        noerr_eid_list_train.add(eid)
        sh = scaler_dict_train[eid][0]
        si = scaler_dict_train[eid][1]
        
        ##### Main input #####
        trainX = X
        ##### Sub input #####
        sub_trainX = subX_dict_train[eid]
        
        trainY = y
        dim_output = 1
        y_train.append(y)

        h = model.fit([trainX[np.newaxis,:,:], sub_trainX[np.newaxis,:,:]], np.array([trainY]), 
                      batch_size=1, epochs=1, verbose=0)
        pred = model.predict([trainX[np.newaxis,:,:], sub_trainX[np.newaxis,:,:]],batch_size=1,verbose=0)
        train_preds.append(pred[0,0])
        train_losses.append(h.history['loss'][0])

    train_preds = np.array(train_preds)
    train_preds = train_preds>0.5
    y_train = np.array(y_train) > 0.5
    
    ### Evaluation On Validation###
    val_preds = []
    for eid in eid_val:
        if X.shape[0]<=2*burnin:  # ignore length<=1 sequence
            continue
        eid = str(eid)
        X = X_dict[eid]
        X = X.astype(np.float32)
        y = y_dict[eid]
        valX = X
        sub_valX = subX_dict[eid]
        
        pred = model.predict([np.array([valX]), np.array([sub_valX])], verbose=0)
        val_preds.append(pred[0,0])
    
    val_preds = np.array(val_preds)
    val_preds = val_preds > 0.5
    
    
    #results
    train_acc = np.mean(train_preds == y_train)
    val_acc = np.mean(val_preds == y_val)
    print("train loss:",np.mean(train_losses),' train acc:',train_acc,' val acc':val_acc)

HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.9409028450576976  train acc: 0.7866031241726238


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.45206826486271434  train acc: 0.9144823934339423


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.36045671918034666  train acc: 0.9144823934339423


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.3033729295231755  train acc: 0.9189833200953137


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.30972319877730387  train acc: 0.924013767540376


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.2755113806198767  train acc: 0.9229547259729944


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.2597500276099619  train acc: 0.9229547259729944


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.2599901805651789  train acc: 0.9197776012708498


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.2334732790412231  train acc: 0.9356632247815727


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.24174157251387654  train acc: 0.9343394228223457


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.25804582966759365  train acc: 0.9131585914747153


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.22703588017782056  train acc: 0.938046068308181


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.24451553247802932  train acc: 0.931691818903892


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.23248746707273052  train acc: 0.9377813079163357


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.22818226208784592  train acc: 0.9417527137940164


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.22003234124221924  train acc: 0.9428117553613979


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.2034245376670691  train acc: 0.9459888800635425


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.20616405606289562  train acc: 0.9457241196716971


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.1971735328841271  train acc: 0.9457241196716971


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.21812261817846146  train acc: 0.9473126820227694


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.1983312834170123  train acc: 0.9465184008472333


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.19125956989753756  train acc: 0.949166004765687


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.19891435473869823  train acc: 0.9515488482922955


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.1956644687685215  train acc: 0.9515488482922955


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.19184887898530587  train acc: 0.9510193275086047


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.1969227563669456  train acc: 0.9494307651575324


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.19051790575681538  train acc: 0.9552554937781308


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.189438936998143  train acc: 0.9531374106433678


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.1993879346510601  train acc: 0.9547259729944401


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.20067511170733643  train acc: 0.9428117553613979


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.20430181301052186  train acc: 0.9422822345777072


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.18142007219020512  train acc: 0.95128408790045


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.17980969962332302  train acc: 0.9541964522107492


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.1946598623787034  train acc: 0.95816785808843


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.17812044422250947  train acc: 0.9597564204395023


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.17539752887077145  train acc: 0.9534021710352132


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.17958060772939116  train acc: 0.9592268996558115


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.17800284877060948  train acc: 0.9605507016150384


HBox(children=(IntProgress(value=0, max=3777), HTML(value='')))

train loss: 0.17745041604880477  train acc: 0.9589621392639661


In [67]:
# Evaluate on Test
test_preds = []
y_test = [int(test_dict_[str(eid)]['label']) for eid in eid_test]
y_test = np.array(y_test) > 0.5

for eid in tqdm(eid_test):
        if X.shape[0]<=2*burnin:  # ignore length<=1 sequence
            continue
        eid = str(eid)
        X = X_dict_test[eid]
        X = X.astype(np.float32)
        y = y_dict_test[eid]

        testX = X
        sub_testX = subX_dict_test[eid]
        pred = model.predict([np.array([testX]), np.array([sub_testX])], verbose=0)
        test_preds.append(pred[0,0])

test_preds = np.array(test_preds)
test_preds = test_preds>0.5
tn, fp, fn, tp = confusion_matrix(y_test, test_preds).ravel()
precision_real = tp/(tp+fp)
precision_fake = tn/(fn+tn)
recall_real = tp/(tp+fn)
recall_fake = tn/(fp+tn)
test_acc = (tp+tn)/(tp+tn+fp+fn)
fscore_real = 2*tp/(2*tp+fp+fn)
fscore_fake = 2*tn/(2*tn+fp+fn)

ft = open('csi_accuracy_weibo.txt','w')
result = {'train_acc':train_acc,'test_acc':test_acc,'val_acc':val_acc,'precision_real':precision_real,'precision_fake':precision_fake,'recall_real':recall_real,'recall_fake':recall_fake,'fscore_real':fscore_real,'fscore_fake':fscore_fake}
print(str(result))
json.dump(str(result),ft)
ft.close()

HBox(children=(IntProgress(value=0, max=455), HTML(value='')))

{'precision_fake': 0.7321428571428571, 'fscore_real': 0.7714987714987716, 'precision_real': 0.8971428571428571, 'val': 0.9120370370370371, 'recall_fake': 0.9192825112107623, 'fscore_fake': 0.8151093439363817, 'recall_real': 0.6767241379310345, 'te': 0.7956043956043956, 'tr': 0.9589621392639661}
