In [174]:
import numpy as np
from scipy import sparse
import csv
import math
import collections

In [175]:
def Random_Split(split_per, fake_reviewers, genuine_reviewers):
    '''
        Draws equally sized random samples of labeled spammers and non-spammers.
        Args:
            split_per: fraction applied to the size of each reviewer array
            fake_reviewers: numpy array of spammer ids
            genuine_reviewers: numpy array of non-spammer ids
        Returns:
            (labeled_spammers, labeled_nonspammers): the leading entries of a
            random permutation of each input array; both have the same length,
            the rounded fraction of the smaller class
    '''
    n_fake = fake_reviewers.shape[0]
    n_genuine = genuine_reviewers.shape[0]

    # the smaller class decides how many ids are drawn from BOTH classes
    sample_size = round(min(split_per * n_fake, split_per * n_genuine))

    # spammers are shuffled before non-spammers, so a seeded generator gives
    # the same split on every run
    labeled_spammers = np.random.permutation(fake_reviewers)[:sample_size]
    labeled_nonspammers = np.random.permutation(genuine_reviewers)[:sample_size]

    return (labeled_spammers, labeled_nonspammers)

In [176]:
def AdjacencyList(metadata_filename):
    '''
        Extracts the adjacency list and the reviewer split from a metadata file
        Args:
            metadata_filename: path of a whitespace separated text file whose
                lines start with user_id, prod_id, rating, label (further
                columns are ignored); a label of -1 marks a fake review
        Returns:
            (adjlist, f_reviewers, g_reviewers)
            adjlist: float array with one row [user_id, prod_id, rating, label]
                per non-blank line
            f_reviewers: sorted unique ids of users with at least one review
                labeled -1
            g_reviewers: sorted unique ids of all remaining users
        Raises:
            IndexError: a non-blank line has fewer than four columns
            ValueError: one of the first four columns is not numeric
    '''
    rows = []
    # the context manager closes the file even when a line fails to parse
    with open(metadata_filename, 'r') as fid:
        for edge in fid:
            items = edge.strip().split()
            if not items:
                # blank lines (e.g. a trailing newline) carry no review
                continue
            rows.append([float(items[0]), float(items[1]),
                         float(items[2]), float(items[3])])

    # reshape keeps the (0, 4) shape for an empty file
    adjlist = np.array(rows, dtype=float).reshape(-1, 4)

    reviewers = np.unique(adjlist[:, 0])
    f_reviewers = np.unique(adjlist[adjlist[:, 3] == -1, 0])
    # both inputs are sorted and unique, so this equals the original
    # "every reviewer that is not fake" scan without the quadratic lookup
    g_reviewers = np.setdiff1d(reviewers, f_reviewers)

    return (adjlist, f_reviewers, g_reviewers)

#AdjacencyList('metadata')

In [177]:
def ReshapePriors(priors):
    '''
        Turns a {id: prior_belief} dictionary into a table ordered by id
        Example: ({2: 0.3, 1: 0.5, 3: 0.4}) --> [[1, 0.5, 0.5], [2, 0.3, 0.7], [3, 0.4, 0.6]]
        Args:
            priors: dictionary with key = id (a number or a numeric string)
                and value = prior_belief
        Returns:
            float array with one row [id, prior_belief, 1-prior_belief] per
            entry, rows in ascending numeric order of id
        Raises:
            ValueError: an id or a belief cannot be converted to float
    '''
    # Ids read from the metadata file are strings; sorting the raw keys would
    # order them lexicographically ('10' before '9'), so compare as numbers.
    ordered = sorted(((float(key), float(value)) for key, value in priors.items()),
                     key=lambda pair: pair[0])

    # reshape keeps the two-column layout when the dictionary is empty
    table = np.array(ordered, dtype=float).reshape(-1, 2)

    return np.column_stack((table[:, 0], table[:, 1], 1.0 - table[:, 1]))

#print(ReshapePriors({2:3, 1:89, 4:5, 3:0}))

In [178]:
"""
	Construct account, product and review features given a review dataset.
    Author: Sihong Xie
"""

import numpy as np

import codecs
import re
import string
from datetime import datetime

import json
import pickle
import gzip
import sys
sys.path.insert(0, '../')
from iohelper import *

# Date pattern in strptime/strftime notation: four-digit year, month and day
# separated by dashes.
# NOTE(review): no function visible in this section reads this constant;
# presumably it serves date-based features defined elsewhere. Confirm before removing.
date_time_format_str = '%Y-%m-%d'



def PR_NR(data):
    """
        Share of positive and of negative reviews for every user or product.
        A rating above 3 counts as positive, a rating below 3 as negative.
        Args:
            data: dictionary with key = u_id or p_id and value = list of
                tuples (neighbor id, rating, label, posting time)
        Return:
            dictionary with key = u_id or p_id and value = (PR, NR)
        Raises:
            ZeroDivisionError: a node has an empty review list
    """
    feature = {}
    for node_id, reviews in data.items():
        total = len(reviews)
        n_positive = sum(1 for review in reviews if review[1] > 3)
        n_negative = sum(1 for review in reviews if review[1] < 3)
        feature[node_id] = (float(n_positive) / total, float(n_negative) / total)
    return feature

def avgRD_user(user_data, product_data):
    """
        Average rating deviation of each user.
        For a user i: avgRD(i) = mean(|r_ij - avg_j|) over all ratings r_ij
        of that user, where avg_j is the mean rating of product j.
        Args:
            user_data: dictionary with key = u_id and value = list of tuples
                whose first two entries are (p_id, rating)
            product_data: dictionary with key = p_id and value = list of
                tuples whose first two entries are (u_id, rating)
        Return:
            dictionary with key = u_id and value = average rating deviation,
            as defined in the paper
            Detecting product review spammers using rating behaviors, CIKM, 2010
        Raises:
            KeyError: a user reviewed a product missing from product_data
    """
    # mean rating of every product
    product_mean = {
        p_id: np.mean(np.array([review[1] for review in reviews]))
        for p_id, reviews in product_data.items()
    }

    # mean absolute gap between a user's ratings and the product means
    deviation = {}
    for u_id, reviews in user_data.items():
        gaps = [abs(review[1] - product_mean[review[0]]) for review in reviews]
        deviation[u_id] = np.mean(np.array(gaps))

    return deviation

def ERD(data):
    """
        Entropy (base 2) of the rating distribution of each user (product).
        Ratings are counted in five bins with edges 1, 2, 3, 4, 5, 6; the last
        bin also contains its right edge.
        Args:
            data: dictionary with key = node id and value = list of tuples
                whose second entry is the rating
        Return:
            dictionary with key = node id and value = entropy of the
            normalized rating histogram
    """
    edges = np.arange(1, 7)
    entropy = {}
    for node_id, reviews in data.items():
        counts, _ = np.histogram([review[1] for review in reviews], bins = edges)
        probs = counts / counts.sum()
        # empty bins are dropped so log2 is never evaluated at 0
        probs = probs[probs != 0]
        entropy[node_id] = (- probs * np.log2(probs)).sum()
    return entropy



def RD(product_data):
    """Absolute deviation of every review rating from its product's mean rating.

        Args:
            product_data: dictionary with key = p_id and value = list of
                tuples whose first two entries are (u_id, rating)
        Return:
            a dictionary with key = (u_id, p_id), value = |rating - mean rating of the product|
    """
    rd = {}
    for p_id, reviews in product_data.items():
        mean_rating = np.mean(np.array([review[1] for review in reviews]))
        rd.update({(review[0], p_id): abs(review[1] - mean_rating) for review in reviews})
    return rd

def EXT(product_data):
    """
        Flags every review whose rating, truncated to an integer, is 1 or 5.
        Args:
            product_data: dictionary with key = p_id and value = list of
                tuples (u_id, rating, label, posting time)
        Return:
            a dictionary with key = (u_id, p_id) and value = 1 (extreme) / 0 (not extreme)
    """
    return {
        (review[0], p_id): 1 if int(review[1]) in (1, 5) else 0
        for p_id, reviews in product_data.items()
        for review in reviews
    }

def DEV(product_data, beta_1=0.63):
    """
        Flags reviews whose rating deviates strongly from the mean rating of
        the target product.
        Args:
            product_data: dictionary with key = p_id and value = list of
                tuples (u_id, rating, label, posting time)
            beta_1: threshold on the normalized deviation; defaults to 0.63,
                the value that was previously hard-coded (the original notes
                say it should come from "recursive minimal entropy
                partitioning")
        Return:
            a dictionary with key = (u_id, p_id) and value = 1 when
            RD_ij / 4 > beta_1, else 0, where
            RD_ij = |r_ij - average rating of product j|
    """
    dev = {}
    # p_id is a product id
    for p_id, reviews in product_data.items():
        # average rating of this product
        p_avg_rating = np.mean(np.array([t[1] for t in reviews]))
        for t in reviews:
            u_id = t[0]
            # presumably 4.0 is the widest possible gap on a 1-5 rating scale,
            # which maps the deviation into [0, 1]
            normalized_gap = abs(p_avg_rating - t[1]) / 4.0
            dev[(u_id, p_id)] = 1 if normalized_gap > beta_1 else 0
    return dev



def ISR(user_data):
    """
        Marks the review of every user who posted exactly one review.
        Args:
            user_data: dictionary with key = u_id and value = list of tuples
                whose first entry is the id of the reviewed product
        Return:
            a dictionary with key = (u_id, p_id) and value = 1; users with
            any other number of reviews contribute no entry
    """
    return {
        (u_id, reviews[0][0]): 1
        for u_id, reviews in user_data.items()
        if len(reviews) == 1
    }



def read_graph_data(metadata_filename):
    """
        Reads the review graph from a metadata file and prints its size.
        Args:
            metadata_filename: path of a whitespace separated text file; each
                line is a tuple (user id, product id, rating, label, date)
        Return:
            (user_data, prod_data)
            user_data: key = user id, value = list of (p_id, rating, label, date)
            prod_data: key = product id, value = list of (u_id, rating, label, date)
            ids and dates stay strings, rating is a float, label an int
    """
    user_data = {}
    prod_data = {}

    # use the rt mode to read ascii strings instead of binary
    with open(metadata_filename, 'rt') as handle:
        for record in handle:
            fields = record.strip().split()
            # parse the whole line before touching either dictionary
            u_id, p_id = fields[0], fields[1]
            rating = float(fields[2])
            label = int(fields[3])
            date = fields[4]

            user_data.setdefault(u_id, []).append((p_id, rating, label, date))
            prod_data.setdefault(p_id, []).append((u_id, rating, label, date))

    print ('number of users = %d' % len(user_data))
    print ('number of products = %d' % len(prod_data))
    return user_data, prod_data

def add_feature(existing_features, new_features, feature_names):
    """
        Merges new feature(s) of a set of same-typed nodes into the existing
        feature table, in place.
        Args:
            existing_features: a dictionary {node_id: dict{feature_name: feature_value}};
                nodes not yet present are created
            new_features: a dict {node_id: feature value(s)}. With several
                feature names the value is indexed by position; with a single
                name the value is stored as is
            feature_names: list of names, in the same order as the values in new_features
    """
    several_names = len(feature_names) > 1
    for node_id, value in new_features.items():
        node = existing_features.setdefault(node_id, dict())
        if several_names:
            # value is a sequence aligned with feature_names
            for position, name in enumerate(feature_names):
                node[name] = value[position]
        else:
            # zero or one name: the value itself is the feature
            for name in feature_names:
                node[name] = value

#def construct_all_features(metadata_filename, text_feature_filename, user_feature_filename, prod_feature_filename, review_feature_filename):
def construct_all_features(user_data, prod_data, text_features):
    """
        Main entry to feature construction.
        Args:
            user_data: key = user id, value = list of (p_id, rating, label, date)
            prod_data: key = product id, value = list of (u_id, rating, label, date)
            text_features: accepted for interface compatibility but unused;
                text-based features are not handled (MN: Jan 7, 2018)
        Return:
            (UserFeatures, ProdFeatures, ReviewFeatures)
            UserFeatures: key = user id, value = dict with PR, NR, avgRD, ERD
            ProdFeatures: key = product id, value = dict with PR, NR, ERD
            ReviewFeatures: key = (user id, product id), value = dict with
                RD, EXT, DEV and, for single-review users only, ISR
    """
    # key = node id, value = dict of {feature_name: feature_value}
    UserFeatures = {}
    ProdFeatures = {}
    ReviewFeatures = {}

    # The MNR, BST, ETG and RL node features, the ETF review feature and the
    # low level text features were already disabled in the original code and
    # stay out.
    print ('\nadding user and product features......\n')
    add_feature(UserFeatures, PR_NR(user_data), ["PR", "NR"])
    add_feature(ProdFeatures, PR_NR(prod_data), ["PR", "NR"])

    add_feature(UserFeatures, avgRD_user(user_data, prod_data), ["avgRD"])

    add_feature(UserFeatures, ERD(user_data), ["ERD"])
    add_feature(ProdFeatures, ERD(prod_data), ["ERD"])

    print ('\nadding review features......\n')
    add_feature(ReviewFeatures, RD(prod_data), ['RD'])
    add_feature(ReviewFeatures, EXT(prod_data), ['EXT'])
    add_feature(ReviewFeatures, DEV(prod_data), ['DEV'])

    # ISR must iterate over users: fed with prod_data it would flag products
    # with a single review and emit (p_id, u_id) keys that never line up with
    # the (u_id, p_id) keys of RD, EXT and DEV.
    add_feature(ReviewFeatures, ISR(user_data), ['ISR'])

    return UserFeatures, ProdFeatures, ReviewFeatures

def calculateNodePriors(node_features, when_suspicious):
    """
        Calculate priors of nodes P(y=1|node) using node features.
        Args:
            node_features: a dictionary with key = node_id and value = dict of feature_name:feature_value
            when_suspicious: a dictionary with key = feature name and value =
                direction flag. The code treats '+' as "the higher the more
                suspicious" and every other value as the opposite.
                NOTE(review): the previous docstring named the flags 'H' / 'L';
                confirm which spelling load_feature_config really produces.
        Return:
            A dictionary with key = node_id and value = S score (see the SpEagle paper for the definition)
        Raises:
            ZeroDivisionError: nodes exist but none of their features is
                listed in when_suspicious
    """
    priors = {}

    # Only features that occur on some node AND are configured take part.
    # Sorting fixes the evaluation order independently of set ordering.
    feature_names = sorted({fn for v in node_features.values()
                            for fn in v if fn in when_suspicious})
    n_features = len(feature_names)

    print ('number of features: %d' % n_features)

    # one list of values per feature; a node lacking the feature counts as 0
    feature_values = {
        fn: [v[fn] if fn in v else 0 for v in node_features.values()]
        for fn in feature_names
    }

    # empirical CDF lookup {feature value: cdf} for each feature
    all_cdfs = {}
    for fn in feature_names:
        print (fn)
        bins = sorted(set(feature_values[fn]))
        print (bins[0])
        if len(bins) == 1:
            print (fn + ' has only one value.')
            # A single edge yields an empty histogram, on which the
            # normalization below fails. The sole value is also the greatest
            # one, so it receives the CDF value given to the greatest value.
            all_cdfs[fn] = {bins[0]: 1}
            continue

        # bin1=[a,b), bin2=[b,c), ... last_bin = [y,z]
        h, d = np.histogram(feature_values[fn], bins = bins)
        cdf = np.cumsum(h)
        normalizer = 1.0 / cdf[-1]
        cdf = normalizer * cdf

        # for later lookup
        print (len(bins), len(d), len(cdf))
        lookup = {d[i]: cdf[i] for i in range(1, len(d) - 1)}
        lookup[d[0]] = 0    # the least value has cdf 0
        lookup[d[-1]] = 1   # the greatest value has cdf 1
        all_cdfs[fn] = lookup

    # for each node, combine the per-feature scores into the S score
    for k, v in node_features.items():
        s = 0
        for fn in feature_names:
            x = 0 if fn not in v else v[fn]
            c = all_cdfs[fn][x]
            fx = 1.0 - c if when_suspicious[fn] == '+' else c
            s += fx * fx
        priors[k] = 1 - np.sqrt(s / n_features)

    return priors

In [179]:
def Priors(metadata_filename, u_data, p_data, prefix='/Users/anahita/datasets/YelpNYC/'):
    """
        Builds user, product and review features, derives user and product
        priors from them and stores every result as a pickle file.
        Args:
            metadata_filename: path of the metadata file, or ' ' (a single
                space) to use u_data / p_data instead of reading a file
            u_data: user review dictionary, used only when metadata_filename is ' '
            p_data: product review dictionary, used only when metadata_filename is ' '
            prefix: folder the pickle files are written to; it is concatenated
                with the file names, so it must end with a path separator.
                Defaults to the location that was previously hard-coded
        Return:
            (user_priors, prod_priors, user_data, prod_data)
        Side effects:
            writes UserFeatures.pickle, ProdFeatures.pickle,
            ReviewFeatures.pickle, UserPriors.pickle and ProdPriors.pickle
            under prefix, and reads 'feature_configuration.txt' from the
            current working directory through load_feature_config
            (NOTE(review): presumably provided by iohelper; it is not defined
            in this section)
    """
    # feature file names
    user_feature_filename = prefix + 'UserFeatures.pickle'
    prod_feature_filename = prefix + 'ProdFeatures.pickle'
    review_feature_filename = prefix + 'ReviewFeatures.pickle'

    # prior file names
    user_prior_filename = prefix + 'UserPriors.pickle'
    prod_prior_filename = prefix + 'ProdPriors.pickle'

    # feature configuration
    feature_suspicious_filename = 'feature_configuration.txt'

    # all high level features
    print ('Starting constructing high level user, product and review features\n')
    if(metadata_filename != ' '):
        user_data, prod_data = read_graph_data(metadata_filename)
    else:
        user_data, prod_data = u_data, p_data

    # low level text features are not used; an empty placeholder is passed on
    text_features = []
    UserFeatures, ProdFeatures, ReviewFeatures = construct_all_features(user_data, prod_data, text_features)
    for filename, features in ((user_feature_filename, UserFeatures),
                               (prod_feature_filename, ProdFeatures),
                               (review_feature_filename, ReviewFeatures)):
        with open(filename, 'wb') as f:
            pickle.dump(features, f)
    print ('Finished constructing high level user, product and review features\n')

    # Priors
    print ('Start calculating user, product and review priors.\n')

    feature_config = load_feature_config(feature_suspicious_filename)
    print (feature_config)

    # The features are used straight from memory; reading back the pickles
    # written a moment ago would only return copies of the same dictionaries.
    user_priors = calculateNodePriors(UserFeatures, feature_config)
    with open(user_prior_filename, 'wb') as f:
        pickle.dump(user_priors, f)

    prod_priors = calculateNodePriors(ProdFeatures, feature_config)
    with open(prod_prior_filename, 'wb') as f:
        pickle.dump(prod_priors, f)

    # review priors are not calculated, although the messages mention them
    print ('Finished calculating user, product and review priors.\n')

    return (user_priors, prod_priors, user_data, prod_data)

In [180]:
'''
    implementation of ZooBP with its helper functions
'''
def Initialize_Final_Beliefs(N1, N2, m):
    '''
        Initialization of final beliefs
        Args:
            N1: number of users
            N2: number of products
            m: coefficient for reduction in beliefs
        Returns:
            Column vector of length 2*N1 + 2*N2 holding, for every user and
            then every product, a random value in [-m/2, m/2) followed by its
            negation
        Example of return values: -0.5 0.5 -0.3 0.3 ...
    '''
    def _centered_pairs(count):
        # one (value, -value) row per node, flattened row by row into a column
        noise = m * (np.random.uniform(size=count) - 0.5)
        return np.column_stack((noise, -noise)).reshape(-1, 1)

    # users are drawn before products
    return np.vstack((_centered_pairs(N1), _centered_pairs(N2)))


def ZooBPPlus(a_list, u_priors, p_priors, fake_indices, nonfake_indices, ep, H,
              tol=1e-8, max_iter=None):
    '''
        implementation of ZooBP in python
        Args:
            a_list: array whose first three columns are [user_id, prod_id, rating];
                ratings 4 and 5 form positive edges, ratings 1, 2 and 3
                negative edges, any other rating is ignored
            u_priors: (n_user, 2) user priors in 2 classes, rows in ascending
                order of user id
            p_priors: (n_prod, 2) prod priors in 2 classes, rows in ascending
                order of product id
            fake_indices: row indices (into u_priors) of labeled spammers;
                integer valued floats are accepted
            nonfake_indices: row indices (into u_priors) of labeled nonspammers
            ep: interaction strength
            H: 2x2 compatibility matrix
            tol: the iteration stops once the summed absolute change of the
                beliefs is not above this value (default 1e-8, the former
                hard-coded value)
            max_iter: optional cap on the number of iterations; None (default)
                iterates until tol is met, as before
        Returns:
            (user_beliefs, prod_beliefs): centered final beliefs with shapes
            (n_user, 2) and (n_prod, 2)
        NOTE:
            ids are renumbered consecutively through np.unique, so the rows of
            the priors must follow the sorted order of the ids in a_list
    '''
    a_list = np.asarray(a_list)
    H = np.asarray(H, dtype=float)

    # consecutive 0-based node numbers in ascending id order
    _, user_tags = np.unique(a_list[:, 0], return_inverse=True)
    _, prod_tags = np.unique(a_list[:, 1], return_inverse=True)

    rating = a_list[:, 2]
    is_pos = np.isin(rating, (4, 5))
    is_neg = np.isin(rating, (1, 2, 3))

    # converts the given priors to the centered version
    user_priors = np.asarray(u_priors, dtype=float) - 0.5
    prod_priors = np.asarray(p_priors, dtype=float) - 0.5
    n_user = user_priors.shape[0]
    n_prod = prod_priors.shape[0]

    def _adjacency(mask):
        # user x product edge counts for the selected reviews
        return sparse.coo_matrix(
            (np.ones(int(mask.sum())), (user_tags[mask], prod_tags[mask])),
            shape=(n_user, n_prod))

    # A+ and A- as defined in section 4.7 of ZooBP
    Apos = _adjacency(is_pos)
    Aneg = _adjacency(is_neg)

    # prior beliefs stacked as user1_a user1_b ... prod1_a prod1_b ...
    E = np.concatenate((user_priors.reshape((2 * n_user, 1)),
                        prod_priors.reshape((2 * n_prod, 1))))

    # P defined under section 4.7 of ZooBP; the diagonal blocks are empty
    R = sparse.kron(Apos - Aneg, ep * H)
    P = sparse.bmat([[None, 0.5 * R], [0.5 * R.transpose(), None]])

    # Q - I defined under section 4.7 of ZooBP, built block diagonally so the
    # zero off-diagonal blocks are never materialized as dense arrays
    degree = Apos + Aneg
    user_degree = np.asarray(degree.sum(axis=1)).ravel()
    prod_degree = np.asarray(degree.sum(axis=0)).ravel()
    damping = (0.25 * ep * ep) * sparse.block_diag((
        sparse.kron(sparse.diags(user_degree, 0), H),
        sparse.kron(sparse.diags(prod_degree, 0), H)))

    # M = (P - Q + I) transposed
    M = (P - damping).transpose().tocsr()

    # Positions of the clamped users in the stacked belief vector. The cast
    # matters: the caller may hand over float arrays, which cannot be used as
    # indices.
    fake_rows = 2 * np.asarray(fake_indices, dtype=int).ravel()
    nonfake_rows = 2 * np.asarray(nonfake_indices, dtype=int).ravel()

    B = Initialize_Final_Beliefs(n_user, n_prod, 0.0001)

    # Iterative Solution
    res = 1
    n_iter = 0
    while res > tol and (max_iter is None or n_iter < max_iter):
        Bold = B
        # Equations (13) and (14) in ZooBP
        B = E + M.dot(Bold)

        # labeled spammers are pinned first, labeled nonspammers afterwards
        B[fake_rows] = 0.999 - 0.5
        B[fake_rows + 1] = 0.001 - 0.5
        B[nonfake_rows] = 0.001 - 0.5
        B[nonfake_rows + 1] = 0.999 - 0.5

        res = np.abs(Bold - B).sum()
        n_iter += 1

    user_beliefs = B[0:2 * n_user, :].reshape((n_user, 2))
    prod_beliefs = B[2 * n_user:, :].reshape((n_prod, 2))

    return (user_beliefs, prod_beliefs)

In [181]:
def PreZooBP(metadata_filename, prefix='/Users/anahita/datasets/YelpNYC/'):
    """
        Prepares the inputs of ZooBP: computes user and product priors and
        stores them, together with the adjacency list, as csv files.
        Args:
            metadata_filename: adjacency list [user, product, rating, ...]
            prefix: folder the csv files are written to; it is concatenated
                with the file names, so it must end with a path separator.
                Defaults to the location that was previously hard-coded.
                It does not change where Priors stores its pickle files.
        Side effects:
            writes userPriors.csv and prodPriors.csv (rows
            [id, prior_belief, 1-prior_belief]) and adjlist.csv (one row per
            review as returned by AdjacencyList) under prefix
    """
    # AdjacencyList returns (adjlist, fake reviewers, genuine reviewers);
    # only the edge table is stored here
    adjlist = AdjacencyList(metadata_filename)[0]

    # user and product priors
    print('Reshaping user and prod priors \n')
    Results = Priors(metadata_filename, [], [])
    user_priors = ReshapePriors(Results[0])
    prod_priors = ReshapePriors(Results[1])
    print('Done reshaping\n')

    for filename, table in (('userPriors.csv', user_priors),
                            ('prodPriors.csv', prod_priors),
                            ('adjlist.csv', adjlist)):
        # newline='' stops the csv module from emitting blank lines between
        # rows on platforms that translate line endings
        with open(prefix + filename, 'w', newline='') as csvfile:
            csv.writer(csvfile).writerows(table)

    print('Done saving\n')

In [182]:
 def MainZooBPSemiSupervised(adjlist, labeled_spammers, labeled_nonspammers, ep, H,
                             prefix='/Users/anahita/Documents/U/U2016/UIC/Research/OpinionSpamDetection/Code/src/graph/ZooBP/ZooBPPySemiSupervisedPy/'):
     """
         Runs semi-supervised ZooBP and stores the final user beliefs.
         Args:
             adjlist: array whose first three columns are [user_id, prod_id, rating]
             labeled_spammers: array of user ids known to be spammers
             labeled_nonspammers: array of user ids known to be benign
             ep: interaction strength
             H: compatibility matrix
             prefix: folder holding userPriors.csv and prodPriors.csv (rows
                 [id, prior, 1-prior]); it is concatenated with the file
                 names, so it must end with a path separator. Defaults to the
                 location that was previously hard-coded
         Return:
             None; the rows [user_id, belief_1, belief_2] are written to
             'ZooBPSemiSupervised.csv' in the current working directory
         Raises:
             ValueError: a labeled user id does not occur in userPriors.csv
     """
     def _load_priors(path):
         # Reads every non-empty csv row, so the table always has exactly as
         # many rows as the file, then orders the rows by id.
         with open(path, 'r', newline='') as csvfile:
             rows = [[float(row[0]), float(row[1]), float(row[2])]
                     for row in csv.reader(csvfile) if row]
         table = np.array(rows, dtype=float).reshape(-1, 3)
         return table[np.argsort(table[:, 0])]

     def _rows_of(sorted_ids, labeled_ids, what):
         # Integer row positions of the labeled ids inside the sorted id column.
         wanted = np.asarray(labeled_ids, dtype=float).ravel()
         if wanted.size == 0:
             return np.zeros(0, dtype=int)
         if sorted_ids.size == 0:
             raise ValueError('no user priors available to match %s ids' % what)
         # clip so an id above the largest known id is reported, not out of range
         position = np.minimum(np.searchsorted(sorted_ids, wanted), sorted_ids.size - 1)
         unknown = sorted_ids[position] != wanted
         if unknown.any():
             raise ValueError('%s ids missing from the user priors: %s'
                              % (what, wanted[unknown][:5]))
         return position

     user_priors = _load_priors(prefix + 'userPriors.csv')

     # assign values for labeled data; the positions are integers, so they can
     # index both the prior table and the belief vector inside ZooBPPlus
     fake_indices = _rows_of(user_priors[:, 0], labeled_spammers, 'labeled spammer')
     nonfake_indices = _rows_of(user_priors[:, 0], labeled_nonspammers, 'labeled nonspammer')

     #1:spammer 2:benign
     user_priors[fake_indices, 1] = .999
     user_priors[fake_indices, 2] = .001
     # nonspammer labels are applied last, as before
     user_priors[nonfake_indices, 1] = .001
     user_priors[nonfake_indices, 2] = .999

     prod_priors = _load_priors(prefix + 'prodPriors.csv')

     print('Starting running SemiSupervised ZooBP\n')
     Results = ZooBPPlus(adjlist, user_priors[:, 1:], prod_priors[:, 1:],
                         fake_indices, nonfake_indices, ep, H)
     user_beliefs = Results[0]
     print('Done running SemiSupervised ZooBP\n')

     # writing beliefs on the file
     final_beliefs = np.column_stack((user_priors[:, 0], user_beliefs))
     with open('ZooBPSemiSupervised.csv', 'w', newline='') as user_csvfile:
         csv.writer(user_csvfile).writerows(final_beliefs)
    