In [1]:
import os
import sys
import numpy as np
import pandas as pd

import sklearn
import networkx as nx

import matplotlib
import matplotlib.pyplot as plt
# plt.style.use('seaborn-paper')
import seaborn as sns
matplotlib.rc('text', usetex=True)

import time, datetime
import pickle

In [21]:
data_name = 'alpha'
with open('../rev2data/%s/%s_networkv2.pkl' %(data_name, data_name), 'rb') as file:
    nodes, edges = pickle.load(file)
    G = nx.Graph()
    G.add_nodes_from(nodes)
    G.add_edges_from(edges)
print(edges[0])
print(nodes[1001])

('u1810', 'p13', {'timestamp': 1353128400.0, 'weight': 0.1, 'fairness': 1.0})
('p2364', {'goodness': 1.0})


In [13]:
for e in edges:
    if e[0] == 'u7188':
        print(e)

('u7188', 'p1', {'timestamp': 1407470400.0, 'weight': 1.0, 'fairness': 1.0})


In [17]:
def load_data(data_name):
    data_list = ['alpha', 'amazon', 'epinions', 'otc']
    assert data_name in data_list
    network_df = pd.read_csv('../rev2data/%s/%s_network.csv' %(data_name, data_name), header=None, names=['src', 'dest', 'rating', 'timestamp'], parse_dates=[3], infer_datetime_format=True)
    gt_df = pd.read_csv('../rev2data/%s/%s_gt.csv' %(data_name, data_name), header=None, names=['id', 'label'])
    if data_name in ['alpha', 'amazon', 'epinions', 'otc']:
        network_df['timestamp'] = pd.to_datetime(network_df['timestamp'], unit='s')
    return network_df, gt_df

In [18]:
network_df, gt_df = load_data('alpha')

print(network_df.shape)
display(network_df.head())

print(gt_df.shape)
display(gt_df.head())

print('rating')
display(network_df['rating'].describe())

(24186, 4)


Unnamed: 0,src,dest,rating,timestamp
0,7188,1,10,1407470000.0
1,430,1,10,1376539000.0
2,3134,1,10,1369714000.0
3,3026,1,10,1350014000.0
4,3010,1,10,1347854000.0


(240, 2)


Unnamed: 0,id,label
0,3,1
1,335,1
2,28,1
3,336,1
4,333,1


rating


count    24186.000000
mean         1.463946
std          2.903656
min        -10.000000
25%          1.000000
50%          1.000000
75%          2.000000
max         10.000000
Name: rating, dtype: float64

In [41]:
user_list = network_df['src'].unique().tolist()
prod_list = network_df['dest'].unique().tolist()
rev_per_prod = network_df.shape[0]/len(prod_list)
rating_dict = network_df.groupby('dest')['rating'].mean().to_dict()
count_dict = network_df.groupby('dest')['rating'].count().to_dict()
std_dict = network_df.groupby('dest')['rating'].std().fillna(0).to_dict()

rating_max = network_df['rating'].max()
rating_min = network_df['rating'].min()

print('users %d' %len(user_list))
print('products %d' %len(prod_list))
print('reviews %d' %network_df.shape[0])
print('reviews/prod %.2f' %rev_per_prod)
print('min max %.2f %.2f' %(rating_min, rating_max))

users 3286
products 3754
reviews 24186
reviews/prod 6.44
min max -10.00 10.00


In [80]:
# target
np.random.seed(29)
T_index = np.random.randint(len(prod_list))
T = network_df['dest'][T_index]
# K sockpuppets
# k is the index of the test
k = 1
K = int(k * count_dict[T] / 10)
# K = 1
# N geniune reviews for each sockpuppets
N = 0

print('target product', T)
print('current avg rating', rating_dict[T])
print('num of rating', count_dict[T])

print('generate %d socks' %K)
print('%d reviews per sock' %N)

target product 5
current avg rating 2.671232876712329
num of rating 146
generate 1 socks
0 reviews per sock


In [81]:
def generate_sockpuppets(base_index=0, num=1):
    socks = np.arange(base_index, base_index+num).tolist()
    return socks

np.random.seed(0)
def generate_reviews(user, prod, prod_list, num):
    fr = rating_max
    if rating_dict[prod] > 0:
        fr = rating_min
    reviews = [[user, prod, fr, pd.datetime.now()]]
    fr_prods = np.random.permutation(prod_list)[:num]
    reviews += [[user, p, np.clip(np.random.normal(rating_dict[p], std_dict[p], 1)[0], a_min=rating_min, a_max=rating_max), pd.datetime.today()] for p in fr_prods]
    return reviews

socks = generate_sockpuppets(len(user_list), K)

fake_data = []
for sock in socks:
    fake_reviews = generate_reviews(sock, T, prod_list, N)
    fake_data += fake_reviews

In [82]:
fake_df = pd.DataFrame(fake_data, columns=['src', 'dest', 'rating', 'timestamp'])
display(fake_df.head(N))
print(fake_df.shape)

fake_list = pd.DataFrame({'socks': socks, 'value':-1})
display(fake_list.head())

Unnamed: 0,src,dest,rating,timestamp


(1, 4)


Unnamed: 0,socks,value
0,3286,-1


In [73]:
df = pd.concat([network_df, fake_df])
df['fairness'] = 1
df['src'] = 'u' + df['src'].astype(str)
df['dest'] = 'p' + df['dest'].astype(str)
df['weight'] = (df['rating'] - rating_min)/(rating_max - rating_min) * 2 - 1
display(df.shape)
display(df.head())

new_rating_dict = {'p'+str(p): (rating_dict[p]-rating_min)/(rating_max-rating_min)*2-1 for p in rating_dict}

G = nx.from_pandas_edgelist(df, 'src', 'dest', ['weight', 'timestamp', 'fairness'], create_using=nx.DiGraph())
print('number of totdal nodes', len(G.nodes))
for node in G.nodes:
    if node.startswith('u'):
        G.node[node]['fairness'] = 1
    else:
        G.node[node]['goodness'] = new_rating_dict[node]
print(G.node['u7188'])
print(G.node['p1'])
print(G.edges['u7188', 'p1'])

(27398, 6)

Unnamed: 0,src,dest,rating,timestamp,fairness,weight
0,u7188,p1,10.0,1407470000.0,1,1.0
1,u430,p1,10.0,1376540000.0,1,1.0
2,u3134,p1,10.0,1369710000.0,1,1.0
3,u3026,p1,10.0,1350010000.0,1,1.0
4,u3010,p1,10.0,1347850000.0,1,1.0


number of totdal nodes 7229
{'fairness': 1}
{'goodness': 0.1904522613065327}
{'weight': 1.0, 'timestamp': 1407470400.0, 'fairness': 1}


In [74]:
# nx.gpickle.write_gpickle(G, 'test.pkl')
# GG = nx.gpickle.read_gpickle('test.pkl')

{'goodness': 0.1904522613065327}