In [None]:
import os
import sys
# Put the parent directory on the import path so that packages located one
# level above this notebook (such as the `src` package imported below) resolve.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:
import os
from unidecode import unidecode
import pandas as pd
import numpy as np
from faker import Faker
from src.var import *
from tqdm import tqdm
from datetime import datetime

In [None]:
# Load the raw user table and the follower table (its `source`/`target`
# columns are used below) from their pickle files.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
users, followers = (pd.read_pickle(path) for path in (USERS_DATA, FOLLOWERS_DATA))

In [None]:
# Ids that appear at least once in `source` (they follow someone) and at least
# once in `target` (someone follows them); the group keys are exactly those ids.
users_following = followers.groupby('source')['target'].count().index
users_followed = followers.groupby('target')['source'].count().index

# Keep only the users that are present on both sides of the follower table.
follows_someone = users['user_id'].isin(users_following)
is_followed = users['user_id'].isin(users_followed)
users_filtered = users[follows_someone & is_followed]

users_filtered

In [None]:
# Seed both random sources used in this cell so the synthetic data is
# reproducible. The order of the random draws below must not change, otherwise
# the generated values change as well.
Faker.seed(69)
np.random.seed(69)
fake = Faker(locale=['fr-FR', 'fr-CA'])

users_data = users_filtered.copy()

# Synthetic identity: one fake full name per user.
users_data['name'] = users_data['user_id'].apply(lambda x: fake.name())

# Username = lower-cased, ASCII-folded first token of the name with hyphens
# removed, followed by a random integer in [0, 1000).
# NOTE(review): the suffix range is small, so usernames (and therefore emails)
# are not guaranteed to be unique.
first_names = users_data['name'].str.split(' ').str[0].str.lower().apply(unidecode).str.replace('-', '')
suffixes = np.random.randint(0, 1000, len(users_data)).astype(str)
users_data['username'] = first_names + suffixes

# Email = username + '@' + a random free email domain.
users_data['email'] = (users_data['username'] + '@').apply(lambda x: x + fake.free_email_domain())

# Birthday drawn between 1955-01-01 and 2005-12-30.
users_data['birthday'] = users_data['user_id'].apply(
    lambda x: fake.date_between(start_date=datetime(1955, 1, 1), end_date=datetime(2005, 12, 30))
)

# Latent per-user scores in (0, 1), both drawn from Beta(3.5, 1.4).
users_data['credibility'] = np.random.beta(3.5, 1.4, len(users_data))
users_data['integrity'] = np.random.beta(3.5, 1.4, len(users_data))

# Array of followed ids (`target`) for every user. The follower table is
# grouped once instead of being re-scanned for every user; row order inside
# each group is the original row order, so the arrays match a per-user filter.
follows_by_source = {
    source: targets.values
    for source, targets in followers.groupby('source', sort=False)['target']
}
no_follows = followers['target'].values[:0]  # empty array with the column's dtype
users_data['follows'] = users_data['user_id'].apply(lambda x: follows_by_source.get(x, no_follows))

users_data

In [None]:
np.random.seed(69)

SCALE = 3
THRESHOLD = 1.5


def _attach_scores(edges, endpoint):
    """Inner-join the scores of the user found in column `endpoint` onto `edges`.

    `endpoint` is the edge column ('source' or 'target') holding the user id.
    The joined columns are renamed to `<endpoint>_credibility` and
    `<endpoint>_integrity`, and the join helper column `user_id` is dropped.
    """
    joined = pd.merge(
        left=edges,
        right=users_data[['user_id', 'credibility', 'integrity']],
        left_on=endpoint,
        right_on='user_id',
    )
    joined = joined.rename(columns={
        'credibility': endpoint + '_credibility',
        'integrity': endpoint + '_integrity',
    })
    return joined.drop(columns='user_id')


def _sample_trust(edge):
    """Draw one label for an edge row: 1 (trust), -1 (distrust) or 0 (neutral)."""
    weights = [edge['prob_trust'], edge['prob_distrust'], edge['prob_neutral']]
    return np.random.choice([1, -1, 0], p=weights)


# Follower edges enriched with the credibility of both endpoints. Because the
# joins are inner joins, only edges whose two ends are in `users_data` remain.
users_trust = _attach_scores(_attach_scores(followers, 'source'), 'target')
users_trust = users_trust.drop(columns=['target_integrity', 'source_integrity'])

# Unnormalised outcome weights as functions of the credibility gap d:
#   trust    = exp(-d / SCALE)
#   distrust = d / THRESHOLD * exp(-d / SCALE)
#   neutral  = 1 - trust - distrust, clipped to [0, 1]
credibility_gap = (users_trust['source_credibility'] - users_trust['target_credibility']).abs()
users_trust['diff'] = credibility_gap
users_trust['prob_trust'] = np.exp(-credibility_gap / SCALE)
users_trust['prob_distrust'] = credibility_gap / THRESHOLD * np.exp(-credibility_gap / SCALE)
users_trust['prob_neutral'] = (1 - users_trust['prob_trust'] - users_trust['prob_distrust']).clip(0, 1)

# Rescale the three weights so that every row sums to 1.
prob_columns = ['prob_trust', 'prob_distrust', 'prob_neutral']
users_trust['prob_sum'] = users_trust[prob_columns].sum(axis=1)
users_trust[prob_columns] = users_trust[prob_columns].div(users_trust['prob_sum'], axis=0)

# One categorical draw per edge, row by row (keeps the random stream order).
users_trust['trust'] = users_trust.apply(_sample_trust, axis=1)

users_trust = users_trust[['source', 'target', 'trust']]

users_trust

In [None]:
def _targets_by_source(edges):
    """Map every `source` id in `edges` to the array of its `target` ids.

    The frame is grouped in a single pass; row order inside each group is the
    original row order, so each array equals what a per-user boolean filter
    on `edges` would return.
    """
    return {
        source: targets.values
        for source, targets in edges.groupby('source', sort=False)['target']
    }


# Build the lookups once instead of re-scanning `users_trust` for every user.
trusted_by_source = _targets_by_source(users_trust[users_trust['trust'] == 1])
distrusted_by_source = _targets_by_source(users_trust[users_trust['trust'] == -1])
no_targets = users_trust['target'].values[:0]  # empty array with the column's dtype

# Users without any trusted / distrusted edge get an empty array.
users_data['trustedUsers'] = users_data['user_id'].apply(
    lambda user_id: trusted_by_source.get(user_id, no_targets)
)
users_data['distrustedUsers'] = users_data['user_id'].apply(
    lambda user_id: distrusted_by_source.get(user_id, no_targets)
)

# NOTE(review): the array-valued columns are written to CSV through their text
# form; numpy abbreviates long arrays with '...' when printing (default
# threshold 1000 elements), so very long lists may be truncated in the file —
# verify, and consider serialising them explicitly if that is a problem.
users_data.to_csv(USERS_DATA_CSV, index=False)

users_data

In [None]:
# Load the raw posts table from its pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
posts = pd.read_pickle(POSTS_DATA)

# Show the loaded frame as the cell output.
posts

In [None]:
np.random.seed(69)

# Attach each author's credibility to their posts; the inner join drops posts
# whose author is not present in `users_data`.
author_credibility = users_data[['user_id', 'credibility']]
posts_data = posts.merge(author_credibility, on='user_id', how='inner')

# A post's credibility is its author's credibility plus Gaussian noise
# (standard deviation 0.2), limited to the [0, 1] interval.
noisy_credibility = np.random.normal(loc=posts_data['credibility'], scale=0.2)
posts_data['post_credibility'] = np.clip(noisy_credibility, 0, 1)
posts_data = posts_data.drop(columns=['credibility'])

posts_data

In [None]:
np.random.seed(69)

REACTION_COLUMNS = ['likedBy', 'dislikedBy', 'trustedBy', 'distrustedBy']

# Loop-invariant lookups, built once so each post does not re-scan full frames.
followers_by_target = {
    target: sources
    for target, sources in followers.groupby('target', sort=False)['source']
}
no_followers = followers['source'].iloc[:0]  # empty Series for authors nobody follows
all_user_ids = users['user_id']
reactor_profiles = users_data[['user_id', 'credibility', 'integrity']]

# One object array per reaction column; each slot receives an array of user ids.
n_posts = len(posts_data)
reactions = {column: np.empty(n_posts, dtype=object) for column in REACTION_COLUMNS}

for pos, (_, post) in enumerate(tqdm(posts_data.iterrows(), total=n_posts)):
    user_followers = followers_by_target.get(post['user_id'], no_followers)
    n_followers = len(user_followers)

    # Number of reacting followers ~ N(n/2, n/4) and of other reacting users
    # ~ N(n, n/2), both truncated to [0, n] and rounded toward zero.
    followers_react_count = int(np.clip(np.random.normal(n_followers / 2, n_followers / 4), 0, n_followers))
    non_followers_react_count = int(np.clip(np.random.normal(n_followers, n_followers / 2), 0, n_followers))

    followers_react = user_followers.sample(n=followers_react_count)

    # NOTE(review): only the followers already sampled above are excluded from
    # this pool, so a non-sampled follower (or the author) can still be drawn
    # here — confirm whether all followers should be excluded instead.
    candidate_ids = all_user_ids[~all_user_ids.isin(followers_react)]
    # Never request more rows than the pool holds (sampling would raise).
    non_followers_react = candidate_ids.sample(n=min(non_followers_react_count, len(candidate_ids)))

    # Only reactors that exist in `users_data` have scores, so the others are
    # dropped here. `.copy()` makes the column assignments below act on an
    # independent frame rather than on a slice of `users_data`.
    reacting_ids = pd.concat([followers_react, non_followers_react]).reset_index(drop=True)
    users_react = reactor_profiles[reactor_profiles['user_id'].isin(reacting_ids)].copy()

    # The closer a reactor's credibility is to the post's, the more likely they
    # trust it; for liking, the gap is weighted by the reactor's integrity.
    users_react['credibility_diff'] = (users_react['credibility'] - post['post_credibility']).abs()
    users_react['likelihood_of_trust'] = 1 - users_react['credibility_diff']
    users_react['likelihood_of_liking'] = 1 - users_react['integrity'] * users_react['credibility_diff']
    users_react['trust'] = users_react['likelihood_of_trust'] > np.random.rand(len(users_react))
    users_react['like'] = users_react['likelihood_of_liking'] > np.random.rand(len(users_react))

    # Every reactor lands in exactly one of liked/disliked and one of
    # trusted/distrusted.
    reactions['likedBy'][pos] = users_react.loc[users_react['like'], 'user_id'].values
    reactions['dislikedBy'][pos] = users_react.loc[~users_react['like'], 'user_id'].values
    reactions['trustedBy'][pos] = users_react.loc[users_react['trust'], 'user_id'].values
    reactions['distrustedBy'][pos] = users_react.loc[~users_react['trust'], 'user_id'].values

# The arrays were filled in row order of `posts_data`, so they line up with it.
reacted_post_data = posts_data.assign(**reactions)

# Save the frame that carries the reaction columns. Writing `posts_data` here
# would silently discard everything computed by the loop above.
# NOTE(review): array-valued cells are written through their text form, which
# numpy abbreviates with '...' for long arrays (default threshold 1000
# elements) — verify this cannot truncate the id lists.
reacted_post_data.to_csv(POSTS_DATA_CSV, index=False)

reacted_post_data