In [43]:
import os
import sys
# Make the dataset package importable so that `src.var` can be imported by the
# next cell.
#
# The absolute path below only exists on the original author's machine. When it
# is missing, fall back to the parent of the current working directory so the
# notebook still runs elsewhere.
# NOTE(review): the fallback assumes the notebook is started from a direct
# sub-directory of the dataset package -- confirm against the repository layout.
module_path = '/Users/charles-francoisst-cyr/Documents/Development/INSA_4IF_PLD-Smart/packages/dataset'
if not os.path.isdir(module_path):
    module_path = os.path.abspath(os.path.join('..'))
# Guard against appending the same entry again when the cell is re-run.
if module_path not in sys.path:
    sys.path.append(module_path)

In [44]:
import os
from unidecode import unidecode
import pandas as pd
import numpy as np
from faker import Faker
from src.var import *
from tqdm import tqdm
from datetime import datetime

In [45]:
# Load the raw inputs from pickle files:
#   - `users`: a frame with (at least) a `user_id` column,
#   - `followers`: an edge list with (at least) `source` and `target` columns.
# USERS_DATA / FOLLOWERS_DATA are not defined in this notebook; presumably they
# are path constants provided by the `from src.var import *` star import.
# NOTE(review): unpickling executes arbitrary code -- only load trusted files.
users = pd.read_pickle(USERS_DATA)
followers = pd.read_pickle(FOLLOWERS_DATA)

In [46]:
# Ids that appear at least once as the `source` / `target` of a follow edge.
users_following = followers.groupby('source')['target'].count().index
users_followed = followers.groupby('target')['source'].count().index

# Keep only users present on both sides of the follow graph, then draw a fixed,
# seeded sample of 100 of them.
is_following = users['user_id'].isin(users_following)
is_followed = users['user_id'].isin(users_followed)
users_filtered = users[is_following & is_followed].sample(n=100, random_state=69)

# Restrict the edge list to edges whose two endpoints are both in the sample.
# Note that this rebinds `followers`: later cells see the filtered edge list.
sampled_ids = users_filtered['user_id']
followers = followers[followers['source'].isin(sampled_ids) & followers['target'].isin(sampled_ids)]

users_filtered

Unnamed: 0,user_id
758,7888452
738,358845982
433,197903282
304,163629705
670,93906304
...,...
580,14150874
725,15099384
660,384685422
286,85432934


In [47]:
# Seed both random sources used in this cell (Faker and numpy's global state)
# so the generated profiles are reproducible.
Faker.seed(69)
np.random.seed(69)
fake = Faker(locale=['fr-FR', 'fr-CA'])

users_data = users_filtered.copy()

# One fake full name per user; the user id itself is ignored by the generator.
users_data['name'] = users_data['user_id'].apply(lambda _user_id: fake.name())

# Username = first whitespace-separated token of the name, lower-cased,
# transliterated with unidecode, hyphens removed, followed by a random integer
# drawn from [0, 1000).
first_tokens = users_data['name'].str.split(' ').str[0]
handles = first_tokens.str.lower().apply(unidecode).str.replace('-', '')
numeric_suffixes = np.random.randint(0, 1000, len(users_data)).astype(str)
users_data['username'] = handles + numeric_suffixes

# E-mail = username + '@' + a fake free-mail domain.
users_data['email'] = users_data['username'].apply(
    lambda handle: handle + '@' + fake.free_email_domain()
)

# Birthday drawn between the two bounds below.
earliest_birthday = datetime(1955, 1, 1)
latest_birthday = datetime(2005, 12, 30)
users_data['birthday'] = users_data['user_id'].apply(
    lambda _user_id: fake.date_between(start_date=earliest_birthday, end_date=latest_birthday)
)

# Per-user scores in (0, 1), both drawn from Beta(3.5, 1.4). Credibility is
# drawn before integrity; the order matters for reproducibility.
n_users = len(users_data)
users_data['credibility'] = np.random.beta(3.5, 1.4, n_users)
users_data['integrity'] = np.random.beta(3.5, 1.4, n_users)

# Array of the ids each user follows, taken from the (already filtered) edge list.
users_data['follows'] = users_data['user_id'].apply(
    lambda user_id: followers.loc[followers['source'] == user_id, 'target'].values
)

users_data

Unnamed: 0,user_id,name,username,email,birthday,credibility,integrity,follows
758,7888452,Émile Mercier,emile54,emile54@hotmail.com,1978-07-28,0.787194,0.491461,"[16193542, 14203895]"
738,358845982,Sophie Marchand,sophie203,sophie203@gmail.com,1967-01-03,0.965551,0.388049,"[206923844, 43003845]"
433,197903282,Valérie Bernard-Gervais,valerie969,valerie969@yahoo.com,1992-07-02,0.544520,0.800116,"[43003845, 248224845]"
304,163629705,Marcelle Duchesne,marcelle619,marcelle619@hotmail.fr,1977-11-12,0.920210,0.787079,"[62609430, 121258930, 396721965]"
670,93906304,Nicolas Legault,nicolas602,nicolas602@yahoo.com,1988-06-04,0.954360,0.554540,[18069824]
...,...,...,...,...,...,...,...,...
580,14150874,Roland Arsenault-Denis,roland567,roland567@wanadoo.fr,1989-08-19,0.708332,0.476956,[]
725,15099384,Isaac Le Gay,isaac689,isaac689@gmail.com,2005-06-08,0.682367,0.440660,[]
660,384685422,Élodie Mendès,elodie733,elodie733@hotmail.com,1971-07-26,0.647809,0.940869,[]
286,85432934,Rémy Asselin-Boutin,remy405,remy405@tiscali.fr,1976-05-17,0.586111,0.465826,[]


In [48]:
np.random.seed(69)

SCALE = 3
THRESHOLD = 1.5

# Attach the credibility/integrity of both endpoints of every follow edge:
# first for the `source` user, then for the `target` user.
user_scores = users_data[['user_id', 'credibility', 'integrity']]
users_trust = followers
for endpoint in ('source', 'target'):
    users_trust = (
        pd.merge(left=users_trust, right=user_scores, left_on=endpoint, right_on='user_id')
        .rename(columns={'credibility': f'{endpoint}_credibility', 'integrity': f'{endpoint}_integrity'})
        .drop(columns='user_id')
    )

# Integrity plays no role in the trust model below.
users_trust = users_trust.drop(columns=['target_integrity', 'source_integrity'])

# Unnormalised outcome weights as a function of the absolute credibility gap:
#   trust    ~ exp(-gap / SCALE)
#   distrust ~ gap / THRESHOLD * exp(-gap / SCALE)
#   neutral  ~ whatever is left of 1, clipped to [0, 1]
credibility_gap = (users_trust['source_credibility'] - users_trust['target_credibility']).abs()
decay = np.exp(-credibility_gap / SCALE)
outcome_probs = pd.DataFrame({
    'prob_trust': decay,
    'prob_distrust': credibility_gap / THRESHOLD * decay,
})
outcome_probs['prob_neutral'] = (1 - outcome_probs['prob_trust'] - outcome_probs['prob_distrust']).clip(0, 1)

# Normalise each row so the three weights form a probability distribution.
prob_sum = outcome_probs[['prob_trust', 'prob_distrust', 'prob_neutral']].sum(axis=1)
outcome_probs = outcome_probs.div(prob_sum, axis=0)


def _draw_trust(prob_row):
    """Draw 1 (trust), -1 (distrust) or 0 (neutral) using one row of weights."""
    weights = [prob_row['prob_trust'], prob_row['prob_distrust'], prob_row['prob_neutral']]
    return np.random.choice([1, -1, 0], p=weights)


# Draws are made row by row, in frame order, from numpy's global RNG.
users_trust['trust'] = outcome_probs.apply(_draw_trust, axis=1)

users_trust = users_trust[['source', 'target', 'trust']]

users_trust

Unnamed: 0,source,target,trust
0,11784842,21391704,1
1,163629705,62609430,1
2,163629705,121258930,1
3,163629705,396721965,-1
4,80660928,11348282,1
...,...,...,...
112,16809036,12415722,1
113,21364753,12415722,-1
114,21364753,14848513,-1
115,11348282,12415722,1


In [49]:
# For every user, collect the targets of their outgoing edges labelled as
# trust (1) and as distrust (-1). Neutral edges (0) end up in neither column.
for column, trust_value in (('trustedUsers', 1), ('distrustedUsers', -1)):
    edges_with_value = users_trust[users_trust['trust'] == trust_value]
    users_data[column] = users_data['user_id'].apply(
        lambda user_id: edges_with_value.loc[edges_with_value['source'] == user_id, 'target'].values
    )

# Persist the finished user table (the row index is not written).
users_data.to_csv(USERS_DATA_CSV, index=False)

users_data

Unnamed: 0,user_id,name,username,email,birthday,credibility,integrity,follows,trustedUsers,distrustedUsers
758,7888452,Émile Mercier,emile54,emile54@hotmail.com,1978-07-28,0.787194,0.491461,"[16193542, 14203895]",[16193542],[14203895]
738,358845982,Sophie Marchand,sophie203,sophie203@gmail.com,1967-01-03,0.965551,0.388049,"[206923844, 43003845]","[206923844, 43003845]",[]
433,197903282,Valérie Bernard-Gervais,valerie969,valerie969@yahoo.com,1992-07-02,0.544520,0.800116,"[43003845, 248224845]","[43003845, 248224845]",[]
304,163629705,Marcelle Duchesne,marcelle619,marcelle619@hotmail.fr,1977-11-12,0.920210,0.787079,"[62609430, 121258930, 396721965]","[62609430, 121258930]",[396721965]
670,93906304,Nicolas Legault,nicolas602,nicolas602@yahoo.com,1988-06-04,0.954360,0.554540,[18069824],[18069824],[]
...,...,...,...,...,...,...,...,...,...,...
580,14150874,Roland Arsenault-Denis,roland567,roland567@wanadoo.fr,1989-08-19,0.708332,0.476956,[],[],[]
725,15099384,Isaac Le Gay,isaac689,isaac689@gmail.com,2005-06-08,0.682367,0.440660,[],[],[]
660,384685422,Élodie Mendès,elodie733,elodie733@hotmail.com,1971-07-26,0.647809,0.940869,[],[],[]
286,85432934,Rémy Asselin-Boutin,remy405,remy405@tiscali.fr,1976-05-17,0.586111,0.465826,[],[],[]


In [50]:
# Load the raw posts table from a pickle file. POSTS_DATA is not defined in
# this notebook; presumably it is a path constant provided by the
# `from src.var import *` star import.
# NOTE(review): unpickling executes arbitrary code -- only load trusted files.
posts = pd.read_pickle(POSTS_DATA)

# Display the frame as the cell output.
posts

Unnamed: 0,user_id,post_tags,direct
0,402970625,"[#MW3, @1UP, @ActuallyNPH, @CallOfDutyElite, @...",1
1,402970625,"[#Blops2, #COD, #MLG, #MW3, @Battlefield, @Cal...",0
2,402970625,"[#BF3, #FIFA12, #ME3, #MyMaddenCoverVote, #NHL...",0
3,402970625,"[#NOW, #NeedToLeave, #WayToGo, @MattxBarneyR, ...",0
4,402970625,"[#1, #Raw, #WWE, @CMPunk, @RandyOrton, @TheBel...",0
...,...,...,...
120719,158414847,"[#Happy420, @BarackObama, @BorowitzReport:, @C...",0
120720,158414847,"[#FF, #thankyou, @B_Hay, @ChrisRRegan, @ChrisR...",0
120721,158414847,"[#30Rock, #KalanColonRumors, @B_Hay, @B_Hay:, ...",0
120722,158414847,"[#30Rock, #FF, @BarackObama:, @BellHouseNY, @E...",0


In [51]:
np.random.seed(69)

# Keep only posts written by one of the generated users (inner join on
# `user_id`) and bring in the author's credibility, then draw a fixed, seeded
# sample of 2000 posts.
author_scores = users_data[['user_id', 'credibility']]
posts_sampled = posts.merge(author_scores, on='user_id', how='inner').sample(n=2000, random_state=69)

# Post credibility = author credibility plus Gaussian noise (std 0.2),
# clipped to [0, 1]. The author's credibility column is dropped afterwards.
noisy_credibility = np.random.normal(posts_sampled['credibility'], 0.2)
posts_data = (
    posts_sampled
    .assign(post_credibility=noisy_credibility.clip(0, 1))
    .drop(columns=['credibility'])
)

posts_data

Unnamed: 0,user_id,post_tags,direct,post_credibility
6003,13275962,"[#FireSS, @CatherineCaine, @Darkseider, @GetGl...",0,0.922121
9924,176872879,"[#BLACKOPS2, #FF, @Fwiz, @MrWoofless, @YouTube...",0,0.632771
5474,121569946,"[#2, #3, #BlackOps2, @CRACKER1188, @Cnasty006,...",0,1.000000
1511,296218728,[@oingle],0,0.729981
855,17922109,"[#myign, @Dangeresque92, @GameOverGreggy, @Gam...",0,0.063194
...,...,...,...,...
9463,93906304,"[#Romney, @BorowitzReport:, @StevenCNN, @ggree...",0,1.000000
8140,437300342,"[#Oxford, #science, #synchrotrons, @..., @Astr...",0,0.625397
9019,13649212,"[@Veronica, @buckhollywood, @garyvee, @iTunesP...",0,0.842195
7514,13274152,"[#..., @kickstarter, @youtube]",0,0.911150


In [52]:
np.random.seed(69)

reacted_post_data = posts_data.copy()

# (output column, boolean flag column of `users_react`, flag value that selects
# the users listed in the output column)
reaction_columns = (
    ('likedBy', 'like', True),
    ('dislikedBy', 'like', False),
    ('trustedBy', 'trust', True),
    ('distrustedBy', 'trust', False),
)

# Create the reaction columns empty; they are filled one post at a time below.
for column, flag, value in reaction_columns:
    reacted_post_data[column] = None

# Simulate, for every sampled post, which users react to it and how.
# All randomness comes from numpy's global RNG, so the order of the random
# calls inside the loop must not change if results are to stay reproducible.
for i, post in tqdm(posts_data.iterrows(), total=len(posts_data)):
    # Ids of the users who follow the author of this post.
    user_followers = followers[followers['target'] == post['user_id']]['source']
    follower_count = len(user_followers)

    # NOTE(review): both draws have a mean far above the upper clip bound
    # (2x resp. 8x the follower count, with std of 1/4 resp. 1/2 of it), so
    # after clipping they are almost always exactly `follower_count`. Confirm
    # whether a fraction of the followers was intended instead.
    followers_react_count = int(np.clip(np.random.normal(follower_count * 2, follower_count / 4), 0, follower_count))
    non_followers_react_count = int(np.clip(np.random.normal(follower_count * 8, follower_count / 2), 0, follower_count))

    followers_react = user_followers.sample(n=followers_react_count)
    # "Non-followers" are drawn from the full `users` table, excluding only the
    # followers selected just above.
    non_followers_react = users[~users['user_id'].isin(followers_react)].sample(n=non_followers_react_count)['user_id']
    users_react = pd.concat([followers_react, non_followers_react]).reset_index(drop=True)
    # Only reacting users that are part of `users_data` are kept, because their
    # credibility/integrity is needed; any other sampled id is dropped here.
    # The explicit copy ensures the derived columns below are written to an
    # independent frame rather than to a chained slice of `users_data`.
    users_react = users_data[users_data['user_id'].isin(users_react)][['user_id', 'credibility', 'integrity']].copy()

    # The closer a user's credibility is to the post's credibility, the more
    # likely they trust it; for liking, the gap is weighted by the user's integrity.
    users_react['credibility_diff'] = (users_react['credibility'] - post['post_credibility']).abs()
    users_react['likelihood_of_trust'] = 1 - users_react['credibility_diff']
    users_react['likelihood_of_liking'] = 1 - users_react['integrity'] * users_react['credibility_diff']
    # One uniform draw per user and per decision; trust is drawn before like.
    users_react['trust'] = users_react['likelihood_of_trust'] > np.random.rand(len(users_react))
    users_react['like'] = users_react['likelihood_of_liking'] > np.random.rand(len(users_react))

    # Each id array is wrapped in an extra list when stored in the cell; the
    # wrapper is removed after the loop.
    for column, flag, value in reaction_columns:
        reacted_post_data.loc[[i], column] = [[users_react[users_react[flag] == value]['user_id'].values]]

# Unwrap the one-element lists so every cell holds the id array itself.
for column, flag, value in reaction_columns:
    reacted_post_data[column] = reacted_post_data[column].str[0]

# Persist the posts together with their simulated reactions (row index not written).
reacted_post_data.to_csv(POSTS_DATA_CSV, index=False)

reacted_post_data

100%|██████████| 2000/2000 [00:12<00:00, 159.57it/s]


Unnamed: 0,user_id,post_tags,direct,post_credibility,likedBy,dislikedBy,trustedBy,distrustedBy
6003,13275962,"[#FireSS, @CatherineCaine, @Darkseider, @GetGl...",0,0.922121,[1548841],[],[1548841],[]
9924,176872879,"[#BLACKOPS2, #FF, @Fwiz, @MrWoofless, @YouTube...",0,0.632771,"[19898730, 34805698, 101859065]",[],"[19898730, 34805698, 101859065]",[]
5474,121569946,"[#2, #3, #BlackOps2, @CRACKER1188, @Cnasty006,...",0,1.000000,[],[],[],[]
1511,296218728,[@oingle],0,0.729981,[],[],[],[]
855,17922109,"[#myign, @Dangeresque92, @GameOverGreggy, @Gam...",0,0.063194,[],[],[],[]
...,...,...,...,...,...,...,...,...
9463,93906304,"[#Romney, @BorowitzReport:, @StevenCNN, @ggree...",0,1.000000,[18069824],[],[],[18069824]
8140,437300342,"[#Oxford, #science, #synchrotrons, @..., @Astr...",0,0.625397,[],[],[],[]
9019,13649212,"[@Veronica, @buckhollywood, @garyvee, @iTunesP...",0,0.842195,[],[],[],[]
7514,13274152,"[#..., @kickstarter, @youtube]",0,0.911150,[],[],[],[]


In [53]:
# Sanity check: summary statistics of the number of likes per post
# (`.str.len()` gives the length of each `likedBy` array).
reacted_post_data['likedBy'].str.len().describe()

count    2000.000000
mean        1.045000
std         2.151346
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max        17.000000
Name: likedBy, dtype: float64