In [1]:
import json
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split


## USER IDs MAPPING

In [2]:
org_id = []
remap_id = []

with open('yelp_academic_dataset_user.json') as f:
    for n,line in enumerate(f):
        user_id = json.loads(line)['user_id']
        org_id.append(user_id)
        remap_id.append(n)


In [3]:
user_df = pd.DataFrame({'org_id' :org_id, 'remap_id' : remap_id })
user_df

Unnamed: 0,org_id,remap_id
0,qVc8ODYU5SZjKXVBgXdI7w,0
1,j14WgRoU_-2ZE1aw1dXrJg,1
2,2WnXYQFK0hXEoTxPtV2zvg,2
3,SZDeASXq7o05mMNLshsdIA,3
4,hA5lMy-EnncsH4JoR-hFGQ,4
...,...,...
1987892,fB3jbHi3m0L2KgGOxBv6uw,1987892
1987893,68czcr4BxJyMQ9cJBm6C7Q,1987893
1987894,1x3KMskYxOuJCjRz70xOqQ,1987894
1987895,ulfGl4tdbrH05xKzh5lnog,1987895


In [4]:
# 1 USER  = 1 ROW
user_df.org_id.value_counts().max() == 1

True

### ITEMS IDs MAPPING

In [5]:
org_id = []
with open('yelp_academic_dataset_review.json') as f:
    for line in f:
        l = json.loads(line)
        user_id = json.loads(line)['business_id']
        org_id.append(user_id)



In [6]:
# USING SET TO DEFINE UNIQUE ID FOR ITEM
org_id = [i for i in set(org_id)]
remap_id = np.arange(0, len(org_id))

In [9]:
# 1 ITEM  = 1 ROW

len(org_id) == len(remap_id)

True

In [10]:
item_df = pd.DataFrame({'org_id' :org_id, 'remap_id' : remap_id })
item_df

Unnamed: 0,org_id,remap_id
0,67sXtMXFOjTiy9SZ9UvbLg,0
1,KPgJDcVPm3AYiQsPmbMZVg,1
2,0K4RwxdAcViifyU3Htzxww,2
3,JszJI2Ewenggj8zh1o3B-Q,3
4,pC_Hqw2xvOOxnycIfR2Xww,4
...,...,...
150341,vTNm04guTgOT5hCmGSN5HA,150341
150342,p3hwoEvtIH6cQjDKh-8eOg,150342
150343,N3rj63i20ENozejfnSXeyw,150343
150344,b7ih13lmlwnut_4xpk-x6Q,150344


In [11]:
# 1 ITEM  = 1 ROW

item_df.org_id.nunique() == item_df.shape[0]

True

### INTERACTIONS

In [12]:
user = []
item = []
with open('yelp_academic_dataset_review.json') as f:
    for n,line in enumerate(f):
        user_id = json.loads(line)['user_id']
        item_id = json.loads(line)['business_id']
        user.append(user_id)
        item.append(item_id)

In [13]:
inter_df = pd.DataFrame({'user' : user, 'item' : item})
inter_df

Unnamed: 0,user,item
0,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw
1,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ
2,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A
3,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA
4,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ
...,...,...
6990275,qskILQ3k0I_qcCMI-k6_QQ,jals67o91gcrD4DC81Vk6w
6990276,Zo0th2m8Ez4gLSbHftiQvg,2vLksaMmSEcGbjI5gywpZA
6990277,mm6E4FbCMwJmb7kPDZ5v2Q,R1khUUxidqfaJmcpmGd4aw
6990278,YwAMC-jvZ1fvEUum6QkEkw,Rr9kKArrMhSLVE9a53q-aA


## INTERACTION MAPPING

In [14]:
# USER
mapp1 = pd.merge(inter_df, user_df, how='left', left_on='user', right_on='org_id')

In [96]:
mapp1

Unnamed: 0,user,item,org_id,remap_id
0,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,mh_-eMZ6K5RLWhZyISBhwA,124234.0
1,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,OyoGAe7OKpv6SyGZT5g77Q,51327.0
2,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,8g_iMtfSiwikVnbP2etR0A,98159.0
3,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,_7bHUi9Uuf5__HHc_Q8guQ,164430.0
4,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,bcjbaE6dDog4jkNY91ncLQ,150111.0
...,...,...,...,...
6990275,qskILQ3k0I_qcCMI-k6_QQ,jals67o91gcrD4DC81Vk6w,qskILQ3k0I_qcCMI-k6_QQ,246849.0
6990276,Zo0th2m8Ez4gLSbHftiQvg,2vLksaMmSEcGbjI5gywpZA,Zo0th2m8Ez4gLSbHftiQvg,1192958.0
6990277,mm6E4FbCMwJmb7kPDZ5v2Q,R1khUUxidqfaJmcpmGd4aw,mm6E4FbCMwJmb7kPDZ5v2Q,116945.0
6990278,YwAMC-jvZ1fvEUum6QkEkw,Rr9kKArrMhSLVE9a53q-aA,YwAMC-jvZ1fvEUum6QkEkw,199546.0


In [15]:
 mapp2 = pd.merge(mapp1, item_df, how='left', left_on='item', right_on='org_id')

In [16]:
mapp2

Unnamed: 0,user,item,org_id_x,remap_id_x,org_id_y,remap_id_y
0,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,mh_-eMZ6K5RLWhZyISBhwA,124234.0,XQfwVwDr-v0ZS3_CbbE5Xw,41626
1,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,OyoGAe7OKpv6SyGZT5g77Q,51327.0,7ATYjTIgM3jUlt4UM3IypQ,110419
2,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,8g_iMtfSiwikVnbP2etR0A,98159.0,YjUWPpI6HXG530lwP-fb2A,124380
3,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,_7bHUi9Uuf5__HHc_Q8guQ,164430.0,kxX2SOes4o-D3ZQBkiMRfA,53248
4,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,bcjbaE6dDog4jkNY91ncLQ,150111.0,e4Vwtrqf-wpJfwesgvdgxQ,39816
...,...,...,...,...,...,...
6990275,qskILQ3k0I_qcCMI-k6_QQ,jals67o91gcrD4DC81Vk6w,qskILQ3k0I_qcCMI-k6_QQ,246849.0,jals67o91gcrD4DC81Vk6w,82931
6990276,Zo0th2m8Ez4gLSbHftiQvg,2vLksaMmSEcGbjI5gywpZA,Zo0th2m8Ez4gLSbHftiQvg,1192958.0,2vLksaMmSEcGbjI5gywpZA,63048
6990277,mm6E4FbCMwJmb7kPDZ5v2Q,R1khUUxidqfaJmcpmGd4aw,mm6E4FbCMwJmb7kPDZ5v2Q,116945.0,R1khUUxidqfaJmcpmGd4aw,44094
6990278,YwAMC-jvZ1fvEUum6QkEkw,Rr9kKArrMhSLVE9a53q-aA,YwAMC-jvZ1fvEUum6QkEkw,199546.0,Rr9kKArrMhSLVE9a53q-aA,18820


In [17]:
df = mapp2.drop(['user','item', 'org_id_x', 'org_id_y'], axis = 1)

In [18]:
df.columns = ['user_id', 'inter']
df = df.dropna()
df.user_id = df.user_id.astype(int)

In [19]:
print(df.user_id.nunique())
print(df.inter.nunique())


1987897
150346


In [91]:
df['user_int'] = df.groupby('user_id')['user_id'].transform('size')
##### **10-core setting [10], i.e., retaining users and items with at least ten interactions.**
dfs = df[df['user_int']>=10 ]


In [92]:
dfs

Unnamed: 0,user_id,inter,user_int
0,124234,41626,28
2,98159,124380,49
4,150111,39816,121
10,35411,129351,18
13,24713,5230,111
...,...,...,...
6990274,55742,26431,168
6990275,246849,82931,283
6990277,116945,44094,157
6990278,199546,18820,84


In [93]:
dfs = dfs.groupby('user_id').agg({'inter':lambda x: list(x)})
dfs['user_int'] =  dfs['inter'].apply(len)
dfs

Unnamed: 0_level_0,inter,user_int
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[43134, 92669, 147552, 109541, 18961, 131759, ...",16
1,"[107406, 65677, 82050, 55662, 91928, 51879, 91...",78
2,"[39002, 39002, 48666, 11207, 18961, 142512, 22...",16
3,"[121759, 107406, 6583, 31487, 24319, 29605, 21...",30
4,"[138073, 143962, 85097, 75651, 118400, 102888,...",11
...,...,...
1483071,"[111609, 39996, 83655, 75298, 104196, 73747, 1...",10
1490033,"[8890, 3207, 61076, 91016, 47762, 59906, 90231...",10
1491359,"[42010, 122341, 53069, 53069, 79858, 80198, 33...",11
1584102,"[32536, 60153, 60153, 60153, 60153, 60153, 601...",12


In [117]:
# For each dataset, we randomly select 80% of historical
# interactions of each user to constitute the training set, and treat
# the remaining as the test set. From the training set, we randomly
# select 10% of interactions as validation set to tune hyper-parameters.
dfs['train'] = ''
dfs['test'] = ''
dfs['user_int'] =  dfs['inter'].apply(len)

train = []
test = []

for idx, row in dfs.iterrows():
    split = train_test_split(row['inter'],test_size=0.2, random_state=42)
    train.append(split[0])
    test.append(split[1])

In [123]:
dfs['train'] = train
dfs['test'] = test

In [140]:
dfs

Unnamed: 0_level_0,inter,user_int,train,test
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"[43134, 92669, 147552, 109541, 18961, 131759, ...",16,"[133533, 62449, 5994, 40427, 147552, 73519, 18...","[43134, 92669, 131759, 41411]"
1,"[107406, 65677, 82050, 55662, 91928, 51879, 91...",78,"[45136, 79228, 69635, 51879, 105186, 107131, 8...","[100832, 107406, 10442, 100208, 51820, 86838, ..."
2,"[39002, 39002, 48666, 11207, 18961, 142512, 22...",16,"[82909, 88453, 30515, 36578, 48666, 61506, 189...","[39002, 39002, 142512, 75135]"
3,"[121759, 107406, 6583, 31487, 24319, 29605, 21...",30,"[145055, 108357, 23660, 121759, 24319, 85646, ...","[21474, 57303, 126392, 88375, 19862, 8073]"
4,"[138073, 143962, 85097, 75651, 118400, 102888,...",11,"[75242, 85097, 143962, 33481, 118400, 55979, 7...","[102888, 138073, 70061]"
...,...,...,...,...
1483071,"[111609, 39996, 83655, 75298, 104196, 73747, 1...",10,"[73747, 111609, 40069, 83655, 64366, 104196, 7...","[126086, 39996]"
1490033,"[8890, 3207, 61076, 91016, 47762, 59906, 90231...",10,"[59906, 8890, 946, 61076, 73781, 47762, 91016,...","[93854, 3207]"
1491359,"[42010, 122341, 53069, 53069, 79858, 80198, 33...",11,"[88453, 53069, 122341, 95892, 79858, 33026, 53...","[80198, 42010, 33026]"
1584102,"[32536, 60153, 60153, 60153, 60153, 60153, 601...",12,"[60153, 60153, 60153, 60153, 60153, 60153, 601...","[60153, 60153, 32536]"


In [141]:
dfs.to_csv('interactions_tidy.csv')

In [130]:
dfs_str = dfs.copy()
dfs_str['inter'] = [' '.join(map(str, l)) for l in dfs_str['inter']]
dfs_str['train'] = [' '.join(map(str, l)) for l in dfs_str['train']]
dfs_str['test'] = [' '.join(map(str, l)) for l in dfs_str['test']]



In [131]:
dfs_str

Unnamed: 0_level_0,inter,user_int,train,test
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,43134 92669 147552 109541 18961 131759 139126 ...,16,133533 62449 5994 40427 147552 73519 18961 106...,43134 92669 131759 41411
1,107406 65677 82050 55662 91928 51879 91597 113...,78,45136 79228 69635 51879 105186 107131 87723 12...,100832 107406 10442 100208 51820 86838 4294 91...
2,39002 39002 48666 11207 18961 142512 22225 136...,16,82909 88453 30515 36578 48666 61506 18961 1360...,39002 39002 142512 75135
3,121759 107406 6583 31487 24319 29605 21793 253...,30,145055 108357 23660 121759 24319 85646 29605 3...,21474 57303 126392 88375 19862 8073
4,138073 143962 85097 75651 118400 102888 103162...,11,75242 85097 143962 33481 118400 55979 75651 10...,102888 138073 70061
...,...,...,...,...
1483071,111609 39996 83655 75298 104196 73747 109354 4...,10,73747 111609 40069 83655 64366 104196 75298 10...,126086 39996
1490033,8890 3207 61076 91016 47762 59906 90231 946 93...,10,59906 8890 946 61076 73781 47762 91016 90231,93854 3207
1491359,42010 122341 53069 53069 79858 80198 33026 330...,11,88453 53069 122341 95892 79858 33026 53069 33026,80198 42010 33026
1584102,32536 60153 60153 60153 60153 60153 60153 6015...,12,60153 60153 60153 60153 60153 60153 60153 6015...,60153 60153 32536


In [137]:
dfs_str = dfs_str.drop('inter', axis = 1)
dfs_str.reset_index(inplace=True)
dfs_str

Unnamed: 0,user_id,user_int,train,test
0,0,16,133533 62449 5994 40427 147552 73519 18961 106...,43134 92669 131759 41411
1,1,78,45136 79228 69635 51879 105186 107131 87723 12...,100832 107406 10442 100208 51820 86838 4294 91...
2,2,16,82909 88453 30515 36578 48666 61506 18961 1360...,39002 39002 142512 75135
3,3,30,145055 108357 23660 121759 24319 85646 29605 3...,21474 57303 126392 88375 19862 8073
4,4,11,75242 85097 143962 33481 118400 55979 75651 10...,102888 138073 70061
...,...,...,...,...
117365,1483071,10,73747 111609 40069 83655 64366 104196 75298 10...,126086 39996
117366,1490033,10,59906 8890 946 61076 73781 47762 91016 90231,93854 3207
117367,1491359,11,88453 53069 122341 95892 79858 33026 53069 33026,80198 42010 33026
117368,1584102,12,60153 60153 60153 60153 60153 60153 60153 6015...,60153 60153 32536


## GENERATING TRAIN AND TEST THE SAM WAY AS REQUIRED BY THE CODE

In [142]:
with open('train.txt', 'w') as f:
    for r,c in dfs_str.iterrows():
        u = str(c['user_id'])
        i = str(c['train'])
        f.write(f'{u} {i}' + '\n')

In [143]:
with open('test.txt', 'w') as f:
    for r,c in dfs_str.iterrows():
        u = str(c['user_id'])
        i = str(c['test'])
        f.write(f'{u} {i}' + '\n')