In [1]:
import pandas as pd
from tqdm import tqdm

## Adding Frequency

In [2]:
ratings_data = pd.read_csv("raw/Yelp_checkins.txt", sep='\t', names=['uid', 'iid', 'time'])
ratings_data.head()

Unnamed: 0,uid,iid,time
0,0,0,1185638000.0
1,0,1,1185638000.0
2,0,2,1185638000.0
3,0,3,1185638000.0
4,0,4,1185638000.0


In [3]:
len(ratings_data.iid.unique())

16621

In [4]:
useritem_freq = dict()

for index, eachline in tqdm(enumerate(ratings_data.itertuples(index=True))):
    if (eachline.uid, eachline.iid) in useritem_freq.keys():
        useritem_freq[(eachline.uid, eachline.iid)] += 1
    elif (eachline.uid, eachline.iid) not in useritem_freq.keys():
        useritem_freq[(eachline.uid, eachline.iid)] = 1

301753it [00:00, 409442.15it/s]


### Creating new rating file

In [5]:
ratings = open('raw/ratings.csv', 'w')
ratings.write('userId,itemId,rating\n')
for useritem, freq in useritem_freq.items():
    ratings.write(f"{useritem[0]},{useritem[1]},{freq}\n")
ratings.close()

## POI Category

In [6]:
item_data = pd.read_csv("raw/Yelp_poi_categories.txt", sep='\t', names=['iid', 'category'])
item_data.head()

Unnamed: 0,iid,category
0,17,0
1,17,1
2,17,2
3,69,2
4,69,3


In [7]:
len(item_data.iid.unique())

16610

In [8]:
item_category = dict()

for eachline in item_data.itertuples(index=True):
    if eachline.iid in item_category.keys():
        item_category[eachline.iid].add(str(eachline.category))
    elif eachline.iid not in item_category.keys():
        item_category[eachline.iid] = {str(eachline.category)}

In [9]:
len(item_category.keys())

16610

In [10]:
pois = open('raw/pois.csv', 'w')
pois.write('itemId,category\n')
for itemId, categories in item_category.items():
    POI_categories = "|".join(categories)
    pois.write(f"{itemId},{POI_categories}\n")
pois.close()

## IDs to Index

In [11]:
ratings_data = pd.read_csv("raw/ratings.csv")
# category: Movie Geners
pois_data = pd.read_csv("raw/pois.csv")

In [12]:
ratings_data.head()

Unnamed: 0,userId,itemId,rating
0,0,0,1
1,0,1,1
2,0,2,1
3,0,3,1
4,0,4,1


In [13]:
pois_data.head()

Unnamed: 0,itemId,category
0,17,2|0|1
1,69,2|3
2,779,5|4
3,76,2|7|6
4,679,8|9


In [14]:
# convert user and item ids to user and item index (from 0 to n: the number of users/items)

uid_to_uidx = dict()
iid_to_iidx = dict()

# user and item index counter
uidx_cnt = 0
iidx_cnt = 0

for eachsample in ratings_data.itertuples(index=True):
    # users index
    if eachsample.userId not in uid_to_uidx.keys():
        uid_to_uidx[eachsample.userId] = uidx_cnt
        uidx_cnt += 1

    # items index
    if eachsample.itemId not in iid_to_iidx.keys():
        iid_to_iidx[eachsample.itemId] = iidx_cnt
        iidx_cnt += 1

In [15]:
# to write the rating data using new indices

dataset_data = open("raw/Yelp_data_map.txt", 'w')

for eachsample in ratings_data.itertuples(index=True):
    dataset_data.write(f"{str(uid_to_uidx[eachsample.userId])},{str(iid_to_iidx[eachsample.itemId])},{str(eachsample.rating)}\n")

dataset_data.close()

In [16]:
# to write the movie data using new indices

genre_data = open("raw/Yelp_cat_map.txt", 'w')

for eachsample in pois_data.itertuples(index=True):
    if eachsample.itemId in iid_to_iidx.keys():
        genre_data.write(f"{str(iid_to_iidx[eachsample.itemId])},{str(eachsample.category)}\n")

genre_data.close()