#MovieLens 1M Dataset

A widely used dataset for recommendatitions. This dataset contains 1 000 209 anonymous ratings of approximately 3 900 movies made by 6 040 users who
joined in 2000

In [1]:
import numpy as np
import pandas as pd
import os
import json

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
reviews = pd.read_csv('/content/drive/MyDrive/Datasets/ratings.dat', names=['userId', 'movieId', 'rating', 'time'], delimiter='::', engine='python')
movies = pd.read_csv('/content/drive/MyDrive/Datasets/movies.dat', names=['movieId', 'movie_names', 'genres'], delimiter='::', encoding='ISO-8859-1')
users = pd.read_csv('/content/drive/MyDrive/Datasets/users.dat', names=['userId', 'gender', 'age', 'occupation', 'zip'], delimiter='::', engine='python')

  movies = pd.read_csv('/content/drive/MyDrive/Datasets/movies.dat', names=['movieId', 'movie_names', 'genres'], delimiter='::', encoding='ISO-8859-1')


In [4]:
print('Reviews shape:', reviews.shape)
print('Users shape:', users.shape)
print('Movies shape:', movies.shape)

Reviews shape: (1000209, 4)
Users shape: (6040, 5)
Movies shape: (3883, 3)


In [5]:
users.head()

Unnamed: 0,userId,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [6]:
reviews.head()

Unnamed: 0,userId,movieId,rating,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
# we drop users' zip code because we will not use it
users.drop(['zip'], axis=1, inplace=True)

In [8]:
movielens_df = reviews.merge(movies, on='movieId', how='left').merge(users, on='userId', how='left')

print('Merged dataframe shape:', movielens_df.shape)

Merged dataframe shape: (1000209, 9)


In [9]:
#check the dataset for any missing values
movielens_df.isna().sum()

userId         0
movieId        0
rating         0
time           0
movie_names    0
genres         0
gender         0
age            0
occupation     0
dtype: int64

For the MovieLens1M dataset, we consider all ratings as implicit
feedback, where each rating score is converted to either 1 or 0 to
indicate whether a user rated a movie

In [10]:
threshold = 3
movielens_df['implicit_feedback'] = (movielens_df['rating'] >= threshold).astype(int)

In [11]:
movielens_df

Unnamed: 0,userId,movieId,rating,time,movie_names,genres,gender,age,occupation,implicit_feedback
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,1
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,1
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,1
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,1
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,1
...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,Weekend at Bernie's (1989),Comedy,M,25,6,0
1000205,6040,1094,5,956704887,"Crying Game, The (1992)",Drama|Romance|War,M,25,6,1
1000206,6040,562,5,956704746,Welcome to the Dollhouse (1995),Comedy|Drama,M,25,6,1
1000207,6040,1096,4,956715648,Sophie's Choice (1982),Drama,M,25,6,1


#Gowalla

A location-based social networking website where users share their locations by checking-in. It mainly collects the check-ins of these users over the period from Feb. 2009 to Oct. 2010.


In [12]:
gowalla_df = pd.read_csv('/content/drive/MyDrive/Datasets/Gowalla_totalCheckins.txt', sep='\t', header=None)
gowalla_df.columns = ['userid','timestamp','latitude','longitude','spotid']
gowalla_df.head()

Unnamed: 0,userid,timestamp,latitude,longitude,spotid
0,0,2010-10-19T23:55:27Z,30.235909,-97.79514,22847
1,0,2010-10-18T22:17:43Z,30.269103,-97.749395,420315
2,0,2010-10-17T23:42:03Z,30.255731,-97.763386,316637
3,0,2010-10-17T19:26:05Z,30.263418,-97.757597,16516
4,0,2010-10-16T18:50:42Z,30.274292,-97.740523,5535878


In [13]:
gowalla_df.isna().sum()

userid       0
timestamp    0
latitude     0
longitude    0
spotid       0
dtype: int64

For sparser dataset such as Gowalla, we use the 10-core setting to ensure that all users and items have at least 10 interactions

In [14]:
# Count interactions per user and item
user_interactions = gowalla_df['userid'].value_counts()
item_interactions = gowalla_df['spotid'].value_counts()

# Keep only users with at least 10 interactions
selected_users = user_interactions[user_interactions >= 10].index
gowalla_df = gowalla_df[gowalla_df['userid'].isin(selected_users)]

# Keep only items (spots) with at least 10 interactions
selected_items = item_interactions[item_interactions >= 10].index
gowalla_df = gowalla_df[gowalla_df['spotid'].isin(selected_items)]

gowalla_df.reset_index(drop=True, inplace=True)

In [15]:
gowalla_df

Unnamed: 0,userid,timestamp,latitude,longitude,spotid
0,0,2010-10-19T23:55:27Z,30.235909,-97.795140,22847
1,0,2010-10-18T22:17:43Z,30.269103,-97.749395,420315
2,0,2010-10-17T23:42:03Z,30.255731,-97.763386,316637
3,0,2010-10-17T19:26:05Z,30.263418,-97.757597,16516
4,0,2010-10-16T18:50:42Z,30.274292,-97.740523,5535878
...,...,...,...,...,...
3487253,196578,2010-06-12T10:47:18Z,51.743782,-0.495793,1160482
3487254,196578,2010-06-11T15:42:58Z,51.746712,-0.514305,467635
3487255,196578,2010-06-11T15:42:20Z,51.746296,-0.487218,797460
3487256,196578,2010-06-11T13:32:26Z,51.742988,-0.488065,906885


#Yelp2018

Released by the Yelp challenge that consists of a subset of the businesses, reviews, and user data. The Yelp2018 version is used in the experiments.


In [16]:
#yelp_business = pd.read_csv('/content/drive/MyDrive/Datasets/yelp_business.csv')
#yelp_user = pd.read_csv('/content/drive/MyDrive/Datasets/yelp_user.csv', nrows = 10000)
yelp_review = pd.read_csv('/content/drive/MyDrive/Datasets/yelp_review.csv',nrows = 100000)

We use the 10-core setting to ensure that all users and items have at least 10 interactions

In [17]:
# Assuming 'user_id' and 'business_id' are the columns representing users and items respectively in the review dataset
user_interactions = yelp_review['user_id'].value_counts()
item_interactions = yelp_review['business_id'].value_counts()

# Keep only users with at least 10 interactions
selected_users = user_interactions[user_interactions >= 10].index
yelp_review = yelp_review[yelp_review['user_id'].isin(selected_users)]

# Keep only items (businesses) with at least 10 interactions
selected_items = item_interactions[item_interactions >= 10].index
yelp_review = yelp_review[yelp_review['business_id'].isin(selected_items)]

# Merge user and review datasets based on user_id
#merged_yelp_df = pd.merge(yelp_user, yelp_review, left_on='user_id', right_on='user_id', how='inner')
#merged_yelp_df.reset_index(drop=True, inplace=True)

In [18]:
yelp_review

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
13,lsoSqIrrDbQvWpMvsSj2xw,u0LXt3Uea_GidxRW1xcsfg,RtUvSWO_UZ8V3Wpj0n077w,3,2012-12-03,Wanted to check out this place due to all the ...,2,1,1
15,FunI9om-aK5oMIIJmhMlZA,u0LXt3Uea_GidxRW1xcsfg,0W4lkclzZThpx3V65bVgig,4,2011-09-28,a must stop when you are in montreal!\n\nI was...,0,0,0
19,WYDFJOBOl7cycd7gN-c_xA,u0LXt3Uea_GidxRW1xcsfg,zgQHtqX0gqMw1nlBZl2VnQ,1,2012-10-30,really excited to hear of this restaurant comi...,9,2,1
27,_ku1sDwkmQo2wIgWAaluZw,u0LXt3Uea_GidxRW1xcsfg,tOhRQqiupLyJdBJVQMGOEQ,5,2012-09-23,OMG - Definitely worth going if you are in Mon...,1,1,1
28,Enuk_DJbK0JPmgbFU8ePKw,u0LXt3Uea_GidxRW1xcsfg,N93EYZy9R0sdlEvubu94ig,3,2012-09-23,"Not sure what the hype is, but decided to give...",0,0,0
...,...,...,...,...,...,...,...,...,...
99956,arC-bzIYpm_jIHMNLBdkDA,CKRfBUqQGaVCYTKN5kDrzw,jsuUmIEefPjV__ads62Z5w,5,2015-08-04,Fabulousness in a dish! I had the huevos ranch...,0,0,0
99973,djYGcXgdX5JmG1jb0-iKeg,CKRfBUqQGaVCYTKN5kDrzw,tCSlpwJQ4CZsUEMZeH2SFg,5,2010-12-03,"This place has it all--great food, wonderful a...",1,0,1
99974,9Dv5PsbLMrIQkZANsX5Zmw,CKRfBUqQGaVCYTKN5kDrzw,mC39IrCp36QIVFRZIw9PTQ,5,2017-10-13,"Excellent fusion bbq, we tried the pork belly ...",0,0,0
99981,1tOnOzSVuP2NdXJWqlV8vQ,CKRfBUqQGaVCYTKN5kDrzw,s8OLoPfOpB0FbK5frI3CkQ,3,2015-08-30,Love the hot bacon and tomato salad with finge...,0,0,0


#Amazon-Book


Сomprises a vast corpus of user reviews, ratings, timestamps, and product metadata gathered from Amazon.com. For our experiments, we select the largest category available, namely Book.


In [42]:
amazon_book = pd.read_csv('/content/drive/MyDrive/Datasets/Books_rating.csv', nrows = 500000) #original dataset contains 3000000 rows
amazon_book.head(5)

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


In [30]:
len(amazon_book)

3000000

In [43]:
amazon_book.isna().sum()

Id                         0
Title                      7
Price                 419901
User_id                96346
profileName            96358
review/helpfulness         0
review/score               0
review/time                0
review/summary             4
review/text                1
dtype: int64

In [44]:
# we drop columns because they're not relevant for the recommendation task
amazon_book.drop(['Id', 'Price', 'review/helpfulness', 'review/summary', 'review/text'], axis=1, inplace=True)

We use the 10-core setting to ensure that all users and items have at least 10 interactions

In [45]:
min_interactions = 10
user_counts = amazon_book['User_id'].value_counts()
item_counts = amazon_book['Title'].value_counts()

# Filter users and items with at least 10 interactions
amazon_book = amazon_book[amazon_book['User_id'].isin(user_counts[user_counts >= min_interactions].index)]
amazon_book = amazon_book[amazon_book['Title'].isin(item_counts[item_counts >= min_interactions].index)]

In [46]:
# Reset Index
amazon_book = amazon_book.reset_index(drop=True)

In [47]:
len(amazon_book)

44894

In [48]:
amazon_book

Unnamed: 0,Title,User_id,profileName,review/score,review/time
0,Eyewitness Travel Guide to Europe,A281NPSIMI1C2R,"Rebecca of Amazon ""The Rebecca Review""",5.0,1023235200
1,Eyewitness Travel Guide to Europe,A2TAPL67U2A5HM,Bjorn Viberg,5.0,1111190400
2,Eyewitness Travel Guide to Europe,AT9YSY20RJUDX,"M. A. ZAIDI ""Ali Zaidi""",4.0,1033689600
3,Voices from the Farm: Adventures in Community ...,A1ER5AYS3FQ9O3,"K. Corn ""reviewer""",5.0,1160870400
4,Tess and the Highlander,A2VCGJLKGK2WJJ,Rebecca Herman,5.0,1035244800
...,...,...,...,...,...
44889,Atlas Shrugged,AFYYHRPSFBLWS,Ash Ryan,5.0,1264896000
44890,Atlas Shrugged,A8F2AZWB20X1H,JLind555,5.0,958953600
44891,Atlas Shrugged,A3ODVVP9XO42L,Steven R. Travers,5.0,1080950400
44892,Atlas Shrugged,A30ONBQ70SMRAQ,khettrich,5.0,993254400
