This notebook splits the data into training and testing set for each user such that 80% ratings are in training and 20% are for testing.

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [4]:
movies = pd.read_csv('../../dataset/ml-latest/movies.csv')
ratings = pd.read_csv('../../dataset/ml-latest/ratings.csv')
tags = pd.read_csv('../../dataset/ml-latest/tags.csv')
print('movies: ', movies.shape)
print('ratings: ', ratings.shape)
print('tags: ', tags.shape)

movies:  (58098, 3)
ratings:  (27753444, 4)
tags:  (1108997, 4)


In [5]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [7]:
tags.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,14,110,epic,1443148538
1,14,110,Medieval,1443148532
2,14,260,sci-fi,1442169410
3,14,260,space action,1442169421
4,14,318,imdb top 250,1442615195


In [8]:
df = pd.merge(ratings, movies, on='movieId' , how='left')
df = df.drop('title', axis=1)
df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,genres
0,1,307,3.5,1256677221,Drama
1,1,481,3.5,1256677456,Drama|Thriller
2,1,1091,1.5,1256677471,Comedy
3,1,1257,4.5,1256677460,Comedy|Romance
4,1,1449,4.5,1256677264,Comedy


In [9]:
df['genres'] = df['genres'].str.split('|')

In [10]:
df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,genres
0,1,307,3.5,1256677221,[Drama]
1,1,481,3.5,1256677456,"[Drama, Thriller]"
2,1,1091,1.5,1256677471,[Comedy]
3,1,1257,4.5,1256677460,"[Comedy, Romance]"
4,1,1449,4.5,1256677264,[Comedy]


In [11]:
tags['tag'] = tags['tag'].str.split('|')
tags.drop('timestamp', axis=1, inplace=True)

In [12]:
tags = tags.groupby(['userId','movieId'])['tag'].apply(lambda x: ','.join(x.astype(str))).reset_index()
tags.head(5)

Unnamed: 0,userId,movieId,tag
0,14,110,"['epic'],['Medieval']"
1,14,260,"['sci-fi'],['space action']"
2,14,318,"['imdb top 250'],['justice']"
3,14,480,['Dinosaurs']
4,14,593,['psychothriller']


In [13]:
df = pd.merge(df, tags, on=['userId','movieId'], how='left')

In [14]:
df.shape

(27753444, 6)

In [15]:
df['tag'] = df['tag'].apply(lambda d: d if isinstance(d, list) else [])
df['genres'] = df['genres'].apply(lambda d: d if isinstance(d, list) else [])

In [16]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
0,1,307,3.5,1256677221,[Drama],[]
1,1,481,3.5,1256677456,"[Drama, Thriller]",[]
2,1,1091,1.5,1256677471,[Comedy],[]
3,1,1257,4.5,1256677460,"[Comedy, Romance]",[]
4,1,1449,4.5,1256677264,[Comedy],[]


In [17]:

# train_data, test_data = train_test_split(df, test_size=0.2, stratify=df['userId'])

# Check the distribution of userId classes
class_counts = df['userId'].value_counts()

# Remove classes with only one instance
single_instance_classes = class_counts[class_counts == 1].index
df = df[~df['userId'].isin(single_instance_classes)]

# Split the data with adjusted test_size
train_data, test_data = train_test_split(df, test_size=0.2, stratify=df['userId'])


#### Split into train and test data

In [18]:
train_data = train_data.sort_values(['userId', 'movieId'])
train_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
0,1,307,3.5,1256677221,[Drama],[]
1,1,481,3.5,1256677456,"[Drama, Thriller]",[]
3,1,1257,4.5,1256677460,"[Comedy, Romance]",[]
4,1,1449,4.5,1256677264,[Comedy],[]
5,1,1590,2.5,1256677236,"[Horror, Sci-Fi, Thriller]",[]


In [19]:
test_data = test_data.sort_values(['userId','movieId'])
test_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
2,1,1091,1.5,1256677471,[Comedy],[]
10,1,2986,2.5,1256677496,"[Action, Crime, Sci-Fi, Thriller]",[]
13,1,3698,3.5,1256677243,"[Action, Sci-Fi]",[]
20,2,1244,3.0,1192913551,"[Comedy, Drama, Romance]",[]
27,2,2707,3.5,1192913600,[Thriller],[]


#### Save the dataframes as csv files

In [20]:
# train_data.to_csv('training_data.csv', index = False)
# test_data.to_csv('testing_data.csv', index = False)

## Pre-process the movie data

In [21]:
movies['genres'] = movies['genres'].str.split('|')
movies['genres'] = movies['genres'].apply(lambda d: d if isinstance(d, list) else [])
movies.to_csv('pro_movies.csv', index = False)

In [22]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]
