In [1]:
from os.path import (
    abspath,
    dirname,
    exists,
    join,
)
from pandas import (
    read_csv,
    merge,
    concat,
    DataFrame,
)
from sklearn.model_selection import train_test_split

In [2]:
PWD = (abspath(''))
datadir = join(PWD, '../Datasets/ml-1m/')
files = {}
files['rating'] = join(datadir, 'ratings.dat')
files['movie'] = join(datadir, 'movies.dat')
THRESHOLD_MIN = 200
train_size = .8

In [3]:
dfs = {}
for file in files:
    dfs[file] = read_csv(
        files[file],
        engine='python',
        encoding='latin1',
        sep='::',
        header=None,
    )

In [4]:
dfs['rating'].columns = ['userId', 'movieId', 'rating', 'timestamp']
dfs['rating'].head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
dfs['movie'].columns = ['movieId', 'title', 'genres']
dfs['movie'].head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
df = merge(
    left=dfs['rating'],
    right=dfs['movie'],
    left_on=[
        'movieId',
    ],
    right_on=[
        'movieId',
    ],
)

In [7]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


In [8]:
df = df.drop(
    [
        'movieId',
        'timestamp',
        'genres',
    ],
    axis=1,
)

In [9]:
df.head()

Unnamed: 0,userId,rating,title
0,1,5,One Flew Over the Cuckoo's Nest (1975)
1,2,5,One Flew Over the Cuckoo's Nest (1975)
2,12,4,One Flew Over the Cuckoo's Nest (1975)
3,15,4,One Flew Over the Cuckoo's Nest (1975)
4,17,5,One Flew Over the Cuckoo's Nest (1975)


In [10]:
df.head()

Unnamed: 0,userId,rating,title
0,1,5,One Flew Over the Cuckoo's Nest (1975)
1,2,5,One Flew Over the Cuckoo's Nest (1975)
2,12,4,One Flew Over the Cuckoo's Nest (1975)
3,15,4,One Flew Over the Cuckoo's Nest (1975)
4,17,5,One Flew Over the Cuckoo's Nest (1975)


In [11]:
df.shape

(1000209, 3)

In [12]:
dfs['movie_group'] = df.groupby(
    by=['title'],
    as_index=False,
)[['userId']].agg(lambda x: len(x))

In [13]:
dfs['movie_group'].head()

Unnamed: 0,title,userId
0,"$1,000,000 Duck (1971)",37
1,'Night Mother (1986),70
2,'Til There Was You (1997),52
3,"'burbs, The (1989)",303
4,...And Justice for All (1979),199


In [14]:
dfs['movies'] = dfs['movie_group'][dfs['movie_group']['userId'] >= THRESHOLD_MIN]
dfs['movies'] = dfs['movies'].reset_index(drop=True)

In [15]:
dfs['movies'].head()

Unnamed: 0,title,userId
0,"'burbs, The (1989)",303
1,10 Things I Hate About You (1999),700
2,101 Dalmatians (1961),565
3,101 Dalmatians (1996),364
4,12 Angry Men (1957),616


In [16]:
dfs['movies'].shape

(1426, 2)

In [17]:
movies = dfs['movies']['title'].to_numpy()

In [18]:
dfs['pivot'] = df[df['title'].isin(movies)]
dfs['pivot'] = dfs['pivot'].reset_index(drop=True)
dfs['pivot'] = dfs['pivot'].pivot_table(
    index=['title'],
    columns=['userId'],
    values='rating',
)

In [19]:
dfs['pivot'].head()

userId,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",,,,,,,,,,4.0,...,,,,,,,,,,
10 Things I Hate About You (1999),,,,,,,,,,,...,,,,,,2.0,,,,
101 Dalmatians (1961),,,,,,,,,,,...,,4.0,,,,4.0,,,,
101 Dalmatians (1996),,,,,,,,,,,...,,,,,1.0,,,,,
12 Angry Men (1957),,,,,,,,,,3.0,...,,,,,,,4.0,,,5.0


In [20]:
dfs['train'] = DataFrame()
dfs['test'] = DataFrame()
users = dfs['pivot'].columns.tolist()

In [21]:
tot_train = 0
tot_test = 0
for user in users[:]:
        cur = dfs['pivot'][user]
        cur = cur.dropna()
        cur = DataFrame({
            'title': cur.index,
            'rating': cur.values,
        })
        cur['userId'] = user
        #print(cur.head())
        n = cur.shape[0]*train_size
        n = int(n)
        train_data = cur[:n]
        test_data = cur[n:]
        tot_train += n
        tot_test += cur.shape[0]-n
        dfs['train'] = concat([dfs['train'], train_data], ignore_index=True)
        dfs['test'] = concat([dfs['test'], test_data], ignore_index=True)

In [22]:
leftover = dfs['test'][~dfs['test']['title'].isin(dfs['train']['title'])]
dfs['test'] = dfs['test'][dfs['test']['title'].isin(dfs['train']['title'])]
dfs['train'] = concat([dfs['train'], leftover], ignore_index=True)
dfs['train'] = dfs['train'].reset_index(drop=True)
dfs['test'] = dfs['test'].reset_index(drop=True)

In [23]:
dfs['train'].head()

Unnamed: 0,title,rating,userId
0,Airplane! (1980),4.0,1
1,Aladdin (1992),4.0,1
2,Antz (1998),4.0,1
3,Apollo 13 (1995),5.0,1
4,Awakenings (1990),5.0,1


In [24]:
dfs['test'].head()

Unnamed: 0,title,rating,userId
0,Snow White and the Seven Dwarfs (1937),4.0,1
1,"Sound of Music, The (1965)",5.0,1
2,Star Wars: Episode IV - A New Hope (1977),4.0,1
3,Tarzan (1999),3.0,1
4,Titanic (1997),4.0,1


In [25]:
dfs['train'].shape, dfs['test'].shape

((755865, 3), (99865, 3))

In [26]:
tot_train, tot_test

(682184, 173546)

In [27]:
dfs['test'][~dfs['test']['userId'].isin(dfs['train']['userId'])]

Unnamed: 0,title,rating,userId


In [28]:
dfs['train'][~dfs['train']['userId'].isin(dfs['test']['userId'])]

Unnamed: 0,title,rating,userId
88927,"20,000 Leagues Under the Sea (1954)",3.0,835
88928,"Blob, The (1958)",2.0,835
88929,"Day the Earth Stood Still, The (1951)",5.0,835
88930,"Fly, The (1958)",3.0,835
88931,Forbidden Planet (1956),4.0,835
...,...,...,...
752831,Tequila Sunrise (1988),4.0,5793
752832,Thunderball (1965),5.0,5793
752833,Touch of Evil (1958),5.0,5793
752834,When Harry Met Sally... (1989),3.0,5793


In [29]:
dfs['train'].to_csv(join(datadir, 'train.csv'), index=False)

In [30]:
dfs['test'].to_csv(join(datadir, 'test.csv'), index=False)