In [12]:
from datetime import datetime
import os
import numpy as np
import pandas as pd
from als import ALSModel
from utils import validation

np.random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Reference for the data-preparation code below: https://github.com/storieswithsiva/Movie-Recommendation-Netflix/blob/master/Uploading%20Movie%20Recommendation%20Netflix.ipynb

In [4]:
# Merge the four raw Netflix rating files into one flat CSV (movie,user,rating,date).
# The merge is skipped when data.csv already exists, so re-running this cell is cheap.
start = datetime.now()
data_path = '../data/archive/data.csv'
if not os.path.isfile(data_path):
    # All four sources live next to data.csv under ../data/archive/.
    files = ['../data/archive/combined_data_1.txt', '../data/archive/combined_data_2.txt',
             '../data/archive/combined_data_3.txt', '../data/archive/combined_data_4.txt']

    # Write to a temporary file first and rename it at the end: an interrupted run
    # must not leave a truncated data.csv behind, because the isfile() guard above
    # would then treat it as complete.
    partial_path = data_path + '.part'
    movie_id = None
    with open(partial_path, mode='w') as data:
        for file in files:
            print("Reading ratings from {}...".format(file))
            with open(file) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # Blank lines carry no rating; skip them instead of emitting a bad row.
                        continue
                    if line.endswith(':'):
                        # All below are ratings for this movie, until another movie appears.
                        movie_id = line.replace(':', '')
                    else:
                        if movie_id is None:
                            raise ValueError(
                                "Rating line found before any movie header in {}".format(file))
                        # Prefix the rating line with the movie it belongs to.
                        data.write(movie_id + ',' + line)
                        data.write('\n')
            print("Done.\n")
    os.replace(partial_path, data_path)
print('Time taken :', datetime.now() - start)

Time taken : 0:00:00.000407


In [5]:
# Load the merged ratings file (it has no header row) and parse the date column.
print("creating the dataframe from data.csv file..")
rating_columns = ['movie', 'user', 'rating', 'date']
df = pd.read_csv('../data/archive/data.csv', sep=',', names=rating_columns)
df['date'] = pd.to_datetime(df['date'])
print('Done.\n')

creating the dataframe from data.csv file..
Done.



In [6]:
# Preview the first rows of the full ratings frame.
df.head()

Unnamed: 0,movie,user,rating,date
0,1,1488844,3,2005-09-06
1,1,822109,5,2005-05-13
2,1,885013,4,2005-10-19
3,1,30878,4,2005-12-26
4,1,823519,3,2004-05-03


In [7]:
# List the column labels of df.
df.columns

Index(['movie', 'user', 'rating', 'date'], dtype='object')

In [8]:
# Store each row's index label as an explicit 'id' column so individual ratings can
# still be identified after grouping/sampling (the train split later excludes rows by id).
df['id'] = df.index
df

Unnamed: 0,movie,user,rating,date,id
0,1,1488844,3,2005-09-06,0
1,1,822109,5,2005-05-13,1
2,1,885013,4,2005-10-19,2
3,1,30878,4,2005-12-26,3
4,1,823519,3,2004-05-03,4
...,...,...,...,...,...
100480502,17770,1790158,4,2005-11-01,100480502
100480503,17770,1608708,3,2005-07-19,100480503
100480504,17770,234275,1,2004-08-07,100480504
100480505,17770,255278,4,2004-05-28,100480505


In [17]:
# Headline counts for the ratings frame: rows, distinct users, distinct movies.
n_ratings = df.shape[0]
n_users = len(np.unique(df.user))
n_movies = len(np.unique(df.movie))

print("Total data ")
print("-"*50)
print("\nTotal no of ratings :", n_ratings)
print("Total No of Users   :", n_users)
print("Total No of movies  :", n_movies)

Total data 
--------------------------------------------------

Total no of ratings : 100480507
Total No of Users   : 480189
Total No of movies  : 17770


In [9]:
# Drop every user with fewer than MIN_RATINGS_PER_USER ratings, then report how many users remain.
MIN_RATINGS_PER_USER = 20


def _has_enough_ratings(user_ratings):
    """Return True when this user's group of rows is large enough to keep."""
    return len(user_ratings) >= MIN_RATINGS_PER_USER


df = df.groupby("user").filter(_has_enough_ratings)
len(np.unique(df.user))

429584

In [10]:
# Build the validation split: hold out 10 randomly chosen ratings per user.
# The remaining rows become the training split in a later cell.
def _hold_out_ten(user_ratings):
    """Sample 10 rows from one user's ratings with a fixed random_state."""
    return user_ratings.sample(n=10, random_state=42)


valid = df.groupby('user').apply(_hold_out_ten)

In [11]:
# Replace the index left by the per-user groupby/apply with a fresh 0..n-1 index;
# drop=True discards the old index instead of turning it into columns.
valid = valid.reset_index(drop=True)
valid

Unnamed: 0,movie,user,rating,date,id
0,13050,6,3,2004-09-25,72076688
1,14302,6,4,2005-08-29,78803703
2,6221,6,3,2005-08-29,34655050
3,6196,6,4,2004-09-27,34367193
4,14203,6,3,2005-12-04,78100336
...,...,...,...,...,...
4295835,16792,2649429,5,2002-06-06,94625498
4295836,12047,2649429,5,2002-05-16,65688430
4295837,14550,2649429,5,2004-10-15,80892132
4295838,607,2649429,5,2005-06-27,3243822


In [12]:
# Training split = every rating whose id was not held out for validation.
held_out = df['id'].isin(valid['id'])
train = df[~held_out]
train

Unnamed: 0,movie,user,rating,date,id
0,1,1488844,3,2005-09-06,0
1,1,822109,5,2005-05-13,1
2,1,885013,4,2005-10-19,2
3,1,30878,4,2005-12-26,3
4,1,823519,3,2004-05-03,4
...,...,...,...,...,...
100480502,17770,1790158,4,2005-11-01,100480502
100480503,17770,1608708,3,2005-07-19,100480503
100480504,17770,234275,1,2004-08-07,100480504
100480505,17770,255278,4,2004-05-28,100480505


In [13]:
# Persist both splits as header-less CSVs without the index column.
for frame, csv_path in ((train, '../data/archive/train.csv'),
                        (valid, '../data/archive/valid.csv')):
    frame.to_csv(csv_path, index=False, header=False)

In [37]:
# Reload the saved splits; the files have no header row, so column names are supplied here.
split_columns = ['movie', 'user', 'rating', 'date', 'id']
train = pd.read_csv('../data/archive/train.csv', names=split_columns)
valid = pd.read_csv('../data/archive/valid.csv', names=split_columns)

In [41]:
# Preview the validation split as reloaded from valid.csv.
valid.head()

Unnamed: 0,movie,user,rating,date,id
0,13050,6,3,2004-09-25,72076688
1,14302,6,4,2005-08-29,78803703
2,6221,6,3,2005-08-29,34655050
3,6196,6,4,2004-09-27,34367193
4,14203,6,3,2005-12-04,78100336


In [None]:
# This is way too large to do calculations on. Let's reduce to 1000 users.

In [43]:
# Pick a fixed-size random subset of users (without replacement) to keep the
# downstream matrices small.
# A dedicated, locally seeded generator is used instead of the global NumPy state so
# the chosen users do not depend on which cells consumed random numbers beforehand.
N_SAMPLED_USERS = 1000
USER_SAMPLE_SEED = 42
user_rng = np.random.RandomState(USER_SAMPLE_SEED)
users = user_rng.choice(valid['user'].unique(), size=N_SAMPLED_USERS, replace=False)

In [44]:
# Restrict both splits to the sampled users and save them as header-less CSVs.
train_subset = train.loc[train['user'].isin(users)]
train_subset.to_csv('../data/archive/train_1000.csv', index=False, header=False)

valid_subset = valid.loc[valid['user'].isin(users)]
valid_subset.to_csv('../data/archive/valid_1000.csv', index=False, header=False)

In [3]:
# Load the reduced train/validation files (no header row, so names are given explicitly).
subset_columns = ['movie', 'user', 'rating', 'date', 'id']
train_data, valid_data = [
    pd.read_csv(csv_path, names=subset_columns)
    for csv_path in ('../data/archive/train_1000.csv', '../data/archive/valid_1000.csv')
]

In [4]:
# Display the validation ratings loaded from valid_1000.csv.
valid_data

Unnamed: 0,movie,user,rating,date,id
0,30,471,3,2004-08-03,133131
1,2016,471,4,2004-10-01,10450215
2,12799,471,2,2005-08-05,70615047
3,8904,471,5,2003-09-30,49464581
4,4345,471,5,2004-06-29,23085613
...,...,...,...,...,...
9995,17215,2646104,5,2005-08-16,97344722
9996,9756,2646104,2,2005-08-16,53560703
9997,9614,2646104,5,2005-08-16,52798269
9998,11064,2646104,4,2005-08-16,60205753


In [10]:
# Create watch matrix: one row per user, one column per movie, holding 1 where
# train_data has a rating (> 0) for that user/movie pair and 0 otherwise.
# Missing pairs are filled with 0 by the pivot, and the comparison is done on the
# whole frame at once rather than through a per-element Python callback.
watch_matrix = (
    pd.pivot_table(data=train_data, index='user', columns='movie', values='rating', fill_value=0) > 0
).astype('int64')
watch_matrix

movie,1,3,4,5,6,8,11,16,17,18,...,17761,17762,17763,17764,17765,17766,17767,17768,17769,17770
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
471,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5380,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9561,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
10808,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18836,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2631082,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2637575,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2638267,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2644392,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Watch matrix validation: one row per user, one column per movie, holding 1 where
# valid_data has a rating (> 0) for that user/movie pair and 0 otherwise.
# Missing pairs are filled with 0 by the pivot, and the comparison is done on the
# whole frame at once rather than through a per-element Python callback.
watch_matrix_validation = (
    pd.pivot_table(data=valid_data, index='user', columns='movie', values='rating', fill_value=0) > 0
).astype('int64')
watch_matrix_validation

movie,3,8,17,18,26,28,30,33,35,46,...,17693,17703,17709,17714,17715,17723,17724,17740,17741,17764
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
471,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5380,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9561,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10808,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18836,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2631082,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2637575,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2638267,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2644392,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
def random_classification_noise(watch_matrix, p, rng=None):
    """
    Simulate noisy preferences by turning watched entries into "disliked" ones.

    Every entry equal to 1 is independently replaced by -1 with probability p;
    all other entries are returned unchanged. The input frame is not modified.

    watch_matrix: watch matrix of watched (1) or not (0)
    p: probability of disliking an interacted with item, must lie in [0, 1]
    rng: optional np.random.RandomState / np.random.Generator supplying the
         random draws; when omitted, NumPy's global random state is used, so
         np.random.seed() still controls the outcome

    Returns a new DataFrame with the same index and columns as watch_matrix.
    Raises ValueError when p is not within [0, 1].
    """
    if not 0 <= p <= 1:
        raise ValueError("p must be a probability in [0, 1], got {!r}".format(p))
    source = np.random if rng is None else rng

    watched = watch_matrix.to_numpy() == 1

    # Draw one uniform number per watched entry only; unwatched entries never flip.
    flip = np.zeros(watched.shape, dtype=bool)
    flip[watched] = source.random(int(watched.sum())) < p

    # mask() builds a new frame, so the caller's matrix stays untouched.
    return watch_matrix.mask(flip, -1)

In [13]:
# Generate preference matrix based on simulation for validation dataset.
# Start training for 20 epochs

In [14]:
# For each noise level p, simulate validation preferences and fit a fresh ALS model.
# p is handed to random_classification_noise as the probability of turning a
# watched (1) entry of the validation watch matrix into -1.
for p in [0.25, 0.5, 0.75]:
    true_pref_validation = random_classification_noise(watch_matrix_validation, p)
    # A new model is constructed from the training watch matrix on every iteration.
    model = ALSModel(watch_matrix.values)
    # NOTE(review): in the visible cells `train` is bound to a DataFrame and no callable
    # named `train` is imported (only `validation` comes from utils) — confirm which
    # training function is meant here; as written this call would not be callable.
    # NOTE(review): `model` and `true_pref_validation` are overwritten on each pass and
    # nothing is stored per p — confirm whether results should be collected.
    train(model, train_data, valid_data, true_pref_validation, num_epochs=20)


movie,3,8,17,18,26,28,30,33,35,46,...,17693,17703,17709,17714,17715,17723,17724,17740,17741,17764
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
471,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5380,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9561,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10808,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18836,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2631082,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2637575,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2638267,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2644392,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
