In [186]:
import pandas as pd
import numpy as np

'''
There are >95000 actors, >4000 directors and >5000 tags.
One-hot encoding is not practical at this scale, so actors and directors are not used as features.

Tags are aggregated with the pre-trained Google News Word2Vec model instead.

'''


# for each tag, average the embeddings of its words,
# multiply that average by the tag's weight,
# then take the mean of the resulting vectors over all of the movie's tags
def get_avg_embedding_for_movie(row):
    """Return the averaged, weight-scaled Word2Vec embedding of a movie's tags.

    For every tag row of ``row['movieID']`` the embeddings of the tag's
    in-vocabulary words are averaged and multiplied by that tag's
    ``tagWeight``; the per-tag vectors are then averaged into one vector.

    Reads two notebook-level globals: ``groupped`` (the tag table grouped by
    ``movieID``) and ``w`` (the loaded word vectors, exposing ``.vocab``).

    Args:
        row: Mapping-like object with a ``'movieID'`` entry (e.g. one row of
            ``movies_meta`` when used with ``DataFrame.apply(axis=1)``).

    Returns:
        numpy.ndarray: the averaged tag vector, or a 300-dimensional zero
        vector when the movie has no tag group or none of its tag words are
        in the vocabulary.
    """
    result = []
    try:
        for _, group in groupped.get_group(row['movieID']).iterrows():
            text = np.array([w[word] for word in group['value'].split() if word in w.vocab])
            if text.size > 0:
                result.append(text.mean(axis=0) * group['tagWeight'])
    except KeyError:
        # get_group raises KeyError for movies that have no tags at all
        return np.zeros(300)
    if not result:
        # The movie has tags, but every tag word is out of vocabulary.
        # np.mean([]) would yield a scalar NaN (plus a RuntimeWarning) and
        # break the fixed-width embedding columns, so fall back to zeros.
        return np.zeros(300)
    return np.mean(result, axis=0)


# Data locations: ratings come from ml-latest-small, movie metadata from the
# hetrec2011-movielens-2k-v2 directory.
datapath = '../../data/'
hetrec = datapath + 'hetrec2011-movielens-2k-v2/'
user_item_matrix = pd.read_csv(
    datapath + 'ml-latest-small/ratings.csv',
    usecols=[0, 1, 2],
).rename(columns={'movieId': 'movieID'})
movies = pd.read_csv(datapath + 'ml-latest-small/movies.csv', usecols=[0, 1])

# get rt metadata
movies_meta = (
    pd.read_csv(hetrec + 'movies.dat', sep='\t', encoding='raw_unicode_escape')
    .rename(columns={'id': 'movieID'})
    .drop(
        ['spanishTitle', 'imdbID', 'title', 'imdbPictureURL', 'rtID', 'rtAllCriticsNumReviews', 'rtTopCriticsNumReviews', 'rtPictureURL'],
        axis=1,
    )
)
# keep only the movies that actually appear in the ratings
movies_meta = movies_meta[movies_meta['movieID'].isin(user_item_matrix['movieID'].unique())]

# get the average tags embeddings
movie_tags = pd.read_csv(hetrec + 'movie_tags.dat', sep='\t')
tags = pd.read_csv(hetrec + 'tags.dat', sep='\t', encoding='raw_unicode_escape').rename(columns={'id': 'tagID'})
movie_tags = (
    pd.merge(movie_tags, tags, on='tagID')
    .sort_values(by=['movieID', 'tagID'])
    .drop(['tagID'], axis=1)
)
# `groupped` is read as a global by get_avg_embedding_for_movie
groupped = movie_tags.groupby('movieID')
embedded_tags = pd.DataFrame.from_records(
    movies_meta.apply(get_avg_embedding_for_movie, axis=1),
    columns=[f'w2v_{i}' for i in range(1, 301)],
)
# One embedding row is produced per metadata row, in the same order.
assert len(embedded_tags) == len(movies_meta)
# movies_meta still carries the gappy index left by the row filter above,
# while embedded_tags has a fresh 0..n-1 index. concat(axis=1) aligns on index
# labels, so without resetting the index the embeddings would be attached to
# the wrong movies (and unmatched labels would produce NaN rows).
movies_meta = pd.concat([movies_meta.reset_index(drop=True), embedded_tags], axis=1)

user_item_matrix = pd.merge(user_item_matrix, movies_meta, on='movieID').sort_values(by=['userId', 'movieID'])

# get a one-hot-encode-esque matrix of genres, then join on them
movie_genres = pd.read_csv(hetrec + 'movie_genres.dat', sep='\t').pivot_table(
    index=['movieID'],
    columns=['genre'],
    aggfunc=[len],
    fill_value=0,
)
# aggfunc was given as a list, which adds an outer column level; drop it
movie_genres.columns = movie_genres.columns.droplevel(0)
movie_genres = movie_genres.reset_index()
user_item_matrix = pd.merge(user_item_matrix, movie_genres, on='movieID')

# get a one-hot-encode matrix of countries, then join on them
movie_countries = pd.get_dummies(pd.read_csv(hetrec + 'movie_countries.dat', sep='\t'))
user_item_matrix = pd.merge(user_item_matrix, movie_countries, on='movieID').sort_values(by=['userId', 'movieID'])
user_item_matrix.head()


Unnamed: 0,userId,movieID,rating,year,rtAllCriticsRating,rtAllCriticsNumFresh,rtAllCriticsNumRotten,rtAllCriticsScore,rtTopCriticsRating,rtTopCriticsNumFresh,...,country_Taiwan,country_Thailand,country_Tunisia,country_Turkey,country_UK,country_USA,country_Venezuela,country_Vietnam,country_West Germany,country_Yugoslavia
0,1,1,4.0,1995,9.0,73,0,100,8.5,17,...,0,0,0,0,0,1,0,0,0,0
215,1,3,4.0,1993,5.9,24,12,66,7.0,5,...,0,0,0,0,0,1,0,0,0,0
267,1,6,4.0,1995,7.7,50,8,86,7.2,14,...,0,0,0,0,0,1,0,0,0,0
369,1,47,5.0,1954,9.2,49,0,100,8.6,10,...,0,0,0,0,0,0,0,0,0,0
572,1,50,5.0,1995,7.5,41,6,87,6.9,12,...,0,0,0,0,0,1,0,0,0,0


In [220]:
# Show the tag-embedding frame (columns w2v_1 .. w2v_300).
embedded_tags

Unnamed: 0,w2v_1,w2v_2,w2v_3,w2v_4,w2v_5,w2v_6,w2v_7,w2v_8,w2v_9,w2v_10,...,w2v_291,w2v_292,w2v_293,w2v_294,w2v_295,w2v_296,w2v_297,w2v_298,w2v_299,w2v_300
0,0.146633,0.034206,0.023564,0.379975,-0.0568,0.03737,0.09982,-0.142083,0.062995,0.079154,...,-0.42808,0.298986,-0.227825,-0.051575,0.01331,-0.135927,0.023487,-0.110933,-0.033686,-0.043559
1,-0.001071,0.205297,-0.092585,0.206977,0.120545,-0.08957,-0.014487,-0.038442,0.108484,0.103163,...,-0.205724,0.245217,-0.046846,0.029018,0.035965,-0.008839,-0.062912,0.002716,0.062721,0.09147
2,0.093338,0.031871,0.06076,0.023626,0.037938,0.0012,-0.098043,-0.084717,0.066945,0.095866,...,-0.031238,0.073242,-0.065247,-0.009633,0.006348,-0.107625,-0.077189,-0.067759,-0.026549,0.06987
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.061117,-0.022502,-0.225342,0.215394,-0.019953,0.080668,0.054207,-0.241842,0.179227,0.064585,...,-0.156729,0.016235,-0.275055,-0.07257,0.16217,0.133728,-0.202593,-0.000427,0.149015,0.108437


In [1]:
from gensim import models

# Load the binary GoogleNews word2vec vectors into `w`, the global that
# get_avg_embedding_for_movie reads. NOTE(review): the path is specific to one
# machine and must be adjusted before running elsewhere.
w = models.KeyedVectors.load_word2vec_format(
    '/media/thejdxfh/Windows/Users/volok/Desktop/GoogleNews-vectors-negative300.bin',
    binary=True,
)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [223]:
# user_item_matrix.shape
def give_test(obj):
    """Return a random tenth (rounded down) of the rows of ``obj`` as test rows.

    Index labels are drawn WITHOUT replacement. The default of
    ``np.random.choice`` is sampling with replacement, which would put the
    same rating into the test set several times and leave fewer distinct
    held-out rows than intended.
    """
    picked = np.random.choice(obj.index, len(obj.index) // 10, replace=False)
    return obj.loc[picked, :]


# Per-user hold-out: sample test rows inside each user's group, then drop the
# group level that apply() adds so the original row labels remain.
test_data = user_item_matrix.groupby('userId', as_index=False).apply(give_test).reset_index(level=0, drop=True)
# Everything whose row label was not sampled for the test set is training data.
train_data = user_item_matrix[~user_item_matrix.index.isin(test_data.index)]
train_data.head()

In [247]:
def get_avg_embedding_for_movie(row):
    """Debug variant: weight-scaled average Word2Vec embedding of a movie's tags.

    Prints the movie id before processing and the resulting vector before
    returning it. Reads the notebook globals ``groupped`` (tag table grouped
    by ``movieID``) and ``w`` (word vectors exposing ``.vocab``).

    Args:
        row: Mapping-like object with a ``'movieID'`` entry.

    Returns:
        numpy.ndarray: mean over tags of (mean word vector * ``tagWeight``),
        or ``np.zeros(300)`` when the lookup raises ``KeyError`` or no tag
        contributes a vector.
    """
    weighted_vectors = []
    try:
        print(row['movieID'])
        movie_rows = groupped.get_group(row['movieID'])
        for _, tag_row in movie_rows.iterrows():
            known_vectors = [w[token] for token in tag_row['value'].split() if token in w.vocab]
            token_matrix = np.array(known_vectors)
            # tags made up only of out-of-vocabulary words contribute nothing
            if token_matrix.size > 0:
                weighted_vectors.append(token_matrix.mean(axis=0) * tag_row['tagWeight'])
    except KeyError:
        return np.zeros(300)
    if not weighted_vectors:
        return np.zeros(300)
    movie_vector = np.mean(weighted_vectors, axis=0)
    print(movie_vector)
    return movie_vector


# def process_data(device, batch_size):
    

# Data locations: ratings come from ml-latest-small, movie metadata from the
# hetrec2011-movielens-2k-v2 directory.
datapath = '../data/'
hetrec = datapath + 'hetrec2011-movielens-2k-v2/'
user_item_matrix = pd.read_csv(
    datapath + 'ml-latest-small/ratings.csv',
    usecols=[0, 1, 2],
).rename(columns={'movieId': 'movieID'})
movies = pd.read_csv(datapath + 'ml-latest-small/movies.csv', usecols=[0, 1])

# get rt metadata
movies_meta = (
    pd.read_csv(hetrec + 'movies.dat', sep='\t', encoding='raw_unicode_escape')
    .rename(columns={'id': 'movieID'})
    .drop(
        ['spanishTitle', 'imdbID', 'title', 'imdbPictureURL', 'rtID', 'rtAllCriticsNumReviews', 'rtTopCriticsNumReviews', 'rtPictureURL'],
        axis=1,
    )
)
# keep only the movies that actually appear in the ratings
movies_meta = movies_meta[movies_meta['movieID'].isin(user_item_matrix['movieID'].unique())]

# get the average tags embeddings
movie_tags = pd.read_csv(hetrec + 'movie_tags.dat', sep='\t')
tags = pd.read_csv(hetrec + 'tags.dat', sep='\t', encoding='raw_unicode_escape').rename(columns={'id': 'tagID'})
movie_tags = (
    pd.merge(movie_tags, tags, on='tagID')
    .sort_values(by=['movieID', 'tagID'])
    .drop(['tagID'], axis=1)
)
# `groupped` is read as a global by get_avg_embedding_for_movie
groupped = movie_tags.groupby('movieID')

# only the tail of movies_meta is embedded in this cell
embedded_tags = pd.DataFrame.from_records(
    movies_meta.tail().apply(get_avg_embedding_for_movie, axis=1),
    columns=[f'w2v_{i}' for i in range(1, 301)],
)
# embedded_tags
#     movies_meta = pd.concat([movies_meta, embedded_tags], axis=1)

#     user_item_matrix = pd.merge(user_item_matrix, movies_meta, on='movieID').sort_values(by=['userId', 'movieID'])

#     # get a one-hot-encode-esque matrix of genres, then join on them
#     movie_genres = pd.read_csv(hetrec + 'movie_genres.dat', sep='\t').pivot_table(index=['movieID'], columns=['genre'], aggfunc=[len], fill_value=0)
#     movie_genres.columns = movie_genres.columns.droplevel(0)
#     movie_genres = movie_genres.reset_index()
#     user_item_matrix = pd.merge(user_item_matrix, movie_genres, on='movieID')

#     # get a one-hot-encode matrix of countries, then join on them
#     movie_countries = pd.get_dummies(pd.read_csv(hetrec + 'movie_countries.dat', sep='\t'))
#     user_item_matrix = pd.merge(user_item_matrix, movie_countries, on='movieID').sort_values(by=['userId', 'movieID'])

#     user_item_matrix.to_csv('user_item_matrix.tsv', sep='\t', index=False)

#     give_test = lambda obj: obj.loc[np.random.choice(obj.index, len(obj.index) // 10), :]
#     test_data = user_item_matrix.groupby('userId', as_index=False).apply(give_test).reset_index(level=0, drop=True)
#     train_data = user_item_matrix[~user_item_matrix.index.isin(test_data.index)]

#     # user item stats
#     all_data = user_item_matrix
#     num_user = len(all_data['userId'].unique()) + 1
#     num_item = len(all_data['movieID'].unique()) + 1

#     # convert input to torch tensors
#     train_tensors = [torch.tensor(columnData.values, device=device) for _, columnData in train_data.iteritems()]
#     test_tensors = [torch.tensor(columnData.values, device=device) for _, columnData in test_data.iteritems()]

#     # convert tensors to dataloader
#     train_dataset = data.TensorDataset(*train_tensors)
#     train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#     test_dataset = data.TensorDataset(*test_tensors)
#     test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
#     print('train_samples:{} \t test_samples:{} \t num_user:{} \t num_item:{}'.format(train_data.shape[0], test_data.shape[0], num_user, num_item))
#     return train_loader, test_loader, num_user, num_item

65133


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.