In [1]:
import pandas as pd
import numpy as np
import random
seed = 123
random.seed(seed)

## Load and process ML20M

In [2]:
ml20m_data_path = '../data/ml-20m/'

ml_20m_movies = pd.read_csv(ml20m_data_path + 'movies.csv')
ml_20m_tag = pd.read_csv(ml20m_data_path + 'tags.csv')

## Create Movie Name to Tag df
- That way it is usable by ml1m

In [3]:
movieid2tag = {}
for movieID, tag in zip(ml_20m_tag['movieId'], ml_20m_tag['tag']):
    if movieID not in movieid2tag:
        movieid2tag[movieID] = set()
    movieid2tag[movieID].add(tag)
movieid2moviename = {movieID: moviename for movieID, moviename in zip(ml_20m_movies['movieId'], ml_20m_movies['title'])}

In [4]:
total_tags = len(set(ml_20m_tag['tag']))
total_tagged_movies = len(set(ml_20m_tag['movieId']))
total_movies = len(set(ml_20m_movies['movieId']))
print('Total tags:', total_tags, 'Total tagged movies:', total_tagged_movies, 'Total movies:', total_movies)

Total tags: 38644 Total tagged movies: 19545 Total movies: 27278


### Movieid2tag -> Movie2tag

In [5]:
movie_list = []
tag_list_of_list = []
for movieID, tags in movieid2tag.items():
    movie_list.extend([movieid2moviename[movieID]] * len(tags))
    tag_list_of_list.extend(list(tags))

tag_df = pd.DataFrame({'title': movie_list, 'tag': tag_list_of_list})

## Save this Movie to Tag data for further processing.

In [6]:
#save the data
ml1m_data_path = '../data/ml-1m/'
tag_df.to_csv(ml20m_data_path + 'all_tag.csv', index=False)
tag_df.to_csv(ml1m_data_path + 'all_tag.csv', index=False)


## all_tag.csv -> movieid2tags only for ml1m

### Load and process ml-1m

In [7]:
ml_1m_movies = pd.read_csv(ml1m_data_path + 'movies.dat', sep='::', header=None, engine='python', names=['movieId', 'title', 'genres'], encoding='latin-1')
movie_title2idx = {}
for movieID, title in zip(ml_1m_movies['movieId'], ml_1m_movies['title']):
    movie_title2idx[title] = movieID

movieid_list = []
tag_list = []
all_tags = pd.read_csv(ml1m_data_path + 'all_tag.csv')
for title, tag in zip(all_tags['title'], all_tags['tag']):
    if title in movie_title2idx:
        movieid_list.append(movie_title2idx[title])
        tag_list.append(tag)

pd.DataFrame({'movieId': movieid_list, 'tag': tag_list}).to_csv(ml1m_data_path + 'tags.csv', index=False)

## Prepare the tag data for the training data
- tag2id.csv
- tag2movie.csv

In [21]:
tags_df = pd.read_csv(ml1m_data_path + 'tags.csv')
movieid2idx_df = pd.read_csv(ml1m_data_path + 'movie2id.csv')
movieid2idx = {movie: idx for movie, idx in zip(movieid2idx_df['original_movieId'], movieid2idx_df['movieId'])}

In [22]:
def get_tag2movieid(tags_df):
    tag2movieid = {}
    for movieid, tag in zip(tags_df.movieId, tags_df.tag):
        tag = str(tag).lower()
        if movieid in movieid2idx:
            if tag in tag2movieid:
                tag2movieid[tag].add(movieid2idx[movieid])
            else:
                tag2movieid[tag] = {movieid2idx[movieid]}
        else:
            print('Movie ID not found:', movieid)
    return tag2movieid

In [23]:
tag2movieid = get_tag2movieid(tags_df)
tag_vocab = tag2movieid.keys()
tag2id = {tag: i for i, tag in enumerate(tag_vocab)}
tagid2movieid = {tag2id[tag]: list(movieid_set) for tag, movieid_set in tag2movieid.items()}

Movie ID not found: 636
Movie ID not found: 636
Movie ID not found: 636
Movie ID not found: 1140
Movie ID not found: 1140
Movie ID not found: 3278
Movie ID not found: 3278
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 2270
Movie ID not found: 3226
Movie ID not found: 2601
Movie ID not found: 395
Movie ID not found: 395
Movie ID not found: 629
Movie ID not found: 1765
Movie ID not found: 3369
Movie ID not found: 983
Movie ID not found: 983
Movie ID not found: 1628
Movie ID not found: 683
Movie ID 

In [24]:
tag2movieid

{'dennis hopper': {201, 264, 495, 776, 794, 1094, 1466, 1739, 2264, 3017},
 'sf': {22, 209, 217, 322, 538, 662, 714, 724, 743, 776, 951},
 'steampunk': {738, 776, 929, 1014},
 'very boring': {776},
 'post apocalyptic': {132,
  211,
  220,
  359,
  666,
  776,
  967,
  996,
  1014,
  1053,
  1601,
  1930,
  2714},
 'post apocalypse': {776},
 'better than everybody thinks': {126, 776},
 'dark hero': {132, 317, 375, 587, 647, 743, 776},
 'x-mas movie': {693, 776},
 'post-apocalyptic': {132,
  211,
  220,
  467,
  528,
  666,
  748,
  759,
  776,
  853,
  856,
  967,
  996,
  1014,
  1053,
  1930,
  2714,
  2775,
  3144},
 'water': {204, 220, 277, 559, 719, 776, 1712, 1868, 2032, 2732},
 'extended edition': {776},
 'watch': {41,
  59,
  130,
  145,
  174,
  187,
  322,
  328,
  336,
  383,
  448,
  531,
  661,
  668,
  729,
  776,
  1088,
  1202,
  2222},
 'clv': {2,
  7,
  8,
  9,
  11,
  12,
  14,
  17,
  19,
  22,
  26,
  27,
  37,
  39,
  40,
  44,
  46,
  49,
  54,
  55,
  56,
  57,
 

In [None]:
def dict2df(_dict, key_name='key', value_name='value'):
    key_list = []
    value_list = []
    for key, value in _dict.items():
        key_list.extend([key] * len(value))
        value_list.extend(list(value))
    return pd.DataFrame({key_name: key_list, value_name: value_list})

In [None]:
tagid2movieid_df = dict2df(tagid2movieid, key_name='tagId', value_name='movieId')
tag2id_df = pd.DataFrame({'tag': list(tag2id.keys()), 'tagId': list(tag2id.values())})
tag2id_df.to_csv(ml1m_data_path + 'tag2id.csv', index=False)
tagid2movieid_df.to_csv(ml1m_data_path + 'tag2movie.csv', index=False)