### Format and Tokenize movie information

In [1]:
import pandas as pd
import numpy as np
import ast
import random
import json

In [2]:
# Read the raw TMDB movie table once; `dataset` is the working frame and
# `dataset_bkp` is an independent copy that keeps every original column.
dataset_bkp = pd.read_csv('tmdb_5000_movies.csv')
dataset = dataset_bkp.copy()

Recommendations will be based on each movie's genres, keywords, and original language, so only those columns are kept.

In [3]:
# Columns that are not used as recommendation features.
unused_columns = [
    'id', 'budget', 'homepage', 'original_title', 'overview', 'release_date',
    'popularity', 'production_companies', 'revenue', 'runtime', 'tagline',
    'title', 'vote_average', 'vote_count', 'status', 'spoken_languages',
    'production_countries',
]
dataset = dataset.drop(columns=unused_columns)
dataset.head()

Unnamed: 0,genres,keywords,original_language
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en
2,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en
3,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en
4,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en


In [4]:
def unpack(row):
    """Return the lower-cased ``name`` of every entry encoded in ``row``.

    ``row`` is the string form of a list of dicts (each with a ``'name'``
    key); it is parsed with ``ast.literal_eval``.
    """
    return [entry['name'].lower() for entry in ast.literal_eval(row)]

In [5]:
# Decode the two list-of-dicts columns into plain lists of lower-cased names.
for column in ('genres', 'keywords'):
    dataset[column] = dataset[column].apply(unpack)

# Wrap the single language code in a list so all three columns share one shape.
dataset['original_language'] = dataset['original_language'].apply(lambda code: [code])
dataset.head()

Unnamed: 0,genres,keywords,original_language
0,"[action, adventure, fantasy, science fiction]","[culture clash, future, space war, space colon...",[en]
1,"[adventure, fantasy, action]","[ocean, drug abuse, exotic island, east india ...",[en]
2,"[action, adventure, crime]","[spy, based on novel, secret agent, sequel, mi...",[en]
3,"[action, crime, drama, thriller]","[dc comics, crime fighter, terrorist, secret i...",[en]
4,"[action, adventure, science fiction]","[based on novel, mars, medallion, space travel...",[en]


In [6]:
def _unique_in_order(column):
    """Collect the distinct values across a column of lists, in first-seen order."""
    return list(dict.fromkeys(value for row in column for value in row))


list_gen = _unique_in_order(dataset['genres'])
list_keywords = _unique_in_order(dataset['keywords'])
list_lang = _unique_in_order(dataset['original_language'])

print('No of unique genres ->', len(list_gen))
print('No of unique keywords ->', len(list_keywords))
print('No of unique languages ->', len(list_lang))

No of unique genres -> 20
No of unique keywords -> 9813
No of unique languages -> 37


In [7]:
# One shared vocabulary: genres first, then keywords, then languages.
# A value that occurs in more than one group keeps its first position.
org_values_list = list(dict.fromkeys(list_gen + list_keywords + list_lang))

# Token names are simply 'tok' followed by the value's position in the vocabulary.
token_to_value = {'tok' + str(position): value for position, value in enumerate(org_values_list)}
value_to_token = {value: token for token, value in token_to_value.items()}
all_tokens_list = list(token_to_value)

In [8]:
# Show the value -> token lookup; tokens are numbered by position in
# org_values_list (genres first, then keywords, then languages).
value_to_token

{'action': 'tok0',
 'adventure': 'tok1',
 'fantasy': 'tok2',
 'science fiction': 'tok3',
 'crime': 'tok4',
 'drama': 'tok5',
 'thriller': 'tok6',
 'animation': 'tok7',
 'family': 'tok8',
 'western': 'tok9',
 'comedy': 'tok10',
 'romance': 'tok11',
 'horror': 'tok12',
 'mystery': 'tok13',
 'history': 'tok14',
 'war': 'tok15',
 'music': 'tok16',
 'documentary': 'tok17',
 'foreign': 'tok18',
 'tv movie': 'tok19',
 'culture clash': 'tok20',
 'future': 'tok21',
 'space war': 'tok22',
 'space colony': 'tok23',
 'society': 'tok24',
 'space travel': 'tok25',
 'futuristic': 'tok26',
 'space': 'tok27',
 'alien': 'tok28',
 'tribe': 'tok29',
 'alien planet': 'tok30',
 'cgi': 'tok31',
 'marine': 'tok32',
 'soldier': 'tok33',
 'battle': 'tok34',
 'love affair': 'tok35',
 'anti war': 'tok36',
 'power relations': 'tok37',
 'mind and soul': 'tok38',
 '3d': 'tok39',
 'ocean': 'tok40',
 'drug abuse': 'tok41',
 'exotic island': 'tok42',
 'east india trading company': 'tok43',
 "love of one's life": 'tok44

In [9]:
def tokenize(row):
    """Map every value in ``row`` to its token via ``value_to_token``.

    Raises ``KeyError`` if a value has no entry in the lookup.
    """
    return [value_to_token[value] for value in row]

In [10]:
# Replace the readable values in every feature column with their tokens.
for column in ('genres', 'keywords', 'original_language'):
    dataset[column] = dataset[column].apply(tokenize)
dataset.head()

Unnamed: 0,genres,keywords,original_language
0,"[tok0, tok1, tok2, tok3]","[tok20, tok21, tok22, tok23, tok24, tok25, tok...",[tok9813]
1,"[tok1, tok2, tok0]","[tok40, tok41, tok42, tok43, tok44, tok45, tok...",[tok9813]
2,"[tok0, tok1, tok4]","[tok56, tok57, tok58, tok59, tok60, tok61, tok62]",[tok9813]
3,"[tok0, tok4, tok5, tok6]","[tok63, tok64, tok65, tok66, tok67, tok68, tok...",[tok9813]
4,"[tok0, tok1, tok3]","[tok57, tok84, tok85, tok25, tok86, tok28, tok...",[tok9813]


In [11]:
def _merge_token_columns(row):
    """Join one movie's genre, keyword and language tokens into a single array."""
    parts = (row['genres'], row['keywords'], row['original_language'])
    return np.concatenate(parts)


# From here on `dataset` is a Series holding one token array per movie.
dataset = dataset.apply(_merge_token_columns, axis=1)
dataset.head()

0    [tok0, tok1, tok2, tok3, tok20, tok21, tok22, ...
1    [tok1, tok2, tok0, tok40, tok41, tok42, tok43,...
2    [tok0, tok1, tok4, tok56, tok57, tok58, tok59,...
3    [tok0, tok4, tok5, tok6, tok63, tok64, tok65, ...
4    [tok0, tok1, tok3, tok57, tok84, tok85, tok25,...
dtype: object

In [12]:
# Store each movie's tokens as one space-separated string next to the
# original columns, then save the enriched table.
dataset_bkp['movie_desc'] = dataset.apply(lambda tokens: ' '.join(tokens))
dataset_bkp.to_csv('processed_movie_details.csv', index=False)

# Serialise both lookup directions so tokens can be decoded later.
token_to_value_json = json.dumps(token_to_value)
value_to_token_json = json.dumps(value_to_token)

# `with` closes each file even if the write raises, so no handle is leaked.
with open('token_to_value.json', 'w') as f:
    f.write(token_to_value_json)

with open('value_to_token.json', 'w') as f:
    f.write(value_to_token_json)

### Generating Synthetic data

Let's assume we use up to the last 10 movies in a user's history to represent their current taste.

In [13]:
# Seed Python's RNG so the synthetic histories are reproducible.
# `random.seed` must be *called*: assigning to it (`random.seed = 30`) only
# rebinds the name to an int and leaves the generator unseeded.
random.seed(30)

max_movies_per_person = 10           # upper bound on movies in one user's history
len_movie_list = dataset.shape[0]    # number of movies available to sample from
synthetic_dataset = []               # filled with one token list per synthetic user

In [14]:
# Build 15000 synthetic users. Each user gets between 1 and
# max_movies_per_person randomly chosen movies (repeats allowed), and their
# history is the concatenation of those movies' tokens.
for user_no in range(15000):
    movies_watched = random.randint(1, max_movies_per_person)
    history = []
    for pick_no in range(movies_watched):
        movie_idx = random.randint(0, len_movie_list - 1)
        history.extend(dataset[movie_idx])

    # Repeated tokens are kept as-is; no de-duplication is applied.
    synthetic_dataset.append(history)

In [15]:
# One row per synthetic user: the history tokens joined into a single string.
# `to_csv(header=...)` accepts a bool or a list of aliases; a bare string is
# only treated as truthy, so the unnamed Series' default label would be
# written instead of 'user_history'. Naming the Series gives the intended header.
user_histories = pd.Series(
    [' '.join(history) for history in synthetic_dataset],
    name='user_history',
)
user_histories.to_csv('user_data.csv', header=True, index=False)