# Mapping movielens to Amazon

In [180]:
import csv
import pandas as pd
import json
import re
import inflect

In [181]:
# Shared inflect engine; convert_latin_to_roman_numeral calls its number_to_words() to spell out digits.
generator = inflect.engine()

In [182]:
def load_json(filename):
    '''
    Read a JSON file and hand back the decoded object.
    :param filename : path of the JSON file to read
    :return whatever json.load produces for the file content
    '''
    with open(filename) as json_file:
        return json.load(json_file)

def dump_json(filename,obj_to_dump):
    '''
    To dump (mainly) dictionaries to Json for further processing
    :param filename : filename to save the jsonfile (opened in 'w' mode, so an existing file is overwritten)
    :param obj_to_dump : JSON-serialisable object to write
    '''

    # The context manager closes the file on exit, so no explicit close() is needed.
    with open(filename, 'w') as fp:
        json.dump(obj_to_dump, fp)
    

def get_title(string):
    '''
    Pull the title part out of a ': '-separated line.
    The line is split on ': '. With more than four segments, everything from the
    second segment up to (but excluding) the last two is re-joined with ': ';
    otherwise only the second segment is used. The result is whitespace-stripped.
    Raises IndexError when the line contains no ': ' at all.
    '''
    segments = string.split(': ')
    if len(segments) <= 4:
        return segments[1].strip()
    middle = segments[1:-2]
    return ': '.join(middle).strip()
    
    

## Loading the data and variables needed for further processing

In [183]:
# ASIN -> title string; values are later tested against 'N/A' and against a 'var aPageStart' prefix.
amazon_movie_id = load_json('Data/Offline_Files/amazon_movie_id_2.json')
# key -> MovieLens title string (consumed by preprocessing_movielens_title_3).
movielens_movies  = load_json('Data/Offline_Files/movielens_movies.json')
# key -> dict read with the keys 'imdb_primaryTitle', 'imdb_originalTitle' and 'imdb_aka'.
movielens_imdb_mapping_title_aka = load_json('Data/Offline_Files/movielens_imdb_mapping_title_aka.json')
# Not referenced in the visible part of this notebook.
amazon_unique_meta = load_json('Data/Offline_Files/amazon_unique_meta.json')

In [184]:
# Some entries hold a script dump (value starting with 'var aPageStart') instead of a title.
# Original note: there are 4552 such titles (not correctly extracted in the meta_tv_movies file).
# For those entries the title is recovered from the line starting with 'Amazon.com:'.
amazon_htmlscript_title = {}  # ASIN -> {'asin', 'title'} snapshot taken before the value is overwritten
count = 0
for item in amazon_movie_id:
    if amazon_movie_id[item].startswith('var aPageStart'):
        # NOTE(review): every script-form entry is counted here, even when a title is
        # recovered below -- confirm this matches "does not have title" in the printed message.
        count = count + 1
        for line in (amazon_movie_id[item].split('\n')):
            if line.startswith('Amazon.com:'):
                # Keep the value as it is at this moment, then replace it with the parsed title.
                # If a second 'Amazon.com:' line exists, the snapshot will hold the already-parsed title.
                amazon_htmlscript_title[item] = {'asin' : item, 'title': amazon_movie_id[item]}
                amazon_movie_id[item] = get_title(line)
    if amazon_movie_id[item] == 'N/A':
        count = count + 1
# Report the count and its share of all ASINs as a percentage.
print('Total Amazon ASIN which does not have title = ' + str(count) + ' which is ' + str(count/len(amazon_movie_id)*100) +'%')

Total Amazon ASIN which does not have title = 270 which is 0.14831660651604292%


In [185]:
# Extract the aka lists from IMDb.
# main_imdb_aka: IMDb primary and original titles (lowercased, stripped, 'n/a' dropped, de-duplicated).
# secondary_imdb_aka: every entry of the 'imdb_aka' list (lowercased, stripped, 'n/a' dropped,
# de-duplicated); it is not filtered against the primary/original titles here.
main_imdb_aka = {}
secondary_imdb_aka = {}
for key in movielens_imdb_mapping_title_aka:
    imdb_primary_title = movielens_imdb_mapping_title_aka[key]['imdb_primaryTitle'].lower().strip()
    imdb_original_title = movielens_imdb_mapping_title_aka[key]['imdb_originalTitle'].lower().strip()

    # Keep each title that is actually present. The previous version appended the primary
    # title on both iterations, so the original title was never stored and an 'n/a'
    # primary title could slip in whenever the original title was present.
    list_main_aka_temp = [
        candidate
        for candidate in (imdb_primary_title, imdb_original_title)
        if candidate != 'n/a'
    ]

    # dict.fromkeys de-duplicates while preserving first-seen order.
    main_imdb_aka[key] = list(dict.fromkeys(list_main_aka_temp))
    secondary_imdb_aka[key] = list(dict.fromkeys([x.lower().strip() for x in movielens_imdb_mapping_title_aka[key]['imdb_aka'] if x.lower().strip()!='n/a']))

## Text preprocessing functions

In [186]:
def remove_ends_with(title_text):
    '''
    Remove a trailing format marker such as 'vhs', '[vhs]', ': widescreen edition',
    'dvd', '[dvd]', '(dvd)' or 'unrated version' from the end of the title.
    The markers are tried in list order on the progressively shortened title, so
    several stacked markers can be removed in one call.
    :param title_text : title to clean (callers pass it lowercased)
    :return the title without the trailing marker(s), whitespace-stripped when something was removed
    '''
    list_ends_with = ['vhs','[vhs]',': widescreen edition','dvd','[dvd]','(dvd)','unrated version']
    for word in list_ends_with:
        if title_text.endswith(word):
            # Cut only the trailing occurrence; str.replace would also delete the
            # same text wherever else it appears in the title.
            title_text = title_text[:-len(word)].strip()
    return title_text

def remove_sqr_bracket_content(title_text):
    '''
    Drop square-bracketed text from the title and strip surrounding whitespace.
    The pattern's character class excludes ')' (not ']'), so one match runs greedily
    from a '[' to the last ']' reachable without crossing a ')'.
    '''
    without_brackets = re.sub(r'\[([^)]+)\]', '', title_text)
    return without_brackets.strip()

def rearrange_article(title_text):
    '''
    Re-arrange the placement of article such as 'alchemist, the' to become 'the alchemist'.
    Only a trailing ', <article>' is moved to the front; the same text elsewhere in
    the title is left untouched. "l'" is re-attached without a space.
    :param title_text : title to rearrange (callers pass it lowercased)
    :return the title with the trailing article moved to the front, or unchanged
    '''
    articles = ["the", "a", "an", "la", "le","les", "il","el", "l'"]

    for article in articles:
        suffix = ', ' + article
        if title_text.endswith(suffix):
            # Slice off just the trailing suffix. str.replace would also remove e.g.
            # every ', the' inside 'good, the bad and the ugly, the'.
            remainder = title_text[:-len(suffix)].strip()
            separator = '' if article == "l'" else ' '
            title_text = article + separator + remainder
    return title_text

def replace_and_symbol(title_text):
    '''
    Spell out ampersands: every '&' in the title becomes 'and'.
    '''
    return title_text.replace('&','and')

def remove_symbols(title_text):
    '''
    Delete every ',', '?', '!' and '.' from the title, strip surrounding whitespace,
    and then drop a single leading apostrophe if one is left at the front.
    '''
    unwanted = (",", "?", "!", ".")
    kept = ''.join(char for char in title_text if char not in unwanted).strip()

    # Only the first character is removed; nothing is re-stripped afterwards.
    return kept[1:] if kept.startswith("'") else kept

def remove_multi_spaces(title_text):
    '''
    Collapse every run of whitespace into one space and trim both ends.
    '''
    words = title_text.split()
    return ' '.join(words)

def rearrange_colon(title_text):
    '''
    Normalise how ':' is written: no space before it and exactly one space after it.
    A title without any colon is returned untouched.
    '''
    if ':' not in title_text:
        return title_text

    segments = title_text.split(':')
    return ': '.join(segment.strip() for segment in segments)

def convert_latin_to_roman_numeral(title_text):
    '''
    Find all numbers written in digits and build two alternative titles:
    one with numbers below 20 replaced by roman numerals, and one with every
    number spelled out in words.
    (An earlier definition with the same name was shadowed by this one and has
    been dropped.)
    :param title_text : title to convert (callers pass it lowercased)
    :return [title_with_roman_numerals, title_with_number_words], or [] when the
            title contains no digits
    '''
    number_pattern = r'\d+'
    if re.search(number_pattern, title_text) is None:
        return []

    # Each run of digits is replaced as a whole token. Replacing str(number) as a
    # substring corrupted longer numbers containing a shorter one, e.g.
    # 'part 3 ... 13' turned into 'part iii ... 1iii'.
    def as_roman(match):
        value = int(match.group())
        # Numbers of 20 and above are left as digits in the roman variant.
        return str(int_to_Roman(value)) if value < 20 else match.group()

    def as_words(match):
        return generator.number_to_words(int(match.group()))

    to_roman = re.sub(number_pattern, as_roman, title_text)
    if to_roman != title_text:
        # Kept from the previous version: the whole title is lowercased whenever
        # a roman numeral was substituted.
        to_roman = to_roman.lower()
    to_word = re.sub(number_pattern, as_words, title_text)

    return [to_roman, to_word]
    

def get_integer_to_roman(text,res): # will be deleted
    '''
    For every number in res that is below 20, replace its decimal form in text
    with the roman numeral, then return the lowercased text.
    '''
    small_numbers = [number for number in res if number<20]
    for number in small_numbers:
        text = text.replace(str(number),str(int_to_Roman(number)))
    return text.lower()

def int_to_Roman(num):
    '''
    Build the (uppercase) roman numeral for num by greedily taking the largest
    value/symbol pairs first, including the subtractive pairs (CM, CD, XC, XL, IX, IV).
    '''
    numerals = (
        (1000, 'M'), (900, 'CM'), (500, 'D'), (400, 'CD'),
        (100, 'C'), (90, 'XC'), (50, 'L'), (40, 'XL'),
        (10, 'X'), (9, 'IX'), (5, 'V'), (4, 'IV'), (1, 'I'),
    )
    pieces = []
    for value, symbol in numerals:
        repeats = int(num / value)
        pieces.append(symbol * repeats)
        num -= value * repeats
    return ''.join(pieces)


def remove_year_in_parentheses(title_text):
    '''
    Strip every four-digit year written as '(1990)' from the title (movielens title),
    then lowercase the result and trim surrounding whitespace.
    '''
    without_year = re.sub(r'\([0-9][0-9][0-9][0-9]\)','',title_text)
    return without_year.lower().strip()

def extract_aka_from_title(title_text):
    '''
    Movielens title can contain aka, most of the cases written in parentheses.
    When the title ends with ')', every parenthesised group is collected as an
    alternative title (a leading 'aka ', 'a.k.a ' or 'a.k.a. ' is dropped) and all
    parenthesised groups are removed from the main title.
    :param title_text : title to split
    :return (title_without_parentheses, list_of_alternative_titles); the title is
            returned unchanged with an empty list when it does not end with ')'
    '''
    starts_with = ['aka ', 'a.k.a ', 'a.k.a. ']
    title_aka = []
    if title_text.endswith(')'):
        title_aka = [x.strip() for x in re.findall(r'\(([^)]+)\)',title_text)]
        for j, aka in enumerate(title_aka):
            for start in starts_with:
                if aka.startswith(start):
                    # Cut the prefix only. str.replace would also delete the same
                    # characters inside the name ('aka shaka zulu' -> 'shzulu').
                    aka = aka[len(start):].strip()
            title_aka[j] = aka

        # Keep only the text outside the parentheses as the main title.
        title_text = re.sub(r'\(([^)]+)\)','',title_text).strip()

    return title_text,title_aka

def rearrange_colon_movielens(title_text):
    '''
    Write ':' uniformly (no space before, one space after) and derive two extra
    akas from the normalised title: one with ': ' turned into ' - ' and one with
    ': ' turned into a single space.
    Returns (title, akas); akas is empty when the title has no colon.
    '''
    if ':' not in title_text:
        return title_text,[]

    normalised = ': '.join(part.strip() for part in title_text.split(':'))
    with_dash = normalised.replace(': ', ' - ')
    with_space = normalised.replace(': ', ' ')
    return normalised,[with_dash, with_space]

def remove_begins_with(title_text):
    '''
    When a multi-word title starts with 'a', 'the' or 'an', return a one-item list
    holding the title without that first word; otherwise return an empty list.
    '''
    words = title_text.split()
    if len(words)>1 and words[0] in ('a', 'the', 'an'):
        return [' '.join(words[1:])]
    return []

def remove_colon_hypen(title_text):
    '''
    Replace ':' and then '-' with a single space, trimming the whitespace around
    each separated piece. A title containing neither is returned untouched.
    '''
    for separator in (':', '-'):
        if separator in title_text:
            pieces = title_text.split(separator)
            title_text = ' '.join(piece.strip() for piece in pieces)
    return title_text

def convert_abbreviation(title_text):
    '''
    Expand the abbreviation 'dr.' to 'doctor'.
    Returns a one-item list with the expanded title, or an empty list when the
    title does not contain 'dr.'.
    '''
    if 'dr.' not in title_text:
        return []
    return [title_text.replace('dr.','doctor')]

def get_partial(title_text):
    '''
    Split a title of the form '<text ending in a digit>: <rest>' into its two parts
    so each part can be used as an alternative title.
    :param title_text : title to inspect
    :return the non-empty parts around the colon when the title contains exactly one
            ':' and the text before it ends with a digit; otherwise an empty list
    '''
    if ':' not in title_text:
        return []

    temp_text = [x.strip() for x in title_text.split(':')]
    # `and` short-circuits, and the emptiness check protects the [-1] lookup:
    # a title starting with ':' used to raise IndexError here.
    if len(temp_text)==2 and temp_text[0] and temp_text[0][-1].isdigit():
        return [x for x in temp_text if x!='']

    return []
    

## Movielens Title Preprocessing Function

In [187]:
def preprocessing_movielens_title_3(dictionary_movielens_data):
    '''
    Build the normalised title variants for every MovieLens movie.
    :param dictionary_movielens_data : dict mapping a movie key to its MovieLens title
    :return dict mapping each key to
            {'title_list': de-duplicated, space-free title variants,
             'secondary_aka': normalised IMDb aka list (spaces kept)}
    The lists stored in the module-level secondary_imdb_aka are normalised in place,
    and the returned 'secondary_aka' is that same list object.
    '''

    def basic_clean(text):
        # article to the front, '&' -> 'and', drop symbols, collapse whitespace
        text = rearrange_article(text)
        text = replace_and_symbol(text)
        text = remove_symbols(text)
        return remove_multi_spaces(text)

    preprocessed_movielens_dictionary = {}
    for key in dictionary_movielens_data:
        # drop '(yyyy)', lowercase, and split off the akas written in parentheses
        without_year = remove_year_in_parentheses(dictionary_movielens_data[key])
        main_title, inline_aka = extract_aka_from_title(without_year)

        # only the main IMDb titles are added here, to avoid unnecessary matching
        candidates = [main_title] + inline_aka + main_imdb_aka.get(key,[])

        variants = []
        derived = []
        for candidate in candidates:
            cleaned = basic_clean(candidate)

            # the parts around a single colon are taken before the colon is removed
            derived.extend(get_partial(cleaned))

            flattened = remove_colon_hypen(cleaned)
            variants.append(flattened)

            derived.extend(convert_latin_to_roman_numeral(flattened))

            # NOTE(review): remove_symbols has already removed every '.', so the 'dr.'
            # lookup inside convert_abbreviation cannot match at this point.
            derived.extend(convert_abbreviation(flattened))

        variants = list(dict.fromkeys(variants + derived))

        # extra variants without a leading 'a' / 'the' / 'an'
        without_article = []
        for variant in variants:
            without_article.extend(remove_begins_with(variant))
        variants = list(dict.fromkeys(variants + without_article))

        compact_variants = [variant.replace(' ','') for variant in variants]

        # secondary IMDb akas get the same basic cleaning but keep their spaces
        secondary_imdb_aka_list = secondary_imdb_aka.get(key,[])
        for position, aka in enumerate(secondary_imdb_aka_list):
            cleaned_aka = basic_clean(aka.lower().strip())
            secondary_imdb_aka_list[position] = remove_colon_hypen(cleaned_aka)

        preprocessed_movielens_dictionary[key] = {
            'title_list': list(dict.fromkeys(compact_variants)),
            'secondary_aka': secondary_imdb_aka_list,
        }

    return preprocessed_movielens_dictionary

## Amazon Title Preprocessing Function

In [192]:
def preprocessing_amazon_title_3(dictionary_amazon_data):
    '''
    Build the normalised title variants for every Amazon item.
    :param dictionary_amazon_data : dict mapping a key (ASIN) to its title string
    :return dict mapping each key to
            {'title_list': de-duplicated, space-free title variants,
             'collection': 'y' when the lowercased title contains '/' or 'collection', else 'n'}
    '''
    preprocessed_amazon_dictionary = {}
    for key in dictionary_amazon_data:
        lowered_title = dictionary_amazon_data[key].lower().strip()

        # '/' or 'collection' in the title probably marks a set of movies
        looks_like_set = ('/' in lowered_title) or ('collection' in lowered_title)
        collection_flag = 'y' if looks_like_set else 'n'

        # trailing 'vhs' / 'dvd' / edition markers first, then any [...] content,
        # which usually does not matter for title matching
        cleaned = remove_ends_with(lowered_title)
        cleaned = remove_sqr_bracket_content(cleaned)
        # article to the front, '&' -> 'and', drop symbols, collapse whitespace
        cleaned = rearrange_article(cleaned)
        cleaned = replace_and_symbol(cleaned)
        cleaned = remove_symbols(cleaned)
        cleaned = remove_multi_spaces(cleaned)
        cleaned = remove_colon_hypen(cleaned)

        variants = [cleaned]
        variants += convert_latin_to_roman_numeral(cleaned)
        # NOTE(review): remove_symbols has already removed every '.', so the 'dr.'
        # lookup inside convert_abbreviation cannot match at this point.
        variants += convert_abbreviation(cleaned)
        variants = list(dict.fromkeys(variants))

        # every variant once more with its parenthesised parts removed
        without_parentheses = [
            remove_multi_spaces(re.sub(r'\(([^)]+)\)','',variant).strip())
            for variant in variants
        ]
        variants = list(dict.fromkeys(variants + without_parentheses))

        compact_variants = [variant.replace(' ','') for variant in variants]

        # unique list of title and aka
        preprocessed_amazon_dictionary[key] = {
            'title_list': list(dict.fromkeys(compact_variants)),
            'collection': collection_flag,
        }
    return preprocessed_amazon_dictionary

## Run Preprocessing

In [193]:
# Normalise every Amazon title held in amazon_movie_id into its list of match variants.
preprocessed_amazon_movie_id = preprocessing_amazon_title_3(amazon_movie_id)

In [194]:
# Save the preprocessed Amazon titles as JSON for the later steps.
dump_json('Data/Offline_Files/preprocessed_amazon_movie_id_4_10.json',preprocessed_amazon_movie_id)

In [195]:
# Normalise every MovieLens title; this also reads main_imdb_aka and secondary_imdb_aka built above.
preprocessed_movielens_movies = preprocessing_movielens_title_3(movielens_movies)

In [196]:
# Save the preprocessed MovieLens titles as JSON for the later steps.
dump_json('Data/Offline_Files/preprocessed_movielens_movies_4_10.json',preprocessed_movielens_movies)