In [None]:
import json
import csv

## Functions

In [None]:
def load_json(filename):
    '''
    Read a Json file and hand back its decoded content
    :param filename : path of the Json file to read
    :return the object decoded from the Json file
    '''
    with open(filename) as json_file:
        return json.load(json_file)

def dump_json(filename,obj_to_dump):
    '''
    To dump (mainly) dictionaries to Json for further processing
    :param filename : filename to save the jsonfile (overwritten if it already exists)
    :param obj_to_dump : Json-serialisable object (mainly a dictionary) to write
    '''

    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(filename, 'w') as fp:
        json.dump(obj_to_dump, fp)
    

def get_title(string):
    '''
    Pull the title portion out of a ': '-separated string
    The string is split on ': '. With more than four pieces, the pieces from the
    second one up to (not including) the second-to-last are re-joined with ': ';
    otherwise only the second piece is used. The result is stripped of
    surrounding whitespace.
    :param string : ': '-separated text (exact input format is not visible here - TODO confirm with caller)
    :return the stripped title portion
    :raises IndexError when the string contains no ': ' separator
    '''
    pieces = string.split(': ')
    if len(pieces) <= 4:
        return pieces[1].strip()
    title_pieces = pieces[1:-2]
    return ': '.join(title_pieces).strip()
    

In [None]:
def _titles_match(amazon_titles, movielens_titles, exact):
    '''
    Check whether at least one amazon title matches at least one movielens title
    :param amazon_titles : preprocessed amazon titles of one asin
    :param movielens_titles : preprocessed movielens titles of one movielens id
    :param exact : True to compare with '==', False to test whether the movielens title is 'in' the amazon title
    :return True as soon as one pair matches, False when no pair matches
    '''
    if exact:
        return any(azn_aka == ml_aka for azn_aka in amazon_titles for ml_aka in movielens_titles)
    return any(ml_aka in azn_aka for azn_aka in amazon_titles for ml_aka in movielens_titles)


def mapping_amazon_movielens(success_filename,not_success_filename,preprocessed_movielens,preprocessed_amazon,movielens_dict,amazon_dict,type='exact',match_list='main'):
    '''
    Function for mapping amazon <-> movielens movies
    Every movielens entry is compared against every amazon entry. The matches and the
    unmatched movielens entries are written as tsv files under 'Data/Mapping_files/TSV/'
    and as json files under 'Data/Mapping_files/JSON/' (the function does not create
    these directories).
    :param success_filename : filename of the tsv file for the matches; the part before its first '.' names the json file
    :param not_success_filename : filename of the tsv file for the unmatched movielens entries; the part before its first '.' names the json file
    :param preprocessed_movielens : dictionary of preprocessed movielens title (movielens id -> dictionary holding 'title_list' / 'secondary_aka')
    :param preprocessed_amazon : dictionary of preprocessed amazon title (amazon asin -> dictionary holding 'title_list')
    :param movielens_dict : dictionary of movielens id with original title
    :param amazon_dict : dictionary of amazon asin with original title
    :param type : 'exact' or 'contain'. exact if the match is using exact match ('='), contain if the match using partly match ('in'); any value other than 'exact' is handled as 'contain'
    :param match_list : 'main' to match on the movielens 'title_list' field, any other value to match on 'secondary_aka'
    :return tuple (success_mapped_movielens_amazon, movielens_not_in_amazon) :
            first dictionary maps a movielens id to its original title, the titles used for matching and the
            list of matched (asin, amazon title, amazon preprocessed titles) tuples;
            second dictionary maps each unmatched movielens id to its original title and the titles used for matching
    '''
    success_mapped_movielens_amazon = {}
    movielens_not_in_amazon = {}

    use_main_list = match_list == 'main'
    match_list_title = 'movielens_preprocessed_title' if use_main_list else 'movielens_secondary_aka'
    movielens_field = 'title_list' if use_main_list else 'secondary_aka'
    exact = type == 'exact'
    total = len(preprocessed_movielens)

    # newline='' hands line-ending control to the csv module; without it, platforms that
    # translate '\n' end up with '\r\r\n' row terminators (blank lines between rows).
    with open('Data/Mapping_files/TSV/'+success_filename, 'wt', newline='') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow(['movielens_id', 'movielens_title',match_list_title, 'amazon_asin', 'amazon_title','amazon_preprocessed_title'])

        for i,movielens_key in enumerate(preprocessed_movielens):
            if i%1000 == 0:
                print(str(i) + ' data matched from '+ str(total))

            preprocessed_movielens_title = preprocessed_movielens[movielens_key][movielens_field]
            movielens_title = movielens_dict[movielens_key]
            there_is_match = False

            for amazon_key, amazon_entry in preprocessed_amazon.items():
                amazon_titles = amazon_entry['title_list']
                if not _titles_match(amazon_titles, preprocessed_movielens_title, exact):
                    continue

                there_is_match = True
                amazon_title = amazon_dict[amazon_key]

                #write to dictionary (the entry is created on the first match of this movielens id)
                mapped_entry = success_mapped_movielens_amazon.setdefault(
                    movielens_key,
                    {'movielens_title': movielens_title,
                     match_list_title: preprocessed_movielens_title,
                     'matched': []})
                mapped_entry['matched'].append((amazon_key, amazon_title, amazon_titles))

                #write to tsv file
                tsv_writer.writerow([movielens_key, movielens_title, preprocessed_movielens_title, amazon_key, amazon_title, amazon_titles])

            if not there_is_match:
                movielens_not_in_amazon[movielens_key] = {'movielens_title': movielens_title, match_list_title:preprocessed_movielens_title}

    dump_json('Data/Mapping_files/JSON/' + success_filename.split('.')[0] + '.json',success_mapped_movielens_amazon)
    dump_json('Data/Mapping_files/JSON/' + not_success_filename.split('.')[0] + '.json',movielens_not_in_amazon)
    print('Number of mapped movielens ID: ' + str(len(success_mapped_movielens_amazon)))
    print('Number of not mapped ml ID: ' + str(len(movielens_not_in_amazon)))

    #Write the not_mapped movielens_id
    with open('Data/Mapping_files/TSV/'+not_success_filename, 'wt', newline='') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow(['movielens_id', 'movielens_title',match_list_title])
        for key,value in movielens_not_in_amazon.items():
            tsv_writer.writerow([key, value['movielens_title'],value[match_list_title]])

    return success_mapped_movielens_amazon, movielens_not_in_amazon

## Load the files and parameters needed for Matching

In [None]:
# Inputs for the matching step; each file is read once with load_json.
# Preprocessed movielens titles (later passed to mapping_amazon_movielens as preprocessed_movielens)
preprocessed_movielens_movies = load_json('Data/Offline_Files/preprocessed_movielens_movies_4_10.json')
# Original movielens titles (later passed as movielens_dict)
movielens_movies = load_json('Data/Offline_Files/movielens_movies.json')
# Preprocessed amazon titles (later passed as preprocessed_amazon)
preprocessed_amazon_movie_id =  load_json('Data/Offline_Files/preprocessed_amazon_movie_id_4_10.json')
# Original amazon titles (later passed as amazon_dict)
amazon_movie_id = load_json('Data/Offline_Files/amazon_movie_id_2.json')

### Match amazon data to movielens 
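Creates a TSV file (rows grouped by MovieLens ID, in the order the IDs appear in the input) with the columns: MovieLens ID, MovieLens title, preprocessed MovieLens title, Amazon ASIN, Amazon title, preprocessed Amazon title.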

In [None]:
# Switch: False runs a new matching pass, True reloads the json files saved by an earlier pass
offline_file = False

if not offline_file:
    success_mapped_movielens_amazon,not_mapped_movielens = mapping_amazon_movielens(
        'mapped_ml_azn_4_10.tsv',
        'not_mapped_ml_4_10.tsv',
        preprocessed_movielens_movies,
        preprocessed_amazon_movie_id,
        movielens_movies,
        amazon_movie_id,
        type='exact',
    )
else:
    success_mapped_movielens_amazon = load_json('Data/Mapping_files/JSON/mapped_ml_azn_4_10.json')
    not_mapped_movielens = load_json('Data/Mapping_files/JSON/not_mapped_ml_4_10.json')

In [None]:
#Printing general summary of mapping
print('Number of mapped movielens data : ' + str(len(success_mapped_movielens_amazon)))

# Collect the ASIN (first element of every 'matched' entry) in a single pass;
# re-creating the list with '+' for every movielens id copies it over and over.
matched_asin = [
    matched_entry[0]
    for mapped_entry in success_mapped_movielens_amazon.values()
    for matched_entry in mapped_entry['matched']
]

print('Number of times ASIN matched : ' + str(len(matched_asin)))

print('Number of unique ASIN matched: ' + str(len(set(matched_asin))))
    

### Secondary match (for further match, not used right now)

In [None]:
#Getting not mapped preprocessed movielens and preprocessed amazon
preprocessed_movielens_notmapped = {}
for key in preprocessed_movielens_movies:
    if key not in success_mapped_movielens_amazon:
        preprocessed_movielens_notmapped[key] = preprocessed_movielens_movies[key]
dump_json('Data/Offline_Files/preprocessed_movielens_notmapped_4_10.json',preprocessed_movielens_notmapped)
        
        
preprocessed_amazon_movie_notmapped = {}
for key in preprocessed_amazon_movie_id:
    if key not in matched_asin:
        preprocessed_amazon_movie_notmapped[key]=preprocessed_amazon_movie_id[key]
dump_json('Data/Offline_Files/preprocessed_amazon_movie_notmapped_4_10.json',preprocessed_amazon_movie_notmapped)

In [None]:
#Do the secondary mapping with the previously unmapped movielens and amazon entries

In [None]:
# Second pass: match the leftovers of the first pass on the movielens 'secondary_aka' titles
secondary_mapped,secondary_not_mapped_ml = mapping_amazon_movielens(
    'second_mapped_ml_azn_4_8.tsv',
    'second_not_mapped_ml_4_8.tsv',
    preprocessed_movielens_notmapped,
    preprocessed_amazon_movie_notmapped,
    movielens_movies,
    amazon_movie_id,
    type='exact',
    match_list='secondary',
)

A movie that is still not mapped after the secondary match is not available on Amazon (not even as part of a collection).