# Extracting information

In [None]:
import pandas as pd
import csv
import json
import pickle

In [50]:
def dump_json(filename, obj_to_dump):
    '''
    To dump (mainly) dictionaries to Json for further processing.

    The file is created inside the fixed directory ``data/TSV_files/``
    (relative to the current working directory); ``open`` does not create
    that directory, so it has to exist already.

    :param filename : filename to save the jsonfile (inside ``data/TSV_files/``)
    :param obj_to_dump : JSON-serializable object to write
    '''

    # The with-block closes the file on exit, so no explicit close() is needed.
    with open('data/TSV_files/'+filename, 'w') as fp:
        json.dump(obj_to_dump, fp)
    
def load_json(filename):
    '''
    Read a Json file stored in ``data/TSV_files/`` and decode it.

    :param filename : name of the file to load (inside ``data/TSV_files/``)
    :return the object decoded from the Json file
    '''
    json_path = 'data/TSV_files/'+filename
    with open(json_path) as handle:
        return json.load(handle)

# Read MovieLens data

Extract the MovieLens movie IDs and titles and save them to a TSV file.

In [16]:
# Load the files into pandas dataframes, just for analysis.

# links.csv: contains the mapping of movielensID to imdbID and tmdbID
links = pd.read_csv('data/Movielens/ml-20m/links.csv')

# movies.csv: contains movie id, title and genre
movies = pd.read_csv('data/Movielens/ml-20m/movies.csv')

In [17]:
def read_movielens_to_tsv(file_output_tsv,file_input = "movies.csv"):
    '''
    To extract movielens movie ID and title, and save them to a tsv file and a json file.

    The input is read from ``data/Movielens/ml-20m/``. The tsv file is written
    to ``data/TSV_files/`` and the same mapping is dumped through ``dump_json``
    under the same base name with a ``.json`` extension.

    :param file_output_tsv : filename to save tsv file
    :param file_input : filename for the input (to be extracted); its first row is skipped as a header and columns 0 and 1 of each remaining row are used as ID and title
    :return : dictionary containing the same information as the tsv file (movieId -> title, both strings)
    '''
    # newline='' leaves newline handling to the csv module, as its documentation requires.
    with open("data/Movielens/ml-20m/" + file_input, newline='') as f:
        reader = csv.reader(f, delimiter=',', quotechar='"')
        next(reader, None)  # skip the header row (tolerates an empty file)
        movielens_movies = {row[0]: row[1] for row in reader}

    #Write to TSV files
    with open("data/TSV_files/" + file_output_tsv, 'wt', newline='') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow(['movieId', 'title'])
        tsv_writer.writerows(movielens_movies.items())

    # Strip only the last extension so that dotted base names are kept intact.
    dump_json(file_output_tsv.rsplit('.', 1)[0] + '.json', movielens_movies)

    return movielens_movies

In [18]:
# Build the movieId -> title dictionary; this call also writes movielens_movies.tsv and movielens_movies.json.
movielens_movies = read_movielens_to_tsv("movielens_movies.tsv")

# Read IMDB Data for ID mapped from MovieLens

Extract the IMDb primary title, original title, and alternative (AKA) titles.

In [38]:
# IMDb ID -> primary and original title.
# Each line is split on tabs: column 0 is used as the ID, columns 2 and 3 as the titles.
imdb_movies_basic = {}
with open("data/Imdb/data-basic.tsv") as f:
    next(f)  # skip the header line
    for line in f:
        text = line.split('\t')
        imdb_movies_basic[text[0]] = {'primaryTitle':text[2],'originalTitle':text[3]}

# IMDb ID -> list of all titles found for that ID (column 0 is the ID, column 2 the title).
imdb_aka = {}
with open("data/Imdb/data-aka.tsv") as f:
    next(f)  # skip the header line
    for line in f:
        split_line = line.split('\t')
        # setdefault creates the list on the first occurrence of an ID.
        imdb_aka.setdefault(split_line[0], []).append(split_line[2])

In [39]:
#extract mapping of movielens ID and IMDB
links_movielens_imdb_ori = {}          # movielens ID -> IMDb number exactly as found in links.csv
links_movielens_imdb_converted = {}    # movielens ID -> IMDb ID in 'tt…' form
links_imdb_movielens_converted = {}    # IMDb ID in 'tt…' form -> movielens ID

# newline='' leaves newline handling to the csv module, as its documentation requires.
with open("data/Movielens/ml-20m/links.csv", newline='') as f:
    reader = csv.reader(f, delimiter=',', quotechar='"')
    next(reader)  # skip the header row
    for row in reader:
        links_movielens_imdb_ori[row[0]] = row[1]

        # The IMDb number in the mapping is not 'ready to use': the ID built here is
        # 'tt' followed by the number left-padded with zeros to at least 7 digits
        # (6 digits -> 'tt0xxxxxx'); numbers with 7 or more digits are kept as they are.
        # csv.reader already yields strings, so no str() conversion is needed.
        converted_imdb_id = 'tt' + row[1].zfill(7)

        links_movielens_imdb_converted[row[0]] = converted_imdb_id
        links_imdb_movielens_converted[converted_imdb_id] = row[0]

In [44]:
#write tsv file and json file for IMDB mapping
movielens_imdb_mapping_title_aka = {}
# The path uses lower-case 'data/', the same directory dump_json writes the JSON twin to
# (the two spellings only coincide on a case-insensitive filesystem).
# newline='' leaves newline handling to the csv module, as its documentation requires.
with open("data/TSV_files/movielens_imdb_mapping_title_aka.tsv", 'wt', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['movielens_Id', 'movielens_Title', 'imdb_mapped_Id','imdb_Id','imdb_primaryTitle','imdb_originalTitle','imdb_aka'])

    for key, imdb_id in links_movielens_imdb_converted.items():
        # IDs missing from the basic data get 'N/A' for both titles.
        imdb_basic_temp = imdb_movies_basic.get(imdb_id, {})
        imdb_primaryTitle = imdb_basic_temp.get('primaryTitle','N/A')
        imdb_originalTitle = imdb_basic_temp.get('originalTitle','N/A')
        # List of titles, or '' when the ID has none; in the TSV the csv writer stores the list as its str() form.
        imdb_aka_temp = imdb_aka.get(imdb_id,'')

        movielens_title_temp = movielens_movies[key]
        imdb_mapped_id_temp = links_movielens_imdb_ori[key]

        tsv_writer.writerow([key, movielens_title_temp, imdb_mapped_id_temp, imdb_id, imdb_primaryTitle, imdb_originalTitle, imdb_aka_temp])
        movielens_imdb_mapping_title_aka[key]={'movielens_title': movielens_title_temp,
                                               'imdb_mapped_Id' : imdb_mapped_id_temp,
                                               'imdb_Id' : imdb_id,
                                               'imdb_primaryTitle' : imdb_primaryTitle,
                                               'imdb_originalTitle': imdb_originalTitle,
                                               'imdb_aka' : imdb_aka_temp
                                              }

dump_json('movielens_imdb_mapping_title_aka.json',movielens_imdb_mapping_title_aka)
    

In [63]:
# Check every converted IMDb ID against the two IMDb lookups:
#   not_available_in_basic : IDs missing from imdb_movies_basic, with their movielens details
#   not_available_in_aka   : IDs missing from imdb_aka
not_available_in_basic = {}
not_available_in_aka = []
for item in links_imdb_movielens_converted:
    if item not in imdb_movies_basic:
        movielensId = links_imdb_movielens_converted[item]
        available_aka = item in imdb_aka
        not_available_in_basic[item] = dict(
            movielens_id=movielensId,
            movielens_title=movielens_movies[movielensId],
            available_in_aka=available_aka,
        )
    if item not in imdb_aka:
        not_available_in_aka.append(item)

In [77]:
#There are 46 mapped imdb ids that are not found in the imdb dataset, for various reasons: some have an updated number, some are not available,
#and some are not in the basic data but are in the aka dataset

# One row per imdb id. Columns are renamed by name and then put in a fixed order, so the
# labels do not depend on the order in which pandas lays out the inner dictionary keys
# (assigning a positional list of names mislabels the columns whenever that order differs).
not_available_in_basic_df = (
    pd.DataFrame.from_dict(not_available_in_basic, orient='index')
    .rename_axis('Imdb_Id')
    .reset_index()
    .rename(columns={'movielens_id': 'movielens_Id', 'movielens_title': 'movielens_Title'})
    .reindex(columns=['Imdb_Id','available_in_aka','movielens_Id','movielens_Title'])
)
# Lower-case 'data/' is the same directory dump_json writes to on the next line.
not_available_in_basic_df.to_excel("data/TSV_files/IMDB_ID not found in IMDB datasets.xlsx")
dump_json('Imdb_Id_not_found.json',not_available_in_basic)

# Read Amazon data
Extract the Amazon ASIN and title of each item.

In [52]:
def get_title(string):
    '''
    Pull the title out of a line made of ': '-separated parts.

    With more than four parts, the title is everything between the first part
    and the last two parts, re-joined with ': '. Otherwise it is the second part.
    Surrounding whitespace is stripped. A line without any ': ' raises IndexError.

    :param string : the line to parse
    :return the extracted title
    '''
    parts = string.split(': ')
    if len(parts) <= 4:
        return parts[1].strip()
    return ': '.join(parts[1:-2]).strip()

In [58]:
def load_amazon_json(path):
    '''
    Lazily yield one record per line of the given file.

    Each non-blank line is evaluated as a Python expression; blank lines are skipped.

    :param path : path of the file to read
    :return generator of the evaluated records, in file order
    '''
    # The with-block closes the file once the generator is exhausted or closed.
    with open(path, 'r') as g:
        for l in g:
            if not l.strip():
                continue
            # NOTE(review): eval executes arbitrary code found in the file. Only use it on
            # trusted data; ast.literal_eval is the safe choice if every line is a plain literal.
            yield eval(l)

def write_amazon_data_to_file(file_path, output_path):
    '''
    To extract the Amazon asin and title of every record, and save them to tsv and json files.

    Outputs:
    - ``output_path`` : tsv file with the columns Amazon_Asin and Title
    - a json file with the asin -> title mapping, written through ``dump_json`` and named after the base name of ``output_path``
    - ``amazon_unique_meta_2.json`` : the de-duplicated full records, written through ``dump_json``

    :param file_path : path of the metadata file, read with ``load_amazon_json``
    :param output_path : path of the tsv file to write
    :return : dictionary mapping asin to title
    '''
    # Keying the records by asin removes duplicates automatically (a later record
    # replaces an earlier one with the same asin). The author checked that there are
    # 21927 exact data duplicates (dictionaries).
    unique_amazon_meta = {item['asin']: item for item in load_amazon_json(file_path)}

    amazon_movies = {}

    # newline='' leaves newline handling to the csv module, as its documentation requires.
    with open(output_path, 'wt', newline='') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow(['Amazon_Asin', 'Title'])

        for key, record in unique_amazon_meta.items():
            #some does not have title
            title_temp = record.get('title','N/A')
            #some title in a form of long script, this is to extract the title
            if title_temp.startswith('var aPageStart'):
                for line in title_temp.split('\n'):
                    if line.startswith('Amazon.com:'):
                        title_temp = get_title(line)

            # Over-long titles are replaced in the tsv file only; the returned
            # dictionary (and its json dump) keeps the title as extracted.
            if len(title_temp)>1000:
                tsv_writer.writerow([key, 'invalid title'])
            else:
                tsv_writer.writerow([key, title_temp])

            amazon_movies[key] = title_temp

    dump_json(output_path.split('/')[-1].split('.')[0]+'.json',amazon_movies)
    dump_json('amazon_unique_meta_2.json',unique_amazon_meta)
    return amazon_movies
    


In [59]:
# NOTE(review): these paths start with 'Data/' while dump_json writes to 'data/TSV_files/' — the two only name the same directory on a case-insensitive filesystem; confirm the intended spelling.
amazon_movies = write_amazon_data_to_file('Data/Amazon/meta_Movies_and_TV.json','Data/TSV_files/amazon_movie_id_2.tsv')