# Extracting Reviews from Amazon

In [53]:
import json
import pandas as pd
import string

In [54]:
def load_amazon_json(path):
    '''
    To lazily read a file holding one Json document per line
    :param path : path of the file to read
    :return a generator yielding the decoded object of each non-blank line, in file order
    '''
    # The context manager closes the file when the generator is exhausted or closed;
    # previously the handle was opened and never released.
    with open(path, 'r') as g:
        for l in g:
            # A blank line (e.g. a trailing newline at the end of the file) is not a
            # Json document, skip it instead of failing inside json.loads
            if not l.strip():
                continue
            yield json.loads(l)

def load_json(filename):
    '''
    To read a whole Json file in one go
    :param filename : path of the Json file to read
    :return the object decoded from the Json file
    '''
    with open(filename) as json_file:
        return json.load(json_file)

def dump_json(filename,obj_to_dump):
    '''
    To dump (mainly) dictionaries to Json for further processing
    :param filename : filename to save the jsonfile
    :param obj_to_dump : Json-serialisable object to write
    :return None
    '''
    # Leaving the `with` block already closes the file, so no explicit close() is needed
    with open(filename, 'w') as fp:
        json.dump(obj_to_dump, fp)


In [55]:
#simple preprocessing the reviews
def preprocess(text):
    '''
    To normalise a raw review text
    :param text : review text as a string
    :return the text with newlines and the characters ! " # . , ? turned into spaces,
            then trimmed and with every run of whitespace collapsed to one space
    '''
    # One translation table handles the newline and all six punctuation marks at once
    to_space = str.maketrans({char: ' ' for char in '\n!"#.,?'})
    # split() without argument never yields empty pieces, so re-joining both trims
    # the ends and collapses repeated whitespace
    return ' '.join(text.translate(to_space).split())
    

## Loading the reviews

In [56]:
# load_amazon_json is a generator function: nothing is parsed here, the file is read
# line by line only when `reviews` is iterated, and it can be consumed a single time
reviews = load_amazon_json('Data/Amazon/Movies_and_TV.json')

In [57]:
# Consume the review generator into a dictionary: key is the running number 0 - n,
# value is the decoded review record
dictionary_all_reviews = dict(enumerate(reviews))
    

In [68]:
#to save the reviews in dictionary for each asin as key, and list of reviews as the values,
#only for reviews with more than 5 words after preprocessing
dictionary_reviews = {}
for record in dictionary_all_reviews.values():
    # Records without a 'reviewText' field are treated as an empty review
    review_text = preprocess(record.get('reviewText',''))
    # An empty text has zero words, so the word-count test alone covers the former
    # `!= ''` check (which was combined with the bitwise `&` instead of `and`)
    if len(review_text.split()) > 5:
        dictionary_reviews.setdefault(record['asin'], []).append(review_text)

In [69]:
# Number of distinct asins that kept at least one review after the length filter
len(dictionary_reviews)

175907

In [70]:
#Get the total and the unique number of reviews for each ASIN (just for analysis)
# key: asin, value: number of reviews kept for it
total_number_reviews = {
    asin: len(review_list) for asin, review_list in dictionary_reviews.items()
}
# key: asin, value: number of distinct review texts kept for it
total_unique_reviews_per_asin = {
    asin: len(set(review_list)) for asin, review_list in dictionary_reviews.items()
}
# Grand totals over all asins
count_total_reviews = sum(total_number_reviews.values())
count_total_reviews_unique = sum(total_unique_reviews_per_asin.values())

In [74]:
# Total number of reviews left once exact duplicate texts are removed within each asin
count_total_reviews_unique

6572991

## Load Mapped movielens

In [75]:
#Load mapped movielens id
# The loaded object is iterated by key further down and each value is read through its
# 'matched' entry; presumably keys are movielens ids — verify against the mapping file
mapped_movielens_amazon=load_json('Data/mapped_ml_azn_4_10.json')

In [76]:
# dictionary with key: movielens id, value: list of asins
mapped_mlId_asin = {}
# dictionary with key: tuple of asins, value: list of movielens ids
# (there are asins which are mapped to more than 1 movielens id)
reverse_mapped = {}
for ml_id, entry in mapped_movielens_amazon.items():
    # The first element of every 'matched' item is taken as the asin
    matched_asins = [match[0] for match in entry['matched']]
    mapped_mlId_asin[ml_id] = matched_asins
    # Lists are not hashable, so the asin list is frozen into a tuple to serve as key
    reverse_mapped.setdefault(tuple(matched_asins), []).append(ml_id)

In [78]:
# Create dictionary with key : tuple of movielens ids, value : tuple of asins mapped to those movielens ids
# Due to the duplicates in title in amazon and movielens, there are ASINs which are mapped to more than one movielens ID
multi_movielens_key_mapped = {
    tuple(ml_ids): asin_group for asin_group, ml_ids in reverse_mapped.items()
}

## Tags Loading

In [79]:
# (movieId, tag) pairs, cleaned in a single chain so the cell never mutates a frame in place
tags_df = (
    pd.read_csv('Data/ml-20m/tags.csv')[['movieId','tag']]
    .dropna()  # There are 16 tags with NAN values, drop the rows
    .assign(tag=lambda frame: frame['tag'].map(lambda value: str(value).lower()))
    .drop_duplicates()  # Delete duplicate pair(movielens id, tag)
)

In [80]:
#to get list of tags for each movielens id, in a form of dictionary with key: single movielens_Id, value: list of tags
movieId_tag_dictionary = {}
for movie_id, tag in zip(tags_df['movieId'], tags_df['tag']):
    # Keys are stored as strings, the same type used for the ids in the mapping dictionaries
    movieId_tag_dictionary.setdefault(str(movie_id), []).append(tag)
    

In [81]:
# To make dictionary with key : tuple of movielens id, value: list of tags from each movielens id in the key,
# the tuple of movielens id taken from the multi_movielens_key_mapped
multi_movie_id_tag_dictionary = {}
for ml_ids in multi_movielens_key_mapped:
    # Gather the tags of every movielens id of the tuple; ids without tags add nothing
    collected_tags = [
        tag
        for movie_id in ml_ids
        for tag in movieId_tag_dictionary.get(movie_id,[])
    ]
    # Tuples for which no tag at all was found are left out of the dictionary
    if collected_tags:
        multi_movie_id_tag_dictionary[ml_ids] = list(set(collected_tags)) #make unique list of tags

## Write Reviews to Text files

In [94]:
# write data in one file (only movielens id that have tags assigned to it)
# since there is a limitation on review length, there are some asins with no review
# Each written line is "<comma separated movielens ids>\t<lower-cased review>"

# file2 receives every line; file3 is just for example, for analysing format, etc. with smaller file
# `with` makes sure both files are flushed and closed even when a write fails midway
with open("Data/reviews_txt_files/testing","w") as file2, \
        open("Data/reviews_txt_files/testing_example","w") as file3:
    for i,ml_Id in enumerate(multi_movielens_key_mapped):
        # Skip the movielens id tuples that have no tag at all
        if ml_Id not in multi_movie_id_tag_dictionary:
            continue
        asins = multi_movielens_key_mapped[ml_Id]
        for each_asin in asins:
            # dict.fromkeys removes duplicate review texts while keeping their first-seen order
            for review in dict.fromkeys(dictionary_reviews.get(each_asin,[])):
                output_line = ','.join(ml_Id)+'\t'+review.lower().replace('\n','')+"\n"
                file2.write(output_line)
                # Only the keys at positions 0-5 of multi_movielens_key_mapped feed the example file
                if i < 6:
                    file3.write(output_line)

In [92]:
# NOTE(review): `movielens_id_in_idx` is not defined in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel — restore its definition or drop the cell
len(set(movielens_id_in_idx))

12791

## Indexing Elastic Search

In [49]:
from elasticsearch import Elasticsearch

# Client built without any connection argument
# NOTE(review): presumably this targets a local Elasticsearch node — confirm for the installed client version
es = Elasticsearch()
# Uncomment to delete an existing index before rebuilding it
#es.indices.delete(index='movie_reviews_4_10_5_no_stemming_shingle')

In [51]:
# Initialize index

# Name of the index that is created and filled by the cells below
# NOTE(review): the name ends in "max3" while the shingle filter configured in this cell
# uses max_shingle_size 4 — confirm which of the two is intended before indexing
INDEX_NAME = "movie_reviews_4_10_5_no_stemming_shingle_max3"


#index setting using english analyzer (stemming included)
INDEX_SETTINGS = {
    "settings": {
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": 1,
        },
    },
    "mappings": {
        "properties": {
            # Full text field: analysed with the built-in "english" analyzer,
            # term vectors stored with positions
            "review": {
                "type": "text",
                "term_vector": "with_positions",
                "analyzer": "english",
            },
            # Exact-value fields, not analysed
            "tags": {"type": "keyword"},
            "movielens_ids": {"type": "keyword"},
        },
    },
}


#index setting if not including stemming
INDEX_SETTINGS_NO_STEMMING = {
    "settings": {
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": 1,
        },
        "analysis": {
            "analyzer": {
                # "standard" analyzer configured with the predefined english stop word list
                "my_english_analyzer": {
                    "type": "standard",
                    "stopwords": "_english_",
                },
            },
        },
    },
    "mappings": {
        "properties": {
            # Full text field analysed with the analyzer declared above,
            # term vectors stored with positions
            "review": {
                "type": "text",
                "term_vector": "with_positions",
                "analyzer": "my_english_analyzer",
            },
            # Exact-value fields, not analysed
            "tags": {"type": "keyword"},
            "movielens_ids": {"type": "keyword"},
        },
    },
}

#Index setting with shingles
# The opening brace has to sit on the same line as the assignment: `INDEX_MAPPING =`
# followed by a line break is a SyntaxError
INDEX_MAPPING = {
    "settings": {
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": 1
        },
        "analysis": {
            "analyzer": {
                "my_english_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "stopwords": "_english_",
                    # The shingle filter is listed first, ahead of lowercase and stop
                    "filter": ["shingle-filter", "lowercase", "stop"]
                }
            },
            "filter": {
                # Word n-grams of 2 to 4 tokens, single tokens are emitted as well
                "shingle-filter": {
                    "type": "shingle",
                    "min_shingle_size": 2,
                    "max_shingle_size": 4,
                    "output_unigrams": True
                }
            }
        }
    },
    "mappings": {
        "properties": {
            # Full text field analysed with the shingle analyzer declared above
            "review": {
                "type": "text",
                "term_vector": "with_positions",
                "analyzer": "my_english_analyzer"
            },
            # Exact-value fields, not analysed
            "tags": {
                "type": "keyword"
            },
            "movielens_ids": {
                "type": "keyword"
            },
        }
    }
}

# Same object under the INDEX_SETTINGS_* naming used by the two dictionaries above
INDEX_SETTINGS_NO_STEMMING_SHINGLE = INDEX_MAPPING




# create index if it doesn't exist
# INDEX_MAPPING is the shingle settings dictionary assigned just above; the name used here
# before (INDEX_SETTINGS_NO_STEMMING_SHINGLE) is not assigned in the visible notebook.
# `index` is passed by keyword because recent elasticsearch-py clients only accept keyword arguments.
if not es.indices.exists(index=INDEX_NAME):
    es.indices.create(index=INDEX_NAME, body=INDEX_MAPPING)
    print('Index created')
    


In [None]:
# Build index, by using the sorted_unique movie_reviews text file
# Each line is parsed as "<comma separated movielens ids>\t<review text>"
# The file is read inside `with` so the handle is closed as soon as the lines are in memory
with open('Data/reviews_txt_files/movie_reviews_sorted_unique_4_10_5.txt', 'r') as file1:
    Lines = file1.readlines()
number_of_lines = len(Lines)  # only used for the progress message

for i,line in enumerate(Lines):
    fields = line.split('\t')

    movielens_id = fields[0].split(',')
    # Strips the newline character
    review = fields[1].strip()
    # The tag dictionary is keyed by tuples of movielens ids, hence the conversion;
    # an id combination without tags raises KeyError here
    tags = multi_movie_id_tag_dictionary[tuple(movielens_id)]
    # The line number is used as document id, so re-running overwrites the same documents
    es.index(index=INDEX_NAME, id=i, body={'movielens_ids': movielens_id, 'review': review, 'tags': tags})

    if ((i+1)%1000 == 0):
        print('Done indexing for ' + str(i+1)  + ' lines out of ' + str(number_of_lines))


Index name description (the trailing number is the minimum review length in words):
- movie_reviews_4_10_2 : english analyzer (with stemming), reviews longer than 2 words
- movie_reviews_4_10_2_no_stemming : without stemming, reviews longer than 2 words
- movie_reviews_4_10_3 : english analyzer (with stemming), reviews longer than 3 words (not finished)
- movie_reviews_4_10_5_no_stemming : without stemming, reviews longer than 5 words
- movie_reviews_4_10_5 : english analyzer (with stemming), reviews longer than 5 words
- movie_reviews_4_10_5_no_stemming_shingle : adding shingles (min 2, max 2), with unigrams
- movie_reviews_4_10_5_no_stemming_shingle_max4 : adding shingles (min 2, max 4), with unigrams
- movie_reviews_4_10_5_no_stemming_shingle_max3 : adding shingles (min 2, max 3), with unigrams
- movie_reviews_4_10_5_no_stemming_shingle_only4 : only shingles of size 4 (without unigrams)
- movie_reviews_4_10_5_no_stemming_shingle_only3 : only shingles of size 3 (without unigrams)
- movie_reviews_4_10_5_no_stemming_shingle_only2 : only shingles of size 2 (without unigrams)

File name description:
- Data/reviews_txt_files/movie_reviews_sorted_unique_4_10_5.txt : sorted, de-duplicated reviews longer than 5 words
