Based on an eye test, we observed that the original Amazon Kindle Metadata file we were given had many rows with missing data (title, price, description, etc.). This Jupyter notebook details the process of how we restored the relevant data.

In [None]:
# Necessary libraries
import ast
import json
import os
import time

import numpy as np
import pandas as pd
import pymongo
from pymongo import MongoClient

In [None]:
# Retrieve list of asins from original Amazon Kindle Metadata.
# Replace the path below with the location of the original Amazon Kindle Metadata.
# NOTE(review): the fixed slice [10:20] presumably relies on every line starting
# with "{'asin': '" (10 characters) followed by a 10-character asin -- confirm
# against the actual file before reusing this on other dumps.
# The file is streamed inside a context manager so the handle is always closed
# and the whole file is never held in memory as a list of lines.
with open('meta_Kindle_Store.json') as f:
    asin_meta = [line[10:20] for line in f]

In [None]:
# 'try3.csv' is scraped Amazon book data from Kaggle that we cleaned beforehand
# Kaggle URL -> https://www.kaggle.com/ucffool/amazon-sales-rank-data-for-print-and-kindle-books?select=amazon_com_extras.csv
df = pd.read_csv('try3.csv')
row_count = len(df)

# This dataset has neither a price nor a description, so every row receives
# the placeholders price = 0 and description = 'nil'
price_ls = np.zeros(row_count)
description_ls = ['nil'] * row_count
df['PRICE'] = price_ls
df['DESCRIPTION'] = description_ls

# Remove the columns that are not needed for the restoration
df.drop(columns=['GROUP', 'FORMAT', 'PUBLISHER'], inplace=True)
df.head()

In [None]:
# Turn df into a lookup keyed by ASIN whose values are the lists of the
# remaining column values of that row; used for matching later on
asin_indexed = df.set_index('ASIN')
title_author_dic = asin_indexed.transpose().to_dict(orient='list')

In [None]:
# Algorithm to attach relevant information from df to original metadata asin.
# Every asin read from the original metadata becomes a key; its value is the
# matching row from title_author_dic, or False when the first dataset has no
# entry for that asin.
start_time = time.time()
resultDict = {elem: title_author_dic.get(elem, False) for elem in asin_meta}
print(time.time()-start_time)

# Show only a few matched entries: printing the whole dictionary (one entry per
# metadata row) floods the notebook output and bloats the saved file.
preview = {}
for asin, info in resultDict.items():
    if info is not False:
        preview[asin] = info
        if len(preview) == 5:
            break
print(preview)

In [None]:
# Count the metadata asins that currently have data attached (value is not False)
count = sum(1 for info in resultDict.values() if info != False)
print(count)

In [None]:
# Second supplementary source: 'Kindle_Book_Dataset2.csv', scraped Amazon book
# data from Kaggle that we cleaned beforehand
# Kaggle URL -> https://www.kaggle.com/snathjr/kindle-books-dataset
kindle_books_csv = 'Kindle_Book_Dataset2.csv'
df_2 = pd.read_csv(kindle_books_csv)
df_2.head()

In [None]:
# This dataset has no asin column, so the asin is taken from the 'url' column:
# it is the text after the last '/' of each url
url_ls = df_2['url'].tolist()
asin_ls = [url.rsplit('/', 1)[1] for url in url_ls]

# Attach the extracted asins to the dataframe
df_2['asin'] = asin_ls

# Remove the columns that are not needed for the restoration
df_2.drop(
    columns=[
        'url', 'save', 'pages', 'size', 'publisher', 'language',
        'text_to_speech', 'x_ray', 'lending', 'customer_reviews', 'stars',
    ],
    inplace=True,
)
df_2.head()

In [None]:
# Turn df_2 into a lookup keyed by asin whose values are the lists of the
# remaining column values of that row; used for matching later on
asin_indexed_2 = df_2.set_index('asin')
title_author_dic_2 = asin_indexed_2.transpose().to_dict(orient='list')

In [None]:
# Algorithm to attach relevant information from df_2 to original metadata asin.
# Only asins that are still unmatched (value False) are filled in, so data
# attached in the earlier pass is never overwritten.
start_time = time.time()

filled = 0
for elem, info in resultDict.items():
    # Re-assigning the value of an existing key is safe while iterating
    if info is False and elem in title_author_dic_2:
        resultDict[elem] = title_author_dic_2[elem]
        filled += 1

print(time.time()-start_time)
# Report how many entries this pass filled instead of printing the whole
# dictionary (one entry per metadata row), which floods the notebook output.
print('asins filled from second dataset:', filled)

In [None]:
# Count the metadata asins that have data attached after both passes
matched_rows = [info for info in resultDict.values() if info != False]
count = len(matched_rows)
print(count)

In [None]:
# Check results: print the attached data of every matched asin
for info in resultDict.values():
    if info != False:
        print(info)

In [None]:
# Persist the combined lookup (asin -> attached row values, or False) to disk
with open('scrapped_data_with_desc.json','w') as outfile:
    json.dump(obj=resultDict, fp=outfile)

In [None]:
# Algorithm to separate the rows with full information into a consolidated file.
# Same matching as before, but against the second dataset only: each metadata
# asin maps to its row in title_author_dic_2, or to False when it is absent.
start_time = time.time()
resultDict_2 = {elem: title_author_dic_2.get(elem, False) for elem in asin_meta}

print(time.time()-start_time)

In [None]:
# Count the metadata asins that were matched in the second dataset
count = 0
for info in resultDict_2.values():
    count += 1 if info != False else 0
print(count)

In [None]:
# Persist the second-dataset-only lookup (asin -> row values, or False) to disk
with open('mini_scrapped_12k.json','w') as outfile:
    json.dump(obj=resultDict_2, fp=outfile)

The next part shows how we attach the retrieved data back onto the original metadata file, matching records by their `asin`.

In [None]:
# Open file with scrapped data and asins.
# The file holds a single JSON document (it is written with json.dump), so it
# is parsed straight from the handle. The context manager closes the file, and
# an empty file now raises json.JSONDecodeError instead of a NameError caused
# by a loop variable that was never assigned.
with open('scrapped_data_with_desc.json') as f2:
    res = json.load(f2)

In [None]:
# Algorithm to attach back asin with found data back to original metadata.
# 'meta_Kindle_Store_Compiled_3.json' is the new file we want to write to.
#
# For every metadata record:
#   * asin matched in res      -> fill in missing title/price/description from
#                                 the matched row and set the author
#   * asin present but False   -> fill in the placeholders 'nil' / 0.0
#   * asin not in res at all   -> record is written out unchanged
# As indexed below, a matched row is laid out as
# [title, author, price, description].
#
# Both files are opened in a context manager so they are closed even if a
# line fails to parse.
with open('meta_Kindle_Store.json', 'r') as working_data, \
        open('meta_Kindle_Store_Compiled_3.json', 'w') as compiled_file:
    for line in working_data:
        if not line.strip():
            # Skip blank lines rather than crashing on them
            continue
        # The metadata lines are parsed as Python literals (presumably dict
        # literals with single quotes, which is why json.loads is not used).
        # ast.literal_eval accepts only literals, so unlike eval() it cannot
        # execute code embedded in the file.
        line_dic = ast.literal_eval(line)
        asin = line_dic['asin']
        if asin in res:
            found = res[asin]
            if found is not False:
                if 'title' not in line_dic:
                    line_dic['title'] = found[0]
                if 'price' not in line_dic:
                    line_dic['price'] = found[2]
                if 'description' not in line_dic:
                    line_dic['description'] = found[3]
                line_dic['author'] = found[1]
            else:
                if 'title' not in line_dic:
                    line_dic['title'] = 'nil'
                if 'price' not in line_dic:
                    line_dic['price'] = 0.0
                if 'description' not in line_dic:
                    line_dic['description'] = 'nil'
                line_dic['author'] = 'nil'
        # One JSON document per output line
        compiled_file.write(json.dumps(line_dic) + '\n')

In [None]:
# Open up file with only full asin data.
# The file holds a single JSON document (it is written with json.dump), so it
# is parsed straight from the handle. The context manager closes the file, and
# an empty file now raises json.JSONDecodeError instead of a NameError caused
# by a loop variable that was never assigned.
with open('mini_scrapped_12k.json') as f3:
    res_2 = json.load(f3)
print(type(res_2))

In [None]:
# Create a separate json file with only full rows of information.
# Only metadata records whose asin was matched in res_2 are written; records
# that are unmatched (False) or absent from res_2 are skipped entirely.
# As indexed below, a matched row is laid out as
# [title, author, price, description].
#
# Both files are opened in a context manager so they are closed even if a
# line fails to parse.
with open('meta_Kindle_Store.json', 'r') as working_data, \
        open('meta_Kindle_Store_mini.json', 'w') as compiled_file_2:
    for line in working_data:
        if not line.strip():
            # Skip blank lines rather than crashing on them
            continue
        # The metadata lines are parsed as Python literals (presumably dict
        # literals with single quotes, which is why json.loads is not used).
        # ast.literal_eval accepts only literals, so unlike eval() it cannot
        # execute code embedded in the file.
        line_dic = ast.literal_eval(line)
        found = res_2.get(line_dic['asin'], False)
        if found is False:
            continue
        if 'title' not in line_dic:
            line_dic['title'] = found[0]
        if 'price' not in line_dic:
            line_dic['price'] = found[2]
        if 'description' not in line_dic:
            line_dic['description'] = found[3]
        line_dic['author'] = found[1]
        # One JSON document per output line
        compiled_file_2.write(json.dumps(line_dic) + '\n')

Algorithm to upload the data to the MongoDB server

In [None]:
# Upload the records of 'meta_Kindle_Store_mini.json' into the MongoDB
# collection testDb.meta_Kindle_12k, one document per line of the file.
#
# The connection string contains a username and password, so it is read from
# the MONGODB_URI environment variable instead of being hardcoded here.
# NOTE(review): the password that used to be embedded in this cell has been
# exposed in the notebook history and should be rotated.
url = os.environ.get('MONGODB_URI')
if not url:
    raise RuntimeError(
        'Set the MONGODB_URI environment variable to the MongoDB connection string'
    )
client = pymongo.MongoClient(url)  #connecting to mongo atlas
db = client.get_database('testDb') #connecting to database called testDb
meta_Kindle_12k = db.meta_Kindle_12k

count = 0
with open("meta_Kindle_Store_mini.json") as file:
    for line in file:
        try:
            # The file is JSON lines produced with json.dumps, so it is parsed
            # as JSON; eval() cannot read JSON's true/false/null/NaN and would
            # execute whatever the file contains.
            obj = json.loads(line)
            meta_Kindle_12k.insert_one(obj)
            # Count only after a successful insert so the total below is the
            # number of documents actually added
            count += 1
        except Exception as e:
            # Best effort: report the failing record and carry on with the rest
            print(e)

#just to check if correct number of entries added
print("number of object added", count)