# File for seeding test data into the MongoDB database

1. Books are taken from a free dataset available on Kaggle and converted into MongoDB documents
2. Users are generated with a free tool that produces random MongoDB documents and are read from a file
3. The other data is generated based on 1. and 2.

All configuration information (passwords, connection string, etc.) is stored in the config file, which is not added to the repository.

In [None]:
import csv 
import json
from bson.objectid import ObjectId

import asyncio
import nest_asyncio
import motor.motor_asyncio

from config import *

nest_asyncio.apply()

In [None]:
client = motor.motor_asyncio.AsyncIOMotorClient(connectionString)
db = client.test

try:
    # server_info() is asynchronous in Motor: it returns an awaitable, so
    # printing it without waiting only shows a pending future and a failed
    # connection is never reported here. Drive it to completion instead.
    # nest_asyncio.apply() (called after the imports) makes
    # run_until_complete() re-entrant in case a loop is already running.
    print(asyncio.get_event_loop().run_until_complete(client.server_info()))
except Exception:
    print("Unable to connect to the server.")

    # NOTE(review): this assignment sits inside the except branch, so the
    # 'masters' database is selected only when the connectivity check fails.
    # Confirm whether it was meant to run unconditionally.
    db = client['masters']
collection = db.books
collection

In [None]:
# Convert a given CSV file to JSON, preprocessing every row, and save the result to a .json file.

def csv_to_json(csvFilePath, jsonFilePath):
    """Convert the books CSV file into a JSON array of book documents.

    Every row gets a converted ``price``, a ``currency``, a list of
    ``generes``, ``pageCount``, ``publishedDate`` and a fixed ``imageUrl``.
    The source-only columns are then dropped. Rows sharing a ``title`` are
    collapsed into one entry (the last row with that title wins) and the
    number of remaining entries is printed.

    Args:
        csvFilePath: Path of the UTF-8 encoded CSV file to read.
        jsonFilePath: Path of the JSON file to write (overwritten).

    Raises:
        KeyError: If a row lacks ``price``, ``generes``, ``page_count``,
            ``published_date`` or ``title``.
        ValueError: If ``price`` or ``page_count`` is not numeric.
    """
    # Columns that must not end up in the stored documents. '' is the name
    # csv.DictReader gives to a column with an empty header cell.
    unusedColumns = ('', 'rating', 'voters', 'published_date', 'page_count')
    books = []

    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are handled by the reader itself.
    with open(csvFilePath, encoding='utf-8', newline='') as csvf:
        for row in csv.DictReader(csvf):
            # NOTE(review): 0.27 is presumably the exchange rate from the
            # dataset's currency to USD - confirm.
            row['price'] = round(float(row['price']) * 0.27, 2)
            row['currency'] = 'USD'
            row['generes'] = [genre.strip() for genre in row['generes'].split(",")]
            row['pageCount'] = int(row['page_count'])
            row['publishedDate'] = row['published_date']
            row['imageUrl'] = 'https://mastersimages.blob.core.windows.net/images/book.jpg'

            # pop() instead of del: a file without one of the optional
            # columns (e.g. no unnamed index column) must not fail.
            for column in unusedColumns:
                row.pop(column, None)

            books.append(row)

    # Deduplicate by title; a dict keeps the position of the first occurrence
    # and the value of the last one.
    result = list({book['title']: book for book in books}.values())
    print(len(result))

    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        jsonf.write(json.dumps(result, indent=4))
          
# Source CSV file and the JSON file produced from it.
csvFilePath, jsonFilePath = r'google_books_1299.csv', r'books.json'

# Run the conversion for the files named above.
csv_to_json(csvFilePath, jsonFilePath)

In [None]:
# inserting book items into the books collection in the database

def readJsonFile(fileName):
    """Return the parsed content of the UTF-8 encoded JSON file `fileName`."""
    with open(fileName, encoding='utf-8') as source:
        parsed = json.load(source)
    return parsed

async def do_insert(dataset, fileName):
    """Insert every item of the JSON file `fileName` into the `dataset` collection.

    The file is read with readJsonFile() and its items are passed to
    insert_many() in a single call.
    """
    documents = list(readJsonFile(fileName))
    await db[dataset].insert_many(documents)

# Upload the books produced by csv_to_json().
loop = asyncio.get_event_loop()
books_insert = do_insert('books', 'books.json')
loop.run_until_complete(books_insert)

In [None]:
# Insert the user items from users.json into the users collection in the database.

loop = asyncio.get_event_loop()
users_insert = do_insert('users', 'users.json')
loop.run_until_complete(users_insert)

In [None]:
# Select the documents of a collection (used for the books) from the database.

async def do_find(dataset):
    """Return the documents of the `dataset` collection as a list.

    No filter is applied; at most 1400 documents are fetched.
    """
    cursor = db[dataset].find({})
    fetched = await cursor.to_list(length=1400)
    return list(fetched)

# Select the active users from the database.

async def do_find_active_users():
    """Return the `users` documents whose `isActive` field is True, as a list.

    At most 1400 documents are fetched.
    """
    only_active = {'isActive': True}
    cursor = db['users'].find(only_active)
    fetched = await cursor.to_list(length=1400)
    return list(fetched)

# this data is used in the next blocks to generate "rates" and "orders" collections

In [None]:
import random
import datetime

# Generate the "rates" collection (book reviews) from the stored books and active users.

loop = asyncio.get_event_loop()
books = loop.run_until_complete(do_find('books'))
users = loop.run_until_complete(do_find_active_users())

if not books or not users:
    # Fail with a clear message instead of an IndexError further down.
    raise ValueError("Books and active users must be loaded before generating rates.")

rates = []

# Reviews are written for the first 100 books by users drawn from the first 10
# active users. Slicing (rather than indexing a fixed range) keeps the cell
# working when fewer documents are available.
reviewers = users[:10]

for book in books[:100]:
    reviews = []

    # Ten draws per book; a draw that hits a user who has already reviewed
    # this book is dropped, so a book gets between 1 and 10 reviews.
    for _ in range(10):
        reviewer = random.choice(reviewers)

        review = {
            'userId': reviewer['_id'],
            'rate': random.randint(1, 5),
            'comment': 'Lorem ipsum dolor sit amet..',
        }

        # The review is dated 0-3 days back from the current UTC day and
        # stored as a naive datetime at midnight (no time component).
        day_diff = random.randint(0, 3)
        review_day = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=day_diff)
        review['createdAt'] = datetime.datetime(review_day.year, review_day.month, review_day.day)

        if not any(item['userId'] == review['userId'] for item in reviews):
            reviews.append(review)

    rates.append({'bookId': book['_id'], 'reviews': reviews})

async def do_insert_data(data):
    """Insert every item of `data` into the `rates` collection in one call.

    Note: the orders cell further down defines a function with the same name
    that targets the `orders` collection.
    """
    documents = list(data)
    await db.rates.insert_many(documents)

# Upload the generated rates.
loop = asyncio.get_event_loop()
rates_insert = do_insert_data(rates)
loop.run_until_complete(rates_insert)

In [None]:
# Generate the "orders" collection from the books and active users loaded for the rates.

if not books or not users:
    # Fail with a clear message instead of an IndexError further down.
    raise ValueError("Books and active users must be loaded before generating orders.")

orders = []

# Orders draw from the first 120 active users and the first 246 books. The
# upper indexes are clamped so that smaller data sets do not raise IndexError.
max_user_index = min(119, len(users) - 1)
max_book_index = min(245, len(books) - 1)

for _ in range(300):
    books_amount = random.randint(1, 5)
    buyer = users[random.randint(0, max_user_index)]

    order = {'userId': buyer['_id']}

    # Books are drawn independently, so one order may list a book twice.
    order['booksId'] = [
        books[random.randint(0, max_book_index)]['_id']
        for _ in range(books_amount)
    ]

    # The order is dated 0-100 days back from the current UTC day and stored
    # as a naive datetime at midnight (no time component).
    day_diff = random.randint(0, 100)
    order_day = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=day_diff)
    order['createdAt'] = datetime.datetime(order_day.year, order_day.month, order_day.day)

    orders.append(order)

print(len(orders))

async def do_insert_data(data):
    """Insert every item of `data` into the `orders` collection in one call.

    Note: this replaces the function of the same name defined in the rates
    cell, which targets the `rates` collection.
    """
    documents = list(data)
    await db.orders.insert_many(documents)

# Upload the generated orders.
loop = asyncio.get_event_loop()
orders_insert = do_insert_data(orders)
loop.run_until_complete(orders_insert)

In [None]:
# Function for deleting the data uploaded to the database in case of any error.

async def do_delete_many():
    """Delete documents from the `rates` collection and print the count before and after."""
    rates_collection = db.rates

    count_before = await rates_collection.count_documents({})
    print('%s documents before calling delete_many()' % count_before)

    # NOTE(review): none of the documents generated in this file has an 'i'
    # field, so this filter presumably matches every document in the
    # collection - confirm, and consider an empty filter instead.
    await rates_collection.delete_many({'i': {'$ne': ''}})

    count_after = await rates_collection.count_documents({})
    print('%s documents after' % count_after)

loop = asyncio.get_event_loop()
cleanup = do_delete_many()
loop.run_until_complete(cleanup)