In [8]:
from pymongo import MongoClient
import os
from dotenv import load_dotenv
import certifi

# Pull variables from a local .env file (if any) into the process environment,
# then read the MongoDB password from MONGOPWD.
load_dotenv()
mongopwd = os.getenv('MONGOPWD')
if not mongopwd:
    # os.getenv returns None when the variable is absent; formatting that into
    # the URI would silently produce the literal password "None" and only fail
    # later with an opaque authentication error.
    raise RuntimeError("MONGOPWD is not set; define it in the environment or in a .env file")

# NOTE(review): the password is inserted verbatim, so it presumably must already
# be percent-escaped if it contains characters such as '@', ':' or '/' — confirm.
mongourl = 'mongodb+srv://saksham:{}@supportservice.3md7h.mongodb.net/myFirstDatabase?retryWrites=true&w=majority'.format(mongopwd)

# Validate the server's TLS certificate against certifi's CA bundle.
client = MongoClient(mongourl, tlsCAFile=certifi.where())

# The database used by the notebook is selected here by name; it differs from
# the 'myFirstDatabase' path component of the URI above.
db = client['support-service']

In [6]:
import pandas as pd
import time
import json
import datetime

def convert_list_to_string(l, separator=""):
    """Flatten a (possibly nested) sequence into a single bracketed string.

    Each item is rendered as text and the pieces are joined between ``[`` and
    ``]``:

    * ``str`` items are used as-is;
    * ``list`` / ``set`` / ``tuple`` items are rendered recursively;
    * ``dict`` items are rendered with ``json.dumps``; values JSON cannot
      encode natively fall back to ``str`` instead of raising;
    * ``datetime.datetime`` items are formatted as ``%m/%d/%Y, %H:%M:%S``;
    * everything else (numbers, booleans, ``None``, other objects) is rendered
      with ``str`` so that unexpected item types no longer raise ``TypeError``.

    Args:
        l: Iterable of items to render.
        separator: Text placed between rendered items. Defaults to ``""``,
            which keeps the historical behaviour of concatenating items with
            no delimiter (e.g. ``[1, 2]`` -> ``"[12]"``).

    Returns:
        The rendered string, e.g. ``"[]"`` for an empty input.
    """
    parts = []
    for item in l:
        if isinstance(item, str):
            parts.append(item)
        elif isinstance(item, (list, set, tuple)):
            parts.append(convert_list_to_string(list(item), separator))
        elif isinstance(item, dict):
            # default=str keeps nested non-JSON values (datetimes, ids, ...)
            # from aborting the whole conversion.
            parts.append(json.dumps(item, default=str))
        elif isinstance(item, datetime.datetime):
            parts.append(item.strftime("%m/%d/%Y, %H:%M:%S"))
        else:
            parts.append(str(item))
    # A single join avoids the quadratic cost of repeated string concatenation.
    return "[" + separator.join(parts) + "]"

def dataframe_from_collection(current_collection_name):
    """Read every document of a collection into a pandas DataFrame.

    Despite its name, ``current_collection_name`` is the collection object
    itself: it must provide ``count_documents(filter)`` and ``find()``.

    Values are normalised in place on each fetched document:

    * the ``_id`` field is converted with ``str``;
    * ``int`` (which includes ``bool``), ``float`` and ``complex`` values are
      converted with ``str``;
    * ``list`` values are flattened with ``convert_list_to_string``;
    * all other values are left untouched.

    A progress line is printed every 10000 documents.

    Returns:
        A DataFrame with one row per document (empty if the collection is).
    """
    docu = []
    count = 0
    total_len = current_collection_name.count_documents({})
    print("Found", total_len, "documents.")
    for document in current_collection_name.find():
        # Only values of existing keys are replaced, so the dict never changes
        # size while it is being iterated.
        for key, value in document.items():
            # bool is a subclass of int, so it is covered by the numeric check.
            if key == '_id' or isinstance(value, (int, float, complex)):
                document[key] = str(value)
            elif isinstance(value, list):
                document[key] = convert_list_to_string(value)
        count += 1
        docu.append(document)
        if count % 10000 == 0:
            # The count was taken before the scan started; guard against a zero
            # total so the progress report can never raise ZeroDivisionError.
            percent = int(count * 100 / total_len) if total_len else 0
            print(count, "Documents fetched.................", percent, "%")

    return pd.DataFrame(docu)

def fetch_and_convert_data(database_, fetch_type = "selected", collection_name = []):
    """Export collections of a database to parquet files under ``./converted/``.

    Args:
        database_: Database object; indexed by collection name and, when
            ``fetch_type == 'all'``, asked for ``list_collection_names()``.
        fetch_type: ``'all'`` exports every collection in the database; any
            other value exports only the names given in ``collection_name``.
        collection_name: Names of the collections to export. The default list
            is never mutated (the name is only rebound), so sharing it between
            calls is safe.

    Returns:
        ``None`` when every collection was saved. If saving a collection
        fails, the error is printed, processing stops, and that collection's
        DataFrame is returned so the already-fetched data is not lost.
    """
    if fetch_type == 'all':
        collection_name = database_.list_collection_names()

    output_dir = "./converted/"

    for curr_collection in collection_name:
        t1 = time.time()
        print("Processing", curr_collection)
        df_collection = dataframe_from_collection(database_[curr_collection])
        print("Converted to Pandas Dataframe. Now, saving to parquet format")
        file_name = output_dir + curr_collection + '.parquet'
        try:
            # Create the target directory on demand; a missing directory used
            # to make every save fail silently.
            os.makedirs(output_dir, exist_ok=True)
            df_collection.to_parquet(file_name)
        except Exception as exc:
            # Catch Exception rather than everything so Ctrl-C still interrupts
            # a long export, and report why the save failed.
            print("Failed to save", file_name, "-", repr(exc))
            return df_collection
        print("Saved", file_name)
        t2 = time.time()
        print("Time Taken:", t2-t1, "seconds", end="\n\n")

    print('Done!')

In [7]:
result_df = fetch_and_convert_data(db, fetch_type="all")

Processing support_list
Found 21 documents.
Converted to Pandas Dataframe. Now, saving to parquet format
Saved ./converted/support_list.parquet
Time Taken: 2.6323611736297607 seconds

Processing webhook_error_logs
Found 0 documents.
Converted to Pandas Dataframe. Now, saving to parquet format
Saved ./converted/webhook_error_logs.parquet
Time Taken: 0.08346080780029297 seconds

Processing support_tickets
Found 680042 documents.
10000 Documents fetched................. 1 %
20000 Documents fetched................. 2 %
30000 Documents fetched................. 4 %
40000 Documents fetched................. 5 %
50000 Documents fetched................. 7 %
60000 Documents fetched................. 8 %
70000 Documents fetched................. 10 %
80000 Documents fetched................. 11 %
90000 Documents fetched................. 13 %
100000 Documents fetched................. 14 %
110000 Documents fetched................. 16 %
120000 Documents fetched................. 17 %
130000 Documents fet

  return pd.DataFrame(docu)


Converted to Pandas Dataframe. Now, saving to parquet format
Saved ./converted/support_tickets.parquet
Time Taken: 1384.187625169754 seconds

Processing support_tickets_rating
Found 21091 documents.
10000 Documents fetched................. 47 %
20000 Documents fetched................. 94 %
Converted to Pandas Dataframe. Now, saving to parquet format
Saved ./converted/support_tickets_rating.parquet
Time Taken: 24.923013925552368 seconds

Processing leader_kyc
Found 0 documents.
Converted to Pandas Dataframe. Now, saving to parquet format
Saved ./converted/leader_kyc.parquet
Time Taken: 0.09340429306030273 seconds

Processing support_form_items
Found 6 documents.
Converted to Pandas Dataframe. Now, saving to parquet format
Saved ./converted/support_form_items.parquet
Time Taken: 0.08456206321716309 seconds

Processing support_items
Found 94 documents.
Converted to Pandas Dataframe. Now, saving to parquet format
Saved ./converted/support_items.parquet
Time Taken: 0.18226265907287598 secon

1530000 Documents fetched................. 85 %
1540000 Documents fetched................. 86 %
1550000 Documents fetched................. 86 %
1560000 Documents fetched................. 87 %
1570000 Documents fetched................. 87 %
1580000 Documents fetched................. 88 %
1590000 Documents fetched................. 89 %
1600000 Documents fetched................. 89 %
1610000 Documents fetched................. 90 %
1620000 Documents fetched................. 90 %
1630000 Documents fetched................. 91 %
1640000 Documents fetched................. 91 %
1650000 Documents fetched................. 92 %
1660000 Documents fetched................. 93 %
1670000 Documents fetched................. 93 %
1680000 Documents fetched................. 94 %
1690000 Documents fetched................. 94 %
1700000 Documents fetched................. 95 %
1710000 Documents fetched................. 95 %
1720000 Documents fetched................. 96 %
1730000 Documents fetched...............