In [126]:
# Importing the Libraries
import ast
import csv
import json
import os
import sys
from datetime import datetime, timedelta

import pandas as pd
from pymongo import MongoClient

## Job postings: building the merged dataset and loading it into MongoDB

In [216]:
# Folder containing the cleaned input CSVs for job postings; the files are linked by job_id
file_path_str = 'D:\\Veena\\SJSU-Classes\\Sem1\\DatabaseSystemsforAnalytics-225-24\\Lab2\\Output_files_lab1\\'

# Load the four CSV files in one pass, excluding the unnecessary "Unnamed: 0" column from each
job_skills_df, job_industries_df, benefits_df, job_postings_df = (
    pd.read_csv(file_path_str + csv_name, usecols=lambda col: col != "Unnamed: 0")
    for csv_name in (
        "job_skills_clean.csv",
        "job_industries_clean.csv",
        "benefits_clean.csv",
        "job_postings_clean.csv",
    )
)

In [169]:
# Preview the first rows of the job skills table (one row per job_id / skill_abr pair)
job_skills_df.head()

Unnamed: 0,job_id,skill_abr
0,3690843087,ACCT
1,3690843087,FIN
2,3691763971,MGMT
3,3691763971,MNFC
4,3691775263,MGMT


In [170]:
# Preview the first rows of the job industries table (one row per job_id / industry_id pair)
job_industries_df.head()

Unnamed: 0,job_id,industry_id
0,3378133231,68
1,3497509795,96
2,3690843087,47
3,3691775263,112
4,3691779379,80


In [171]:
# Preview the first rows of the benefits table (job_id, inferred flag, benefit type)
benefits_df.head()

Unnamed: 0,job_id,inferred,type
0,3690843087,0,Medical insurance
1,3690843087,0,Dental insurance
2,3690843087,0,401k
3,3690843087,0,Paid maternity leave
4,3690843087,0,Disability insurance


In [172]:
# Preview the first rows of the main job postings table
job_postings_df.head()

Unnamed: 0,job_id,company_id,title,description,max_salary,med_salary,min_salary,pay_period,formatted_work_type,location,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,listed_time_ts,expiry_ts,closed_time_ts
0,133114754,77766802.0,Sales Manager,Are you a dynamic and creative marketing profe...,,,,,Fulltime,Santa Clarita CA,...,,1690000000000.0,,0,FULL_TIME,,,2023-07-22 04:26:40,2023-11-14 22:13:20,
1,133196985,1089558.0,Model Risk Auditor,Join Us as a Model Risk Auditor Showcase Your...,,,,,Contract,New York NY,...,,1690000000000.0,,0,CONTRACT,,,2023-07-22 04:26:40,2023-11-14 22:13:20,
2,381055942,96654609.0,Business Manager,Business ManagerFirst Baptist Church ForneyFor...,,,,,Fulltime,Forney TX,...,,1690000000000.0,,0,FULL_TIME,,,2023-07-22 04:26:40,2023-11-14 22:13:20,
3,529257371,1244539.0,NY Studio Assistant,YOU COULD BE ONE OF THE MAGIC MAKERS\nKen Fulk...,,,,,Fulltime,New York NY,...,,1690000000000.0,,1,FULL_TIME,,,2023-07-22 04:26:40,2024-03-09 16:00:00,
4,903408693,3894635.0,Office Associate,Provide clerical and administrative support to...,42000.0,,37000.0,YEARLY,Fulltime,Albany GA,...,,1690000000000.0,,1,FULL_TIME,USD,BASE_SALARY,2023-07-22 04:26:40,2024-03-09 16:00:00,


In [173]:
def count_duplicates_per_column(df):
    """Count, for every column of ``df``, how many values repeat an earlier value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    The result of ``df.apply``: one count per column, labelled by column name.
    A count of 0 means every value in that column is unique.
    """
    def _repeated_values(column):
        # duplicated() flags each value already seen further up the column
        return column.duplicated().sum()

    return df.apply(_repeated_values)

In [174]:
# Count repeated values per column of the postings table (0 for job_id means one row per posting)
count_duplicates_per_column(job_postings_df)

job_id                            0
company_id                     9490
title                          4792
description                    2032
max_salary                    14209
med_salary                    15206
min_salary                    14292
pay_period                    15516
formatted_work_type           15513
location                      12562
applies                       15235
original_listed_time          15519
remote_allowed                15518
views                         14808
job_posting_url                   0
application_url                6634
application_type              15517
expiry                        15517
closed_time                   15518
formatted_experience_level    15513
skills_desc                   15378
listed_time                   15519
posting_domain                14223
sponsored                     15518
work_type                     15513
currency                      15518
compensation_type             15518
listed_time_ts              

In [175]:
# Count repeated values per column; repeated job_id values mean several benefit rows per posting
count_duplicates_per_column(benefits_df)

job_id       8265
inferred    13759
type        13749
dtype: int64

In [176]:
# Count repeated values per column; repeated job_id values mean several industry rows per posting
count_duplicates_per_column(job_industries_df)

job_id          6033
industry_id    21791
dtype: int64

In [177]:
# Count repeated values per column; repeated job_id values mean several skill rows per posting
count_duplicates_per_column(job_skills_df)

job_id       12262
skill_abr    27864
dtype: int64

In [217]:
# Collapse the benefits table to one row per job_id, reusing a single grouping for both columns
benefits_by_job = benefits_df.groupby('job_id')
# All benefit names (the 'type' column) of a posting gathered into one list
benefits_grouped_df = benefits_by_job['type'].agg(list).reset_index()
# The matching 'inferred' flags of a posting gathered into one list
inferred_grouped_df = benefits_by_job['inferred'].agg(list).reset_index()

In [179]:
# Preview the per-posting lists of inferred flags
inferred_grouped_df.head()

Unnamed: 0,job_id,inferred
0,85008768,"[0, 0, 0, 0, 0, 0]"
1,133114754,"[0, 0, 0]"
2,529257371,"[1, 1, 1]"
3,967848246,[1]
4,1483357774,"[0, 0, 0]"


In [181]:
# Check that job_id no longer repeats after grouping
count_duplicates_per_column(inferred_grouped_df)

job_id         0
inferred    5467
dtype: int64

In [180]:
# Preview the per-posting lists of benefit names
benefits_grouped_df.head()

Unnamed: 0,job_id,type
0,85008768,"[Medical insurance, Vision insurance, Dental i..."
1,133114754,"[Medical insurance, 401k, Vision insurance]"
2,529257371,"[Medical insurance, Vision insurance, Dental i..."
3,967848246,[401k]
4,1483357774,"[Medical insurance, Vision insurance, Dental i..."


In [182]:
# Check that job_id no longer repeats after grouping
count_duplicates_per_column(benefits_grouped_df)

job_id       0
type      5025
dtype: int64

In [218]:
# Bring the two grouped benefit tables together (left join on job_id) and expose the
# 'type' column under the clearer name 'benefits'
benefits_group_combined_df = pd.merge(
    benefits_grouped_df,
    inferred_grouped_df,
    how='left',
    on='job_id',
).rename(columns={'type': 'benefits'})
benefits_group_combined_df.head()

Unnamed: 0,job_id,benefits,inferred
0,85008768,"[Medical insurance, Vision insurance, Dental i...","[0, 0, 0, 0, 0, 0]"
1,133114754,"[Medical insurance, 401k, Vision insurance]","[0, 0, 0]"
2,529257371,"[Medical insurance, Vision insurance, Dental i...","[1, 1, 1]"
3,967848246,[401k],[1]
4,1483357774,"[Medical insurance, Vision insurance, Dental i...","[0, 0, 0]"


In [184]:
# (rows, columns) of the combined benefits table
benefits_group_combined_df.shape

(5496, 3)

In [185]:
# Check that job_id is still unique after combining the two grouped tables
count_duplicates_per_column(benefits_group_combined_df)

job_id         0
benefits    5025
inferred    5467
dtype: int64

In [219]:
# One row per job_id, with all of its industry_id values collected into a list
job_industries_grouped_df = (
    job_industries_df
    .groupby('job_id')['industry_id']
    .agg(list)
    .reset_index()
)
job_industries_grouped_df.head()

Unnamed: 0,job_id,industry_id
0,85008768,[42]
1,133114754,[92]
2,133196985,[104]
3,381055942,[89]
4,529257371,[99]


In [188]:
# Check that job_id no longer repeats after grouping
count_duplicates_per_column(job_industries_grouped_df)

job_id             0
industry_id    14535
dtype: int64

In [220]:
# One row per job_id, with all of its skill_abr values collected into a list
job_skills_grouped_df = (
    job_skills_df
    .groupby('job_id')['skill_abr']
    .agg(list)
    .reset_index()
)
job_skills_grouped_df.head()

Unnamed: 0,job_id,skill_abr
0,85008768,"[SALE, BD]"
1,133114754,"[SALE, BD]"
2,133196985,"[ACCT, FIN]"
3,529257371,"[DSGN, ART, IT]"
4,903408693,[ADM]


In [189]:
# Check that job_id no longer repeats after grouping
count_duplicates_per_column(job_skills_grouped_df)

job_id           0
skill_abr    14477
dtype: int64

In [221]:
# Remove postings rows that are exact copies of an earlier row
# (all columns are compared and the first occurrence is kept)
job_postings_df = job_postings_df.drop_duplicates(subset=None, keep='first')

In [222]:
# Left-join each per-job list table onto the postings in turn, so every posting is kept
# and gains the benefits/inferred, industry_id and skill_abr columns where a match exists
merged_job_postings_df = job_postings_df
for lookup_df in (benefits_group_combined_df, job_industries_grouped_df, job_skills_grouped_df):
    merged_job_postings_df = merged_job_postings_df.merge(lookup_df, how='left', on='job_id')

In [223]:
# Sanity check: preview the merged postings, including the list columns added by the merges
merged_job_postings_df.head()

Unnamed: 0,job_id,company_id,title,description,max_salary,med_salary,min_salary,pay_period,formatted_work_type,location,...,work_type,currency,compensation_type,listed_time_ts,expiry_ts,closed_time_ts,benefits,inferred,industry_id,skill_abr
0,133114754,77766802.0,Sales Manager,Are you a dynamic and creative marketing profe...,,,,,Fulltime,Santa Clarita CA,...,FULL_TIME,,,2023-07-22 04:26:40,2023-11-14 22:13:20,,"[Medical insurance, 401k, Vision insurance]","[0, 0, 0]",[92],"[SALE, BD]"
1,133196985,1089558.0,Model Risk Auditor,Join Us as a Model Risk Auditor Showcase Your...,,,,,Contract,New York NY,...,CONTRACT,,,2023-07-22 04:26:40,2023-11-14 22:13:20,,,,[104],"[ACCT, FIN]"
2,381055942,96654609.0,Business Manager,Business ManagerFirst Baptist Church ForneyFor...,,,,,Fulltime,Forney TX,...,FULL_TIME,,,2023-07-22 04:26:40,2023-11-14 22:13:20,,,,[89],
3,529257371,1244539.0,NY Studio Assistant,YOU COULD BE ONE OF THE MAGIC MAKERS\nKen Fulk...,,,,,Fulltime,New York NY,...,FULL_TIME,,,2023-07-22 04:26:40,2024-03-09 16:00:00,,"[Medical insurance, Vision insurance, Dental i...","[1, 1, 1]",[99],"[DSGN, ART, IT]"
4,903408693,3894635.0,Office Associate,Provide clerical and administrative support to...,42000.0,,37000.0,YEARLY,Fulltime,Albany GA,...,FULL_TIME,USD,BASE_SALARY,2023-07-22 04:26:40,2024-03-09 16:00:00,,,,"[27, 34]",[ADM]


In [194]:
# Checking size of the merged data as (rows, columns)
merged_job_postings_df.shape

(15520, 34)

In [195]:
# Confirm the left merges did not introduce repeated job_id values
count_duplicates_per_column(merged_job_postings_df)

job_id                            0
company_id                     9490
title                          4792
description                    2032
max_salary                    14209
med_salary                    15206
min_salary                    14292
pay_period                    15516
formatted_work_type           15513
location                      12562
applies                       15235
original_listed_time          15519
remote_allowed                15518
views                         14808
job_posting_url                   0
application_url                6634
application_type              15517
expiry                        15517
closed_time                   15518
formatted_experience_level    15513
skills_desc                   15378
listed_time                   15519
posting_domain                14223
sponsored                     15518
work_type                     15513
currency                      15518
compensation_type             15518
listed_time                 

In [225]:
# Save the merged postings as a comma-separated file with a header row and no index column
merged_job_postings_csv = r'D:\Veena\SJSU-Classes\Sem1\DatabaseSystemsforAnalytics-225-24\Lab2\OutputCSVFiles\merged_job_postings.csv'
merged_job_postings_df.to_csv(merged_job_postings_csv, index=False)

In [224]:
# Establishing connection to the local MongoDB instance to load the merged postings
mongodb_client = MongoClient('mongodb://localhost:27017') # connection string

# Handles for the target database and the job postings collection
db_ref = mongodb_client['LinkedIn_job_postings_DB']
collection_job = db_ref['job_postings']

# Convert the merged dataframe to a list of dictionaries, one document per posting.
# The benefits, inferred, industry_id and skill_abr cells are already Python lists
# (built with groupby + list), which pymongo stores as arrays, so no per-field
# conversion is needed. The former isinstance(value, dict) branch could never fire
# because no cell of this dataframe holds a dict.
# NOTE(review): postings without benefits/industries/skills carry NaN in those fields
# after the left merges, not an empty list - confirm that is acceptable for querying.
job_merge_dict = merged_job_postings_df.to_dict(orient="records")

# Insert all documents with one bulk call instead of one round trip per document.
# insert_many rejects an empty list, hence the guard.
if job_merge_dict:
    collection_job.insert_many(job_merge_dict)

In [229]:
# Connection string for the cloud (Atlas) instance of MongoDB.
# The real URI, including the password, should be supplied through the MONGODB_ATLAS_URI
# environment variable so that credentials never have to be typed into the notebook.
# The fallback is the original template, whose <password> placeholder must be replaced
# before it can authenticate.
mongodb_client_atlas = MongoClient(
    os.environ.get(
        'MONGODB_ATLAS_URI',
        'mongodb+srv://veenabeknal:<password>@linkedinjobpostingsdbcl.59k7czo.mongodb.net/',
    )
)

# Handles for the target database and the job postings collection on Atlas
db_ref_atlas = mongodb_client_atlas['LinkedIn_job_postings_DB']
collection_job_atlas = db_ref_atlas['job_postings']

# Convert the merged dataframe to a fresh list of dictionaries, one document per posting.
# The list-valued cells (benefits, inferred, industry_id, skill_abr) are already Python
# lists, which pymongo stores as arrays; the former isinstance(value, dict) branch could
# never fire because no cell of this dataframe holds a dict.
job_merge_dict = merged_job_postings_df.to_dict(orient="records")

# Insert all documents with one bulk call instead of one round trip per document.
# insert_many rejects an empty list, hence the guard.
if job_merge_dict:
    collection_job_atlas.insert_many(job_merge_dict)

In [226]:
# Alternate approach: reading in merged csv and convert to JSON.
# The CSV stores the list columns as text (for example "['SALE', 'BD']"), so they are
# parsed back into real lists while reading. Without this step the JSON file would
# contain strings instead of arrays for these fields.
def _parse_list_cell(cell_text):
    """Turn the CSV text of a list cell back into a list; empty cells become None."""
    if not cell_text:
        return None
    try:
        return ast.literal_eval(cell_text)
    except (ValueError, SyntaxError):
        # Not a plain Python literal (e.g. a list containing nan): keep the raw text
        return cell_text

job_merge_df = pd.read_csv(
    r'D:\Veena\SJSU-Classes\Sem1\DatabaseSystemsforAnalytics-225-24\Lab2\OutputCSVFiles\merged_job_postings.csv',
    converters={
        list_column: _parse_list_cell
        for list_column in ('benefits', 'inferred', 'industry_id', 'skill_abr')
    },
)
job_merge_df.to_json (r'D:\Veena\SJSU-Classes\Sem1\DatabaseSystemsforAnalytics-225-24\Lab2\JsonOutputFiles\merged_job_postings.json',orient ='records')

## Companies: building the merged dataset and loading it into MongoDB

In [197]:
# Folder containing the cleaned input CSVs for companies; the files are linked by company_id
file_path_str = 'D:\\Veena\\SJSU-Classes\\Sem1\\DatabaseSystemsforAnalytics-225-24\\Lab2\\Output_files_lab1\\'

# Load the three CSV files in one pass, excluding the unnecessary "Unnamed: 0" column from each
companies_df, company_industries_df, company_specialities_df = (
    pd.read_csv(file_path_str + csv_name, usecols=lambda col: col != "Unnamed: 0")
    for csv_name in (
        "companies_clean.csv",
        "company_industries_clean.csv",
        "company_specialities_clean.csv",
    )
)

In [198]:
# Preview the first rows of the main companies table
companies_df.head()

Unnamed: 0,company_id,name,description,company_size,state,country,city,zip_code,address,url
0,1009,IBM,At IBM we do more than work We create We creat...,7.0,NY,US,Armonk New York,10504,International Business Machines Corp,https://www.linkedin.com/company/ibm
1,1016,GE HealthCare,Every day millions of people feel the impact o...,7.0,Not Available,US,Chicago,0,,https://www.linkedin.com/company/gehealthcare
2,1021,GE Power,GE Power part of GE Vernova is a world energy ...,7.0,NY,US,Schenectady,12345,1 River Road,https://www.linkedin.com/company/gepower
3,1025,Hewlett Packard Enterprise,Official LinkedIn of Hewlett Packard Enterpris...,7.0,Texas,US,Houston,77389,1701 E Mossy Oaks Rd Spring,https://www.linkedin.com/company/hewlett-packa...
4,1028,Oracle,Were a cloud technology company that provides ...,7.0,Texas,US,Austin,78741,2300 Oracle Way,https://www.linkedin.com/company/oracle


In [199]:
# Count repeated values per column of the companies table (0 for company_id means one row per company)
count_duplicates_per_column(companies_df)

company_id         0
name              37
description       98
company_size    6055
state           5743
country         6013
city            4402
zip_code        3167
address         1008
url               33
dtype: int64

In [200]:
# Preview the first rows of the company industries table (company_id, industry)
company_industries_df.head()

Unnamed: 0,company_id,industry
0,81149246,Higher Education
1,10033339,Information Technology Services
2,6049228,Accounting
3,2641066,Electrical Electronic Manufacturing
4,96649998,Marketing Advertising


In [201]:
# Count repeated values per column to see whether a company can have several industry rows
count_duplicates_per_column(company_industries_df)

company_id       0
industry      5862
dtype: int64

In [202]:
# One row per company_id, with all of its industry values collected into a list
company_industries_grouped_df = (
    company_industries_df
    .groupby('company_id')['industry']
    .agg(list)
    .reset_index()
)
company_industries_grouped_df.head()

Unnamed: 0,company_id,industry
0,1009,[Information Technology Services]
1,1016,[Hospital Health Care]
2,1021,[Renewables Environment]
3,1025,[Information Technology Services]
4,1028,[Information Technology Services]


In [203]:
# Preview the first rows of the company specialities table (company_id, speciality)
company_specialities_df.head()

Unnamed: 0,company_id,speciality
0,81149246,Childrens Music Education
1,81149246,Foundational Music Theory
2,81149246,Child Music Lessons
3,81149246,social emotional learning
4,81149246,social emotional development


In [204]:
# Count repeated values per column; repeated company_id values mean several speciality rows per company
count_duplicates_per_column(company_specialities_df)

company_id    38118
speciality    17882
dtype: int64

In [205]:
# One row per company_id, with all of its speciality values collected into a list
company_specialities_grouped_df = (
    company_specialities_df
    .groupby('company_id')['speciality']
    .agg(list)
    .reset_index()
)
company_specialities_grouped_df.head()

Unnamed: 0,company_id,speciality
0,1009,"[Cloud, Mobile, Cognitive, Security, Research,..."
1,1016,"[Healthcare, Biotechnology]"
2,1021,"[Distributed Power, Gasification, Generators, ..."
3,1028,"[enterprise, software, applications, database,..."
4,1038,"[Audit, Consulting, Financial Advisory, Risk M..."


In [206]:
# Remove company rows that are exact copies of an earlier row
# (all columns are compared and the first occurrence is kept)
companies_df = companies_df.drop_duplicates(subset=None, keep='first')

In [207]:
# Left-join the grouped industry and speciality lists onto the companies table so every
# company is kept, then expose 'name' as 'company_name' for better understanding
merged_companies_df = (
    companies_df
    .merge(company_industries_grouped_df, how='left', on='company_id')
    .merge(company_specialities_grouped_df, how='left', on='company_id')
    .rename(columns={'name': 'company_name'})
)

In [208]:
# Sanity check: preview the merged companies, including the industry and speciality list columns
merged_companies_df.head()

Unnamed: 0,company_id,company_name,description,company_size,state,country,city,zip_code,address,url,industry,speciality
0,1009,IBM,At IBM we do more than work We create We creat...,7.0,NY,US,Armonk New York,10504,International Business Machines Corp,https://www.linkedin.com/company/ibm,[Information Technology Services],"[Cloud, Mobile, Cognitive, Security, Research,..."
1,1016,GE HealthCare,Every day millions of people feel the impact o...,7.0,Not Available,US,Chicago,0,,https://www.linkedin.com/company/gehealthcare,[Hospital Health Care],"[Healthcare, Biotechnology]"
2,1021,GE Power,GE Power part of GE Vernova is a world energy ...,7.0,NY,US,Schenectady,12345,1 River Road,https://www.linkedin.com/company/gepower,[Renewables Environment],"[Distributed Power, Gasification, Generators, ..."
3,1025,Hewlett Packard Enterprise,Official LinkedIn of Hewlett Packard Enterpris...,7.0,Texas,US,Houston,77389,1701 E Mossy Oaks Rd Spring,https://www.linkedin.com/company/hewlett-packa...,[Information Technology Services],
4,1028,Oracle,Were a cloud technology company that provides ...,7.0,Texas,US,Austin,78741,2300 Oracle Way,https://www.linkedin.com/company/oracle,[Information Technology Services],"[enterprise, software, applications, database,..."


In [209]:
# Checking size of the merged companies data as (rows, columns)
merged_companies_df.shape

(6063, 12)

In [227]:
# Save the merged companies as a comma-separated file with a header row and no index column
merged_companies_csv = r'D:\Veena\SJSU-Classes\Sem1\DatabaseSystemsforAnalytics-225-24\Lab2\OutputCSVFiles\merged_companies.csv'
merged_companies_df.to_csv(merged_companies_csv, index=False)

In [210]:
# Collection handle for companies, using the previously established local connection
collection_companies = db_ref['companies']

# Convert the merged dataframe to a list of dictionaries, one document per company.
# The industry and speciality cells are already Python lists (built with groupby + list),
# which pymongo stores as arrays; the former isinstance(value, dict) branch could never
# fire because no cell of this dataframe holds a dict.
# NOTE(review): companies without industries/specialities carry NaN in those fields
# after the left merges, not an empty list - confirm that is acceptable for querying.
companies_merge_dict = merged_companies_df.to_dict(orient="records")

# Insert all documents with one bulk call instead of one round trip per document.
# insert_many rejects an empty list, hence the guard.
if companies_merge_dict:
    collection_companies.insert_many(companies_merge_dict)

In [230]:
# Collection handle for companies on the cloud (Atlas) instance, using the previously
# established Atlas connection
collection_companies_atlas = db_ref_atlas['companies']

# Convert the merged dataframe to a fresh list of dictionaries, one document per company.
# The industry and speciality cells are already Python lists, which pymongo stores as
# arrays; the former isinstance(value, dict) branch could never fire because no cell of
# this dataframe holds a dict.
companies_merge_dict = merged_companies_df.to_dict(orient="records")

# Insert all documents with one bulk call instead of one round trip per document.
# insert_many rejects an empty list, hence the guard.
if companies_merge_dict:
    collection_companies_atlas.insert_many(companies_merge_dict)

## Employee counts: kept as a standalone dataset (not merged with the other tables)

In [211]:
# Folder containing the cleaned input CSVs
file_path_str = 'D:\\Veena\\SJSU-Classes\\Sem1\\DatabaseSystemsforAnalytics-225-24\\Lab2\\Output_files_lab1\\'

# Load the employee count CSV, excluding the unnecessary "Unnamed: 0" column
employee_counts_csv = file_path_str + "employee_counts_clean.csv"
employee_counts_df = pd.read_csv(employee_counts_csv, usecols=lambda col: col != "Unnamed: 0")

In [212]:
# Creating timestamp and date columns to make the data easier to query in mongodb.
# time_recorded holds epoch seconds, so the unit is passed to pandas directly instead of
# hand-scaling the values to nanoseconds with * 1e9 in float64.
employee_counts_df['time_recorded_ts'] = pd.to_datetime(employee_counts_df['time_recorded'], unit='s')
# Calendar date of the recording as a 'YYYY-MM-DD' string
employee_counts_df['date_recorded'] = employee_counts_df['time_recorded_ts'].dt.strftime('%Y-%m-%d')

In [213]:
# Preview the employee counts together with the derived timestamp and date columns
employee_counts_df.head()

Unnamed: 0,company_id,employee_count,follower_count,time_recorded,time_recorded_ts,date_recorded
0,81149246,6,91,1692645000.0,2023-08-21 19:04:04.277973504,2023-08-21
1,10033339,3,187,1692645000.0,2023-08-21 19:04:04.277973504,2023-08-21
2,6049228,20,82,1692645000.0,2023-08-21 19:04:05.101318400,2023-08-21
3,2641066,45,2336,1692645000.0,2023-08-21 19:04:05.923216640,2023-08-21
4,96649998,0,2,1692645000.0,2023-08-21 19:04:05.924218880,2023-08-21


In [214]:
# Remove exact duplicate rows and keep the result. drop_duplicates() returns a new
# DataFrame, so without the assignment the later CSV export and MongoDB inserts would
# still receive the duplicated rows.
employee_counts_df = employee_counts_df.drop_duplicates()
employee_counts_df

Unnamed: 0,company_id,employee_count,follower_count,time_recorded,time_recorded_ts,date_recorded
0,81149246,6,91,1.692645e+09,2023-08-21 19:04:04.277973504,2023-08-21
1,10033339,3,187,1.692645e+09,2023-08-21 19:04:04.277973504,2023-08-21
2,6049228,20,82,1.692645e+09,2023-08-21 19:04:05.101318400,2023-08-21
3,2641066,45,2336,1.692645e+09,2023-08-21 19:04:05.923216640,2023-08-21
4,96649998,0,2,1.692645e+09,2023-08-21 19:04:05.924218880,2023-08-21
...,...,...,...,...,...,...
12546,5619,13650,321989,1.692869e+09,2023-08-24 09:15:21.000000000,2023-08-24
12547,74718032,655,6340,1.692869e+09,2023-08-24 09:15:21.000000000,2023-08-24
12548,38897,2009,13632,1.692869e+09,2023-08-24 09:15:21.000000000,2023-08-24
12549,2623,6769,58204,1.692869e+09,2023-08-24 09:15:21.000000000,2023-08-24


In [228]:
# Save the employee counts as a comma-separated file with a header row and no index column
merged_employee_counts_csv = r'D:\Veena\SJSU-Classes\Sem1\DatabaseSystemsforAnalytics-225-24\Lab2\OutputCSVFiles\merged_employee_counts.csv'
employee_counts_df.to_csv(merged_employee_counts_csv, index=False)

In [215]:
# Collection handle for employee counts, using the previously established local connection
collection_emp_count = db_ref['employee_counts']

# Convert the dataframe to a list of dictionaries, one document per row.
# The columns used here are built without any grouping into lists or dicts, so no
# per-field conversion is needed; the former isinstance(value, dict) branch could never fire.
emp_count_dict = employee_counts_df.to_dict(orient="records")

# Insert all documents with one bulk call instead of one round trip per document.
# insert_many rejects an empty list, hence the guard.
if emp_count_dict:
    collection_emp_count.insert_many(emp_count_dict)

In [233]:
# Collection handle for employee counts on the cloud (Atlas) instance, using the
# previously established Atlas connection
collection_emp_count_atlas = db_ref_atlas['employee_counts']

# Convert the dataframe to a fresh list of dictionaries, one document per row.
# The columns used here are built without any grouping into lists or dicts, so no
# per-field conversion is needed; the former isinstance(value, dict) branch could never fire.
emp_count_dict = employee_counts_df.to_dict(orient="records")

# Insert all documents with one bulk call instead of one round trip per document.
# insert_many rejects an empty list, hence the guard.
if emp_count_dict:
    collection_emp_count_atlas.insert_many(emp_count_dict)