In [1]:
import json
from pymongo import MongoClient
from bson.son import SON
from pprint import pprint
from random import randint
import os

In [2]:
def merge_two_dicts(x,y):
    """Combine two dictionaries without modifying either of them.

    A shallow copy of ``x`` is made first and the entries of ``y`` are then
    written into that copy, so a key present in both takes its value from ``y``.

    Returns:
        The new, merged dictionary.
    """
    combined = x.copy()
    combined.update(y)
    return combined

In [3]:
# Connect to the local MongoDB server
client = MongoClient('mongodb://localhost:27017/')
# Database object for the `careers` database
db = client.careers

# One input file per search term; each file name is the search term followed
# by the '_jobs' suffix.
career_paths = ['data scientist_jobs','data analyst_jobs','data engineer_jobs','machine learning engineer_jobs']
JOBS_SUFFIX = '_jobs'

# The import is NOT idempotent: every run inserts all posts again. Keep the
# flag False for normal runs and flip it to True only when (re)loading the
# `posts` collection from the files above.
LOAD_POSTS = False

i = 0
if LOAD_POSTS:
    for career in career_paths:
        # Value stored with every post under 'search': the file name without its suffix
        search_term = career[:-len(JOBS_SUFFIX)]
        with open(career) as f:
            # Each line holds a JSON array of job posts (16 per line according
            # to the original author's note)
            for line in f:
                for post in json.loads(line):
                    # Tag the post with its search term and store it
                    db.posts.insert_one(merge_two_dicts(post, {'search': search_term}))
                    i += 1

    # Parenthesised single-argument print behaves the same on Python 2 and 3
    print("{} jobs saved to database".format(i))


In [4]:
# Count the distinct values of the 'company' field across the posts collection
num_companies = len(db.posts.distinct('company'))
# Parenthesised single-argument print is valid as a Python 2 statement and as
# the Python 3 function, and produces the same text in both.
print('There are %d companies hiring data practitioners' % num_companies)

There are 3865 companies hiring data practitioners


In [5]:
# Count the distinct companies that have at least one post flagged as
# sponsored (sponsor == 'Y').
# distinct() with a filter lets the server de-duplicate the company names,
# which mirrors how the total company count is computed and avoids pulling
# every matching post through a client-side loop.
sponsors = set(db.posts.distinct('company', {'sponsor': 'Y'}))

num_sponsors = len(sponsors)
# Parenthesised single-argument print works on both Python 2 and Python 3
print('There are %d companies advertise' % num_sponsors)

There are 649 companies advertise


In [6]:
# Stage 1: collapse posts that share a job ID, keeping the first location seen
# for each job.
dedupe_by_job = {"$group": {"_id": "$job ID", "location": {"$first": "$location"}}}
# Stage 2: group the de-duplicated jobs by location and count them.
count_per_location = {"$group": {"_id": "$location", "total_jobs": {"$sum": 1}}}

pipeline = [dedupe_by_job, count_per_location]

# One result document per distinct location, so the list length is the number
# of distinct locations.
location_groups = list(db.posts.aggregate(pipeline))
pprint(len(location_groups))

706


In [17]:
# Aggregation pipeline yielding one document per distinct job ID:
#   1. $group   - bucket posts by "job ID", keeping the first "jd" value seen
#   2. $project - expose the group key as "job ID" again and drop "_id"
pipeline = [
    {"$group": {"_id": "$job ID", "jd": {"$first": "$jd"}}},
    {"$project": {"jd": 1, "job ID": "$_id", "_id": 0}},
]

In [18]:
def append_record(path,record):
    """Append ``record`` to the file at ``path`` as one line of JSON.

    The file is opened in append mode (and created if it does not exist), so
    repeated calls build up a JSON-lines file with one record per line.

    Args:
        path: Path of the file to append to.
        record: Any object that ``json.dump`` can serialise.
    """
    with open(path, 'a') as f:
        json.dump(record, f)
        # The file is opened in text mode, which already translates '\n' to
        # the platform line ending. Writing os.linesep here would produce
        # '\r\r\n' on Windows, so a plain '\n' is the correct terminator.
        f.write('\n')

In [19]:
# Export the documents produced by `pipeline` to a JSON-lines file.
# NOTE: `pipeline` is also assigned by the location-count cell; this export
# expects the job-ID / jd pipeline to be the one currently bound to the name.
path = 'raw_data'
i = 0
# Open the file once in write mode: re-running the cell then replaces the
# export instead of appending a second copy of every record, and the file is
# not reopened for each document.
with open(path, 'w') as out_file:
    for doc in db.posts.aggregate(pipeline):
        json.dump(doc, out_file)
        # Text mode translates '\n' to the platform line ending
        out_file.write('\n')
        i += 1
# Parenthesised single-argument print works on both Python 2 and Python 3
print("%d docs has been saved" % i)

11920 docs has been saved


In [20]:
# Load the exported JSON-lines file back into memory, one parsed record per line.
with open(path) as f:
    # Skip blank lines (e.g. a stray trailing newline) - json.loads raises on
    # an empty string, which would abort the whole load.
    my_list = [json.loads(line) for line in f if line.strip()]

In [26]:
# Aggregation pipeline yielding one document per distinct job ID:
#   1. $group   - bucket posts by "job ID", keeping the first "company" seen
#   2. $project - expose the group key as "job ID" again and drop "_id"
pipeline2 = [
    {"$group": {"_id": "$job ID", "company": {"$first": "$company"}}},
    {"$project": {"job ID": "$_id", "company": 1, "_id": 0}},
]

In [27]:
# Export the documents produced by `pipeline2` (job ID + company) to a
# JSON-lines file.
path2 = 'companies'
i = 0
# Open the file once in write mode: re-running the cell then replaces the
# export instead of appending a second copy of every record, and the file is
# not reopened for each document.
with open(path2, 'w') as out_file:
    for doc in db.posts.aggregate(pipeline2):
        json.dump(doc, out_file)
        # Text mode translates '\n' to the platform line ending
        out_file.write('\n')
        i += 1
# Parenthesised single-argument print works on both Python 2 and Python 3
print("%d docs has been saved" % i)

11920 docs has been saved


In [40]:
# Map step of the map-reduce job
from bson.code import Code

# (substring looked for in `location`, key emitted when it is found).
# Order matters: the generated JavaScript is an if / else-if chain, so a post
# is counted under the first entry whose substring occurs in its location.
region_by_state = [
    ('CA', 'Bay Area'),
    ('WA', 'Seattle'),
    ('MA', 'Boston'),
    ('NY', 'New York'),
]

# Build one JavaScript branch per table entry; the first branch opens with
# `if`, every later one with `else if`.
branches = []
for position, (state, region) in enumerate(region_by_state):
    keyword = "  if(" if position == 0 else "  else if("
    branches.append(
        keyword + "this.location.indexOf('" + state + "') > -1) {"
        + "    emit('" + region + "', 1);"
        + "   }"
    )

# The map function emits (region, 1) for each post matching one of the entries
mapper = Code("function () {" + "".join(branches) + "}")

In [41]:
# Reduce step: add up all the values emitted for a key and return the total.
reducer_js = "".join([
    "function (key, values) {",
    " var total = 0;",
    " for (var i = 0; i < values.length; i++) {",
    "   total += values[i];",
    " }",
    " return total;",
    "}",
])
reducer = Code(reducer_js)

In [42]:
# Run the map-reduce job over the posts collection; the output is written to
# the "myresults" collection, which `result` then refers to.
# NOTE: Collection.map_reduce was removed in PyMongo 4, so this cell needs a
# PyMongo 3.x client (or a port to an aggregation pipeline).
result = db.posts.map_reduce(mapper,reducer,"myresults")
for doc in result.find():
    # Parenthesised single-argument print works on both Python 2 and Python 3
    print(doc)

{u'_id': u'Bay Area', u'value': 11636.0}
{u'_id': u'Boston', u'value': 5129.0}
{u'_id': u'New York', u'value': 5565.0}
{u'_id': u'Seattle', u'value': 5677.0}
