# Loading tweets into mongodb

Sam Maurer, October 2018

This notebook loads tweets into MongoDB.

Make sure to launch the mongodb server first:  
`mongod --dbpath ~/mongodb-data`

In [1]:
import json
import pymongo
import time
import os
import zipfile

from bson import json_util  # special mongo json parsing

### Function definitions

In [8]:
def load_data(fpath, collection):
    """
    Load a single compressed JSON file into MongoDB.
    
    """
    t0 = time.time()
    data = []
    with zipfile.ZipFile(fpath) as z:
        with z.open(trim_zip(fpath)) as f:
            for line in f:
                data.append(json_util.loads(line))

    collection.insert_many(data)
    print(fpath.split('/')[-1])
    print(str(round(time.time()-t0,2)) + ' sec.')

In [9]:
def load_fpaths(fpaths, collection):
    """
    Load multiple files at once.
    
    """
    for fpath in fpaths:
        load_data(fpath, collection)

In [10]:
def trim_zip(fpath):
    """
    Return the name of the JSON file within the ZIP archive.
    
    """
    return fpath.split('/')[-1].split('.zip')[0]

In [11]:
def get_filepaths(dir_path):
    """
    Return list of filepaths in a directory (potentially only the ones after 
    a certain point).
    
    """
    flist = [dir_path + f for f in os.listdir(dir_path)]
    return flist

### Load data

In [5]:
client = pymongo.MongoClient()

In [7]:
db = client.tweets
collection = db.westcoastmonthly

In [12]:
path = '/Users/maurer/Dropbox/Data/Twitter/Westcoast-monthly-samples/json/'

In [13]:
flist = get_filepaths(path)
print(len(flist))

36


In [None]:
load_fpaths(flist, collection)

westcoast-20160601-192436.json.zip
164.29 sec.
westcoast-20160401-191211.json.zip
119.84 sec.
westcoast-20170101-175347.json.zip
127.39 sec.
westcoast-20160301-094718.json.zip
121.2 sec.
westcoast-20170902-122729.json.zip
136.28 sec.
westcoast-20180301-154248.json.zip
139.45 sec.
westcoast-20180601-095926.json.zip
147.59 sec.
westcoast-20170702-084313.json.zip
137.18 sec.
westcoast-20171004-112241.json.zip
135.61 sec.
westcoast-20180202-123000.json.zip
147.14 sec.
westcoast-20170601-140731.json.zip
144.26 sec.
westcoast-20151001-191836.json.zip
122.7 sec.
westcoast-20171201-230141.json.zip
151.2 sec.
westcoast-20180101-191327.json.zip
150.91 sec.
westcoast-20161101-084426.json.zip
127.39 sec.
westcoast-20160101-155746.json.zip
152.96 sec.
westcoast-20160501-094246.json.zip
134.9 sec.
westcoast-20160801-170131.json.zip
133.45 sec.
westcoast-20160701-225312.json.zip
118.35 sec.
westcoast-20180401-200911.json.zip
157.01 sec.
westcoast-20170401-060734.json.zip
156.15 sec.
westcoast-2016100

In [None]:
collection.count_documents({})

In [None]:
db.westcoastmonthly.drop()
collection = db.westcoastmonthly