In [1]:
# Notebook setup: read the MongoDB host/port from config.ini and open a client.
import configparser
import warnings

# Install the filter before pymongo is imported so that DeprecationWarnings
# are suppressed from the import onwards, as in the original cell order.
warnings.filterwarnings("ignore", category=DeprecationWarning)

from pymongo import MongoClient

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; fail fast here instead of hitting an opaque
# KeyError on the lookups below.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini not found or unreadable in the working directory")

ip = config['DEFAULT']['IP']
port = config['DEFAULT']['MongoDB-Port']  # kept as a string; converted for the client only

client = MongoClient(ip, int(port))

In [2]:
# To see which databases exist on the server: print(client.list_database_names())

# Handles to the two databases used in this notebook.
db_twitter = client["Twitter"]
db_reddit = client["reddit"]

# list_collection_names() replaces collection_names(), which is deprecated
# since PyMongo 3.7 and removed in PyMongo 4.0.
collections_twitter = db_twitter.list_collection_names()
collections_reddit = db_reddit.list_collection_names()

In [3]:
# Count the documents in every collection of both databases and print the
# counts sorted by collection name.
#
# estimated_document_count() replaces find({}).count(): Cursor.count() is
# deprecated since PyMongo 3.7 and removed in 4.0. With no filter, the
# estimate is taken from collection metadata rather than by scanning documents.
#
# NOTE: the dict is keyed by collection name only, so a name that exists in
# both databases would keep just the reddit count.
dic_collection = {}
for database, collection_names in (
    (db_twitter, collections_twitter),
    (db_reddit, collections_reddit),
):
    for collection_name in collection_names:
        n_docs = database[collection_name].estimated_document_count()
        dic_collection[collection_name] = "{:,}".format(n_docs)

for key in sorted(dic_collection):
    print("%s: %s" % (key, dic_collection[key]))

family-violence-2019-01-11: 1,927,047
family-violence-unique-2019-01-11: 259
reddit: 1,563,998
reddit-2019-02: 6,962,647
reddit_temp: 217
twitter-australia: 436,247
twitter-other: 235,606,652
twitter-richard-2014: 211,755
twitter-richard-2015: 198,258
twitter-richard-2016: 10,801
twitter-richard-2017: 21,797,263
twitter-richard-2018: 10,679,787


# 1 Twitter
## Compute number of posts based on creation date

In [None]:
# Aggregation pipeline: number of tweets per calendar day.

# "timestamp_ms" is cast to a long and then to a date.
tweet_datetime = {"$toDate": {"$toLong": "$timestamp_ms"}}

# The date is rendered as a "YYYY-MM-DD" string, which becomes the group key.
tweet_day = {"$dateToString": {"format": "%Y-%m-%d", "date": tweet_datetime}}

pipeline = [
    # one output document per day, counting the tweets that fall on it
    {"$group": {"_id": tweet_day, "count": {"$sum": 1}}},
    # order the days ascending
    {"$sort": {"_id": 1}},
]

In [None]:
# Run the daily-count pipeline on the Australian tweets and keep the
# result documents in a list.
australia_tweets = db_twitter['twitter-australia']
aus_data = list(australia_tweets.aggregate(pipeline))

In [None]:
# Daily tweet counts for Australia, followed by the overall total.
# Result rows whose _id is empty/None are left out of both the listing
# and the total.

print('Tweet counts - Australia:\n')

dated_rows = [row for row in aus_data if row['_id']]
for row in dated_rows:
    print(row['_id'] + ':', row['count'])

total = sum(row['count'] for row in dated_rows)
print('\nTotal:', total)

In [None]:
# Same daily-count aggregation for the tweets from all other countries.
other_tweets = db_twitter['twitter-other']
other_data = [row for row in other_tweets.aggregate(pipeline)]

In [None]:
# Daily tweet counts for the other countries, followed by the overall total.
print('Tweet counts - other countries:\n')

total = 0
for row in other_data:
    day = row['_id']
    if not day:
        # rows without a usable date are not listed and not counted
        continue
    print(day + ':', row['count'])
    total += row['count']

print('\nTotal:', total)

In [None]:
# Write the daily tweet counts of both collections to one CSV file:
# one row per date, with the Australian and other-country counts side by side.

file_name = 'output/twitter-daily-collection.csv'

# Index both result sets by date so each lookup is a dict access instead of a
# linear scan of aus_data for every row. Rows with an empty/None _id are
# skipped, as before.
aus_by_date = {row['_id']: row['count'] for row in aus_data if row['_id']}
other_by_date = {row['_id']: row['count'] for row in other_data if row['_id']}

# Use the union of dates: driving the rows from other_data alone silently
# dropped any day that only has Australian tweets. The "%Y-%m-%d" date strings
# sort chronologically, so the existing row order is preserved.
all_dates = sorted(set(aus_by_date) | set(other_by_date))

with open(file_name, 'w') as f:
    # header
    f.write('Date,Tweets from Australia,Tweets from Other Countries\n')

    for date in all_dates:
        # a date missing from one side is written as 0 for that side
        aus_count = aus_by_date.get(date, 0)
        other_count = other_by_date.get(date, 0)
        f.write('{},{},{}\n'.format(date, aus_count, other_count))

print(file_name, 'is ready.')