In [1]:
# Connection setup: read the MongoDB host/port from config.ini and open a client.
import configparser
import warnings

# Installed before the pymongo import so that DeprecationWarnings raised
# while importing are silenced as well.
warnings.filterwarnings("ignore", category=DeprecationWarning)

from pymongo import MongoClient

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; an empty list means config.ini was not loaded.
# Fail here with a clear message instead of a confusing KeyError below.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "config.ini could not be read; it must define 'IP' and "
        "'MongoDB-Port' under [DEFAULT]"
    )

ip = config['DEFAULT']['IP']
port = config['DEFAULT']['MongoDB-Port']  # raw string from the file; converted below

client = MongoClient(ip, int(port))

In [2]:
# Handles to the two databases used in this notebook.
db_twitter = client["Twitter"]
db_reddit = client["reddit"]

# Names of every collection in each database.
# list_collection_names() replaces collection_names(), which has been
# deprecated since PyMongo 3.7 and is removed in PyMongo 4.
collections_twitter = db_twitter.list_collection_names()
collections_reddit = db_reddit.list_collection_names()

In [4]:
# Print "<collection>: <document count>" for every collection, Twitter first,
# then reddit.
#
# estimated_document_count() replaces find({}).count(): Cursor.count() has been
# deprecated since PyMongo 3.7 and is removed in PyMongo 4. With an empty filter
# the estimate is read from collection metadata, so it stays fast even on very
# large collections (count_documents({}) would scan every document instead).
for database, collection_names in (
    (db_twitter, collections_twitter),
    (db_reddit, collections_reddit),
):
    for name in collection_names:
        print(name + ": {:,}".format(database[name].estimated_document_count()))

twitter-temp: 0
twitter-australia: 312,347
twitter-other: 170,516,309
reddit: 1,369,789
family-violence-unique-2019-01-11: 259
reddit-2019-02: 319,715
family-violence-2019-01-11: 1,927,047
reddit_temp: 0


# 1 Twitter
## Compute number of posts based on creation date

In [4]:
# Aggregation pipeline: count documents per calendar day.
# The grouping key is `timestamp_ms` converted to a long, then to a date,
# then formatted as a YYYY-MM-DD string.

pipeline = [
    # Stage 1: one bucket per day, counting the documents that fall into it.
    {"$group": {
        "_id": {"$dateToString": {
            "format": "%Y-%m-%d",
            "date": {"$toDate": {"$toLong": "$timestamp_ms"}},
        }},
        "count": {"$sum": 1},
    }},
    # Stage 2: order the buckets by date, ascending.
    {"$sort": {"_id": 1}},
]

In [5]:
# Per-day counts for the Australian tweets; the aggregation cursor is
# materialised into a list so it can be iterated more than once.

aus_data = [*db_twitter['twitter-australia'].aggregate(pipeline)]

In [6]:
# List the daily counts for Australia and their sum.
# Groups whose `_id` is falsy are left out of both the listing and the total.

print('Tweet counts - Australia:\n')
total = 0
for day in aus_data:
    if not day['_id']:
        continue
    print('{}:'.format(day['_id']), day['count'])
    total += day['count']

print('\nTotal:', total)

Tweet counts - Australia:


Total: 0


In [7]:
# Per-day counts for tweets from other countries, materialised into a list.
other_data = [*db_twitter['twitter-other'].aggregate(pipeline)]

In [8]:
# List the daily counts for the other countries and their sum.
# Groups whose `_id` is falsy are left out of both the listing and the total.
print('Tweet counts - other countries:\n')
total = 0
for day in other_data:
    if not day['_id']:
        continue
    print('{}:'.format(day['_id']), day['count'])
    total += day['count']

print('\nTotal:', total)

Tweet counts - other countries:

2019-01-04: 189
2019-01-09: 43440

Total: 43629


In [9]:
# Write the per-day tweet counts of both collections to a single CSV file,
# one row per date: date, Australian count, other-countries count.
import os

file_name = 'output/twitter-daily-collection.csv'

# Index both result sets by date so each row is a dictionary lookup instead of
# a linear search. Groups with a falsy `_id` are skipped.
aus_by_date = {day['_id']: day['count'] for day in aus_data if day['_id']}
other_by_date = {day['_id']: day['count'] for day in other_data if day['_id']}

# Use the union of dates so a day that appears in only one of the two result
# sets still gets a row (with 0 for the missing side). The keys are
# YYYY-MM-DD strings, so a plain sort is chronological.
all_dates = sorted(set(aus_by_date) | set(other_by_date))

# open() does not create missing parent directories.
os.makedirs(os.path.dirname(file_name), exist_ok=True)

with open(file_name, 'w') as f:
    # header
    f.write('Date,Tweets from Australia,Tweets from Other Countries\n')

    for date in all_dates:
        f.write('{},{},{}\n'.format(
            date,
            aus_by_date.get(date, 0),
            other_by_date.get(date, 0),
        ))

print(file_name, 'is ready.')

twitter-daily-collection.csv is ready.
