In [1]:
import csv
import re
import time
import datetime
import os

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import pandas as pd
from openpyxl import load_workbook

import configparser
from pymongo import MongoClient

# Connection settings are read from 'config.ini' (path is relative to the
# current working directory); both values come from its DEFAULT section.
config = configparser.ConfigParser()
config.read('config.ini')
default_section = config['DEFAULT']
ip = default_section['IP']
port = default_section['MongoDB-Port']  # kept as the raw string from the file

# The port is converted to an integer only at the point of connecting.
client = MongoClient(ip, int(port))

In [2]:
# Handle to the "Twitter" database on the connected MongoDB server.
db_twitter = client["Twitter"]

# Names of every collection in that database.
# list_collection_names() is the supported replacement for
# collection_names(), which is deprecated since PyMongo 3.7 and removed in 4.0.
collections_twitter = db_twitter.list_collection_names()

In [3]:
# "Now" as Unix-epoch milliseconds, plus the calendar year of the local clock.
current_timestamp = int(time.time() * 1000)
current_year = datetime.datetime.now().year
print("current year : " + str(current_year))

# Week number = whole weeks elapsed since the anchor instant, counted from 1.
# 1546214400000 ms is 2018-12-31 00:00:00 UTC (a Monday).
# NOTE(review): the anchor is hard-coded, so the result only lines up with
# week-of-year numbering during 2019 -- confirm how collections are named in
# other years before relying on this value.
week_anchor_ms = 1546214400000
seconds_per_week = 604800
elapsed_seconds = (current_timestamp - week_anchor_ms) / 1000
current_week = int(elapsed_seconds / seconds_per_week) + 1
print("current week : " + str(current_week))

current year : 2019
current week : 12


In [6]:
# Build {collection name: formatted document count} for the collections of the
# current year whose week number is the current week or the one before it.
#
# Names are expected to look like "<year>_<letter><week>_<suffix>", e.g.
# "2019_W11_Twitter_Australia"; anything else is skipped instead of raising.
collection_name_re = re.compile(r'^(\d{4})_\D(\d+)_')

dic_collection = {}
for name in collections_twitter:
    if not name.startswith("20"):
        continue

    parsed = collection_name_re.match(name)
    if parsed is None:
        # Starts with "20" but does not follow the year/week naming scheme.
        continue

    year, week = int(parsed.group(1)), int(parsed.group(2))
    if year != current_year or week < current_week - 1:
        continue

    try:
        # estimated_document_count() is the supported replacement for the
        # deprecated find({}).count() when no filter is applied (PyMongo 3.7+).
        doc_count = db_twitter[name].estimated_document_count()
    except Exception as exc:
        # Best effort: report the failure and carry on with the other
        # collections rather than hiding it silently.
        print("Skipping %s: could not count documents (%s)" % (name, exc))
        continue

    # Stored as a string with thousands separators because it is only displayed.
    dic_collection[name] = "{:,}".format(doc_count)

for key in sorted(dic_collection):
    print("%s: %s" % (key, dic_collection[key]))

2019_W11_Twitter_Australia: 68,593
2019_W11_Twitter_Other: 25,345,811
2019_W12_Twitter_Australia: 2,110
2019_W12_Twitter_Other: 597,613


# Extract a hashtag histogram from individual collections

## 1: Iterate over all records that have hashtags

In [None]:
# Aggregation pipeline, built from three named stages.

# Keep only documents that have a non-empty "entities.hashtags" array.
has_hashtags_stage = {
    "$match": {"entities.hashtags": {"$exists": True, "$ne": []}}
}

# Keep only documents whose "lang" field is "en".
english_only_stage = {"$match": {"lang": "en"}}

# Group documents by their whole "entities.hashtags" array and count the
# documents in each group; the consumer reads "_id" (the array) and "count".
group_by_hashtags_stage = {
    "$group": {
        "_id": "$entities.hashtags",
        "count": {"$sum": 1},
    }
}

pipeline = [has_hashtags_stage, english_only_stage, group_by_hashtags_stage]

In [None]:
csv_columns = ['hashtag','count']

# Hashtags are kept only when they consist solely of ASCII letters and digits.
# fullmatch() with "+" also rejects empty strings and a trailing newline,
# either of which would produce a malformed CSV row.
ascii_hashtag_re = re.compile(r"[a-zA-Z0-9]+")


def count_hashtags(grouped_docs):
    """Fold grouped aggregation results into a {hashtag: count} dict.

    Parameters
    ----------
    grouped_docs : iterable of dict
        Each item must provide "_id" (an iterable of hashtag entities, each
        with a "text" key) and "count" (how many documents shared that array).

    Returns
    -------
    dict
        Lower-cased hashtag text mapped to the sum of the "count" values of
        every group it appears in. A hashtag that occurs more than once inside
        the same array is counted once per occurrence.
    """
    counts = {}
    for doc in grouped_docs:
        for tag in doc["_id"]:
            text = tag["text"].lower()
            if ascii_hashtag_re.fullmatch(text):
                counts[text] = counts.get(text, 0) + doc["count"]
    return counts


def write_histogram_csv(path, counts):
    """Write a {hashtag: count} dict to `path` as CSV with a header row.

    Every row, including the header, goes through the csv writer so the file
    has consistent line endings; newline='' is what the csv module requires
    of the file object. Raises IOError/OSError if the file cannot be written.
    """
    with open(path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(csv_columns)
        writer.writerows(counts.items())


# Create the output folder if it does not exist yet.
folder = "output/hashtag_histogram/"
os.makedirs(folder, exist_ok=True)

for collection in sorted(dic_collection):
    print("-------------------\n")
    print("Processing on collection: " + collection)

    # Consume the aggregation cursor directly instead of copying every result
    # into a list first; only the running totals need to stay in memory.
    hashtag_list = count_hashtags(
        db_twitter[collection].aggregate(pipeline, allowDiskUse=True)
    )
    print("hashtag_list is finished")

    # NOTE(review): the braces are literal characters in the file name
    # ("{<collection>}_hashtag_histogram.csv"). Kept unchanged because other
    # code may open the files under this exact name -- confirm before renaming.
    csv_file = "output/hashtag_histogram/{" + collection + "}_hashtag_histogram.csv"
    try:
        write_histogram_csv(csv_file, hashtag_list)
        print("csvfile for collection " + collection + " is done.")
    except IOError as exc:
        # Report which file failed and why, then continue with the next one.
        print("I/O error while writing %s: %s" % (csv_file, exc))
    print("-------------------\n")
        