In [1]:
import csv
import re
import time
import datetime
import os
import glob

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import configparser

# Load the MongoDB connection settings and the collection-name filter from
# config.ini (section DEFAULT: IP, MongoDB-Port, Contain-String).
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means config.ini was not loaded.
# Fail here with a clear message instead of a later "KeyError: 'IP'".
if not config.read('config.ini'):
    raise FileNotFoundError(
        "config.ini could not be read from " + os.getcwd()
    )
ip = config['DEFAULT']['IP']
port = config['DEFAULT']['MongoDB-Port']  # kept as a string; converted when connecting
contain_string = config['DEFAULT']['Contain-String']

from pymongo import MongoClient
client = MongoClient(ip, int(port))

# 1. connect to database

In [2]:
# Select the "Twitter" database and fetch the names of all its collections.
# list_collection_names() is the supported replacement for the deprecated
# collection_names() (removed in PyMongo 4); both return a list of names.
db_twitter = client["Twitter"]
collections_twitter = db_twitter.list_collection_names()

In [3]:
# Get the current year and week number (ISO-8601 calendar, evaluated in UTC).
#
# The week used to be counted from a fixed epoch, 1546214400000 ms
# (2018-12-31 00:00 UTC, the Monday that starts ISO week 1 of 2019). That is
# only a valid week-of-year during 2019; afterwards it keeps growing past 53.
# Deriving the ISO week from the same timestamp gives identical numbers
# throughout ISO year 2019 and stays within 1..53 in every later year.
# The year is the ISO year, so it always pairs correctly with the week
# (e.g. 2021-01-01 is week 53 of ISO year 2020).
# NOTE(review): assumes collection names follow ISO year/week numbering — confirm
# against whatever process creates the "<year>_W<week>_..." collections.
current_timestamp = int(time.time() * 1000)
_now_utc = datetime.datetime.fromtimestamp(
    current_timestamp / 1000, tz=datetime.timezone.utc
)
_iso_calendar = _now_utc.isocalendar()  # (ISO year, ISO week, ISO weekday)

current_year = int(_iso_calendar[0])
print("current year : " + str(current_year))

current_week = int(_iso_calendar[1])
print("current week : " + str(current_week))

current year : 2019
current week : 18


In [4]:
# List every finished weekly collection and the number of records it holds.
#
# A collection qualifies when its name starts with "20", contains
# `contain_string`, and is either from an earlier year or from a week number
# lower than the current week. The current-year check parses the week from the
# first "_..._" segment of the name with its leading character dropped
# (e.g. "2019_W1_Twitter_Australia" -> 1).
dic_collection = {}
for name in collections_twitter:
    if not (name.startswith("20") and contain_string in name):
        continue

    try:
        year = int(name[0:4])
    except ValueError:
        # Name starts with "20" but the first four characters are not a year.
        continue

    if year >= current_year:
        segment = re.search('_(.+?)_', name)
        try:
            week = int(segment.group(1)[1:]) if segment else None
        except ValueError:
            week = None
        # Skip names whose week cannot be parsed, and weeks still in progress.
        if week is None or week >= current_week:
            continue

    # count_documents({}) replaces the deprecated cursor.count(). Database errors
    # are no longer swallowed here, so a failed count is visible to the user.
    # Counts are stored as strings, as before.
    dic_collection[name] = str(db_twitter[name].count_documents({}))

for key in sorted(dic_collection):
    print("%s: %s" % (key, dic_collection[key]))

2019_W1_Twitter_Australia: 40880


# 2. create csv for each collection based on hashtag and user location

In [5]:
# write into csv file
def write_csv(file_name,hashtag,user_location):
    """Append one ``hashtag,user_location`` row to the file at *file_name*.

    Parameters:
        file_name: path of the CSV file; it is opened in append mode.
        hashtag: value for the first column; formatted with ``str.format``, so
            ``None`` is written as the text ``None``.
        user_location: value for the second column. When it is a string, commas
            are removed and line breaks are replaced by a space so the value
            cannot spill into another column or row. Any other value
            (including ``None``) is formatted as-is.

    The file is written as UTF-8 so non-ASCII hashtags and locations do not
    depend on the platform's default encoding.
    """
    if isinstance(user_location, str):
        # Commas are dropped (as before); line breaks would split the row.
        for unwanted, replacement in ((',', ''), ('\r\n', ' '), ('\r', ' '), ('\n', ' ')):
            user_location = user_location.replace(unwanted, replacement)
    row = "{},{}\n".format(hashtag,user_location)

    with open(file_name, 'a', encoding='utf-8') as f:
        f.write(row)

In [6]:
# report running time in 10-minute steps
def calculate_time(start_time, t):
    """Print a progress message each time another 10 minutes have elapsed.

    Parameters:
        start_time: ``time.time()`` value taken when the work started.
        t: number of minutes already reported.

    Returns:
        ``t + 10`` (after printing the message) when at least that many
        minutes have passed since *start_time*; otherwise *t* unchanged.
    """
    elapsed_minutes = (time.time() - start_time) / 60
    next_mark = t + 10
    if elapsed_minutes < next_mark:
        return t
    print("The program is still running, already run for about "+ str(next_mark) + " minutes.")
    return next_mark

In [7]:
# create folder if it does not exist
def create_folder(folder="output/hashtag_user_location/"):
    """Make sure the output folder exists and return its path.

    Parameters:
        folder: directory to create; defaults to
            ``"output/hashtag_user_location/"``.

    Returns:
        The *folder* path exactly as given.
    """
    # exist_ok=True avoids the check-then-create race and makes the call
    # safe to repeat when the directory is already there.
    os.makedirs(folder, exist_ok=True)
    return folder

In [8]:
# remove collections that already have a csv file from dic_collection
def delete_collection(folder,dic_collection):
    """Drop every collection that already has an exported CSV in *folder*.

    A file counts as an export when its name ends with
    ``_hashtag_user_location.csv``; the part before that suffix is taken as the
    collection name. Other ``*.csv`` files are ignored.

    Parameters:
        folder: directory that holds the exported CSV files.
        dic_collection: dict keyed by collection name; modified in place.

    Returns:
        The same *dic_collection* object, without the exported collections.
    """
    suffix = "_hashtag_user_location.csv"
    for input_file in glob.glob(os.path.join(folder,'*.csv')):
        # Use the basename so the result does not depend on the path
        # separator or on how the folder itself is named.
        base_name = os.path.basename(input_file)
        if not base_name.endswith(suffix):
            continue
        collection_name = base_name[:-len(suffix)]
        print("Existed collection: " + collection_name)
        # A CSV may exist for a collection that is not in the dict.
        dic_collection.pop(collection_name, None)
    return dic_collection

In [9]:
# Export every pending collection to its own CSV of (hashtag, user_location)
# rows: one row per hashtag of each document, or a single row with hashtag
# None when the document has no hashtags.
folder = create_folder()
dic_collection = delete_collection(folder,dic_collection)
for collection in sorted(dic_collection):

    print("-----------------------")
    print("processing on collection " + str(collection))
    start = time.time()
    t = 0

    file_name = folder + str(collection) + "_hashtag_user_location.csv"
    # Rows go to a ".part" file that is renamed only after the whole collection
    # has been written. An interrupted run therefore never leaves a truncated
    # "*.csv" behind that delete_collection() would treat as already exported.
    # Mode 'w' discards any stale partial file from an earlier attempt.
    partial_file = file_name + ".part"
    with open(partial_file, 'w') as f:
        f.write('hashtag,user_location\n')

    for document in db_twitter[collection].find():
        user_location = document['user']['location']
        hashtags = [entry['text'] for entry in document['entities']['hashtags']]

        # "or [None]" yields exactly one row for documents without hashtags.
        for hashtag in hashtags or [None]:
            write_csv(partial_file,hashtag,user_location)
            t = calculate_time(start, t)

    os.replace(partial_file, file_name)
    print("csv file for collection " + collection + " is done.")
    print("-----------------------")

Existed collection: 2019_W1_Twitter_Australia
