### Select language using one of the following codes:
- en (English, default option)
- de (German)
- es (Spanish)
- fr (French)
- pt (Portuguese)


In [1]:
# Language code for this run: it selects the keyword file, the tweet search
# language, the output folder and the batch size looked up in `language_ref`.
language = 'en'

In [2]:
# Per-language settings, keyed by language code:
#   name       - human-readable language name
#   batch_size - number of health tweets to gather before a batch is written
language_ref = {
    code: {'name': name, 'batch_size': batch_size}
    for code, name, batch_size in (
        ('en', 'English', 50000),
        ('de', 'German', 10000),
        ('es', 'Spanish', 50000),
        ('fr', 'French', 25000),
        ('pt', 'Portuguese', 10000),
    )
}

### Keywords

In [3]:
import json
import os
import re

# One keyword file per language: ../keywords/<language>.json
keywords_file = os.path.join(
    "..",
    "keywords",
    language + ".json",
)

# Placeholders for the two keyword lists (assigned once the file has been read)
# and the accumulator that normalise_keywords() appends compound terms to.
climate_dict, health_dict, compound_terms = [], [], []

def normalise_keywords(dict):
    """Lowercase the keywords in place and join multi-word keywords with '_'.

    Every keyword that contained a space is additionally recorded in the
    module-level `compound_terms` list as a tuple of its words.

    Args:
        dict: List of keyword strings (despite the name); modified in place.

    Returns:
        The same list object, now holding the normalised keywords.
    """
    for position, raw in enumerate(dict):
        lowered = raw.lower()
        joined = lowered.replace(' ', '_')
        if joined != lowered:
            # The keyword had at least one space, i.e. it is a compound term.
            compound_terms.append(tuple(joined.split('_')))
        dict[position] = joined
    return dict

# JSON text is UTF-8 encoded; decode it explicitly instead of relying on the
# platform default codec, so any non-ASCII keywords load the same everywhere.
with open(keywords_file, encoding="utf-8") as f:
    data = json.load(f)

# The file must provide a 'climate' and a 'health' keyword list; both are
# normalised in place (lowercased, spaces replaced by underscores).
climate_dict = normalise_keywords(data['climate'])
health_dict = normalise_keywords(data['health'])

In [4]:
# Display the normalised health keywords.
health_dict

['malaria',
 'diarrhoea',
 'infection',
 'disease',
 'sars',
 'measles',
 'pneumonia',
 'epidemic',
 'pandemic',
 'public_health',
 'healthcare',
 'epidemiology',
 'health_care',
 'health',
 'mortality',
 'morbidity',
 'nutrition',
 'illness',
 'infectious',
 'ncd',
 'non-communicable_disease',
 'noncommunicable_disease',
 'communicable_disease',
 'air_pollution',
 'nutrition',
 'malnutrition',
 'mental_disorder',
 'stunting']

In [5]:
# Display the normalised climate keywords.
climate_dict

['climate_change',
 'global_warming',
 'green_house',
 'temperature',
 'extreme_weather',
 'global_environmental_change',
 'climate_variability',
 'greenhouse',
 'low_carbon',
 'ghge',
 'renewable_energy',
 'carbon_emission',
 'co2_emission',
 'climate_pollutant']

In [6]:
# Per-language folder and file-name prefixes of the CSV tweet dumps.
tweets_folder = os.path.join("..", "data", "tweets", language)
tweets_health_filename_prefix = "tweets_health." + language + "."
tweets_climate_filename_prefix = "tweets_climate." + language + "."

# JSON file with the Twitter API keys, resolved relative to the working directory.
twitter_credentials = "twitter_credentials.json"

In [7]:
from TwitterSearch import *

# Load the Twitter API credentials; get_tweets() reads the consumer_key,
# consumer_secret, access_token and access_token_secret entries.
with open(twitter_credentials) as f:
    credentials = json.load(f)

def get_tweets(keywords, dump):
    """Search Twitter for tweets matching any of `keywords` and add them to `dump`.

    The search is restricted to the module-level `language` and authenticated
    with the module-level `credentials`.

    Args:
        keywords: Search terms, combined with OR.
        dump: Dict updated in place, mapping tweet id -> tweet as returned by
            the search.

    Returns:
        None; results are only reported through `dump`.

    A TwitterSearchException is printed and swallowed, so whatever was
    collected before the failure stays in `dump`.
    """
    try:
        search_order = TwitterSearchOrder()
        search_order.set_keywords(keywords, or_operator = True)
        search_order.set_language(language)
        search_order.set_include_entities(True)

        client = TwitterSearch(
            consumer_key = credentials["consumer_key"],
            consumer_secret = credentials["consumer_secret"],
            access_token = credentials["access_token"],
            access_token_secret = credentials["access_token_secret"]
        )

        # Keyed by tweet id, so a tweet seen again simply replaces its entry.
        for status in client.search_tweets_iterable(search_order):
            dump[status['id']] = status

    except TwitterSearchException as err:
        print(err)

In [8]:
import pandas as pd
import csv
import time

def store_tweets(tweets, filename):
    """Write the collected tweets to `filename` as CSV with a fixed set of columns.

    Args:
        tweets: Dict of tweet id -> tweet dict; only the values are written.
        filename: Path of the CSV file to create; an existing file is overwritten.

    Columns missing from a tweet are left empty. Tweet keys that are not listed
    in `fieldnames` are skipped rather than aborting the export with a
    ValueError part-way through the file. Non-string values are written in
    their str() form by the csv module.
    """
    fieldnames = ['created_at', 'id', 'id_str', 'text', 'truncated', 'entities', 'extended_entities',
                  'metadata', 'source', 'in_reply_to_status_id', 'in_reply_to_status_id_str',
                  'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user',
                  'geo', 'coordinates', 'place', 'contributors', 'retweeted_status', 'is_quote_status',
                  'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'possibly_sensitive', 'lang',
                  'quoted_status_id', 'quoted_status_id_str', 'quoted_status', 'withheld_in_countries']
    # newline='' hands newline handling to the csv module, as its documentation
    # requires; otherwise row terminators get translated a second time on
    # platforms whose line separator is not '\n'.
    # NOTE(review): the file is written with the platform default encoding,
    # which is also what load_tweets() uses to read it back.
    with open(filename, 'w', newline='') as csvfile:
        w = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')
        w.writeheader()
        w.writerows(tweets.values())

def load_tweets(filename):
    """Read a CSV tweet dump back into a dict keyed by tweet id.

    Args:
        filename: Path of the CSV file to read; it must have an 'id' column.

    Returns:
        Dict mapping the 'id' column (as read from the file, i.e. a string) to
        the row dict produced by csv.DictReader. When an id occurs more than
        once, the last row wins.
    """
    # newline='' hands newline handling to the csv module, as its documentation
    # requires, so line breaks inside quoted fields (e.g. multi-line tweet
    # text) are returned exactly as they were stored.
    with open(filename, newline='') as csvfile:
        return {row['id']: row for row in csv.DictReader(csvfile)}

In [None]:
import time

# Number of health tweets to gather before a batch is written to disk.
batch_threshold = language_ref[language]["batch_size"]
# Number of batches to collect.
iterations = 10

# Create the output folder up front: a missing folder would otherwise only be
# noticed when the first batch is stored, after the whole batch was collected.
os.makedirs(tweets_folder, exist_ok=True)

# range(iterations) so that exactly `iterations` batches are collected
# (range(1, iterations) stopped one batch short).
for _ in range(iterations):
    current_tweets_climate = {}
    current_tweets_health = {}
    # NOTE(review): the timestamp becomes part of the file names and contains
    # ':' and a space, which not every file system accepts.
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
    print(timestamp)

    tweets_climate_filename = os.path.join(tweets_folder, tweets_climate_filename_prefix + timestamp + ".csv")
    tweets_health_filename = os.path.join(tweets_folder, tweets_health_filename_prefix + timestamp + ".csv")

    # Only the health count decides when the batch is complete; climate tweets
    # are collected alongside on every poll.
    while len(current_tweets_health) < batch_threshold:
        get_tweets(climate_dict, current_tweets_climate)
        get_tweets(health_dict, current_tweets_health)
        # Wait 10 minutes between polls.
        time.sleep(600)

    store_tweets(current_tweets_climate, tweets_climate_filename)
    store_tweets(current_tweets_health, tweets_health_filename)
    print("Climate tweets: %d - Health tweets: %d" % (len(current_tweets_climate), len(current_tweets_health)))

In [None]:
from os import listdir

# All stored tweets for the current language, keyed by tweet id.
tweets_climate = {}
tweets_health = {}

# listdir() returns names in arbitrary order. Sorting loads each kind of file
# in timestamp order, so when the same tweet id appears in several batches the
# row from the most recent batch is the one that is kept.
files = sorted(listdir(tweets_folder))
for f in files:
    # Match on the start of the name only, so unrelated files that merely
    # contain the prefix somewhere in their name are not loaded.
    if f.startswith(tweets_health_filename_prefix):
        print("Loading %s" % f)
        tweets_health.update(load_tweets(os.path.join(tweets_folder, f)))
    if f.startswith(tweets_climate_filename_prefix):
        print("Loading %s" % f)
        tweets_climate.update(load_tweets(os.path.join(tweets_folder, f)))

In [None]:
# Totals across all loaded files; tweets are keyed by id, so a tweet present in
# several files is counted once.
print("Climate tweets: %d - Health tweets: %d" % (len(tweets_climate), len(tweets_health)))