In [1]:
%matplotlib inline
import pandas as pd
import os
import json
import pyap
import matplotlib.pyplot as plt
import seaborn as sns

from elasticsearch import (
    Elasticsearch,
    helpers
)
from datetime import datetime
from io import StringIO
from configparser import ConfigParser
from sqlalchemy import create_engine


data_dir = os.path.join(os.pardir,'data')
config_file = os.path.join(os.pardir,'config','config.ini')

def get_ini_vals(ini_file, section):
    config = ConfigParser()
    config.read(ini_file)
    return config[section]

es_creds = get_ini_vals(config_file, 'elasticsearch')
mysql_creds = get_ini_vals(config_file, 'mysql')

### Connect to ES


Newest indices are formatted like

`<issue_name>-YY.MM.DD`

In [3]:
es = Elasticsearch(
    [es_creds['host']],
    http_auth=('',''),
    port = es_creds['port'],
    use_ssl=False
)
print(es.info())

# print all indices
indices=es.indices.get_alias().keys()
print(indices)

{'name': 'pvnM2mO', 'cluster_uuid': 'lf55i5JCSBq8nk_ZrTyTKQ', 'cluster_name': 'elasticsearch', 'tagline': 'You Know, for Search', 'version': {'number': '5.2.1', 'lucene_version': '6.4.1', 'build_date': '2017-02-09T22:05:32.386Z', 'build_snapshot': False, 'build_hash': 'db0d481'}}
dict_keys(['civil_right-17.03.18', 'type', 'n-17.03.18', '17.03.09', 'twitter-ross', 'shakespeare', '17.03.12', 'healthcare-17.03.20', '17.03.14', 'civil_right-17.03.19', '.kibana', '17.03.07', 's-17.03.08', '17.03.06', '17.03.13', 'healthcare-17.03.19', 's-17.03.09', '17.03.15', '17.03.17', '17.03.16', 'civil_right-17.03.20', '17.03.08', '17.03.10', '17.03.11', 'healthcare-17.03.18', 's-17.03.10', 'twitter'])


### Connect To Rszt MySQL

In [4]:
# connect to openatrium database
engine = create_engine(
    """mysql+pymysql://{user}:{password}@{host}:{port}/{db}"""
    .format(user=mysql_creds['user'],
            password=mysql_creds['password'],
            host=mysql_creds['host'],
            port=mysql_creds['port'],
            db=mysql_creds['database']
            )
                      )

conn = engine.connect()

# table fields
rzst_event_header =[
    'elasticsearch_id',
    'title',
    'body_value',
    'location_text',
    'event_datetime_from',
    'event_datetime_to',
    'score',
    'insert_dt',
    'event_type',
    'pri_action_type',
    'sec_action_type'
]

# need to create a flow that parses out data we need first...
insert_vals= dict.fromkeys(rzst_event_header, "")

### Run Set of queries, plot scores of results


We have two main indices in Elasticsearch that are issue-specific

In [49]:
issues = ['civil_right', 'healthcare']
actions = {'charity': ["donat*",
                       "give",
                       "donate",
                       "give support",
                       "financial support"
                      ],
           'protest': ['protest', 
                       'march'
                      ],
           'petition': ['petition', 
                        'sign', 
                        'call'
                       ],
           'gathering': ["meetup",
                         "huddle",
                         "congregate",
                         "join us"
                        ],
           'boycott': ["boycott"],
           'advocate': ["call",
                        "email",
                        "reach out",
                        "senator",
                        "representative",
                        "sign petition",
                        "petition"
                       ],
           'vote': ["vote",
                    "cast your ballot"],
           'townhall': ["town hall",
                        "open office",
                        "town meeting",
                        "townhall",
                        'virtual townhall']
    
          }

In [13]:
# get number of results for one query
format = "%y.%m.%d"
index_ = issues[0] + "-"+datetime.now().date().strftime(format)
print(index_)

civil_right-17.03.19


In [50]:
# form query for one action
query = 'text: "' +  '" OR "'.join(actions['gathering']) + '"'
print(query)

text: "meetup" OR "huddle" OR "congregate" OR "join us"


In [51]:
results = es.search(index=index_,
           q=query,
           size=10000,    # have to manually set this, default is 10
           request_timeout=30)

In [52]:
import json

n_results = len(results['hits']['hits'])
for hit in results['hits']['hits']:
    print(hit['_score'], hit['_source'], hit['_id'])
    print("TEXT: %s \n" % hit['_source']['message'])
    print("USER: %s \n" % hit['_source']['user'])
    print("TIMESTAMP: %s \n" % hit['_source']['@timestamp'])

8.083777 {'urls': ['http://goo.gl/XGy51p', ''], '@timestamp': '2017-03-19T18:58:12.000Z', '@version': '1', 'tags': ['civil right'], 'source': 'http://twitter.com/exa_lemon/status/843537105544515584', 'symbols': [], 'user': 'exa_lemon', 'hashtags': [], 'client': '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>', 'message': 'RT @jennymizkz: Local horny women are waiting to meet you online! dont wait, join us 100% Free right now:  https://t.co/0AALAfd2lo… https://t.co/tNbWadVlrp', 'type': 'civil_right', 'user_mentions': [{'id_str': '784695352767250432', 'indices': [3, 14], 'id': 784695352767250432, 'name': 'jenny paek', 'screen_name': 'jennymizkz'}], 'retweeted': False} AVrn87eUMNAj8foZ-HYB
TEXT: RT @jennymizkz: Local horny women are waiting to meet you online! dont wait, join us 100% Free right now:  https://t.co/0AALAfd2lo… https://t.co/tNbWadVlrp 

USER: exa_lemon 

TIMESTAMP: 2017-03-19T18:58:12.000Z 

7.7183886 {'urls': ['http://goo.gl/jkasVh', ''], '@timestamp': '

In [45]:
n_results

0

In [55]:
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.11.2-py2.py3-none-any.whl (46kB)
[K    100% |████████████████████████████████| 51kB 1.8MB/s ta 0:00:01
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.11.2


In [77]:
import boto3
import csv
import os

from tqdm import tqdm


bucket = "mids-capstone-rzst"
session = boto3.Session(profile_name="berkeley")
s3 = session.client("s3", "us-west-2")

class TwitterQueryAction(object):
    def __init__(self, issue_key):
        self.issue = issue_key
        self.firstline = True
        self.outfile = self.issue + ".csv"
        self.buffer = open(self.outfile, "w")
        self.key_prefix = "/".join(["es_queries", datetime.now().date().isoformat()])
        self.line_count = 0
        
        
    @property
    def query(self):
        return 'message: "' +  '" OR "'.join(actions[self.action]) + '"'
    
    
    @property
    def s3_loc(self):
        return "/".join([self.key_prefix, self.outfile])
    
    
    def _write_tweets(self, results):
        if self.firstline:
            self.firstline = False
            fieldnames = ['issue',
                          'action',
                          'id',
                          'es_score',
                          'tweet_timestamp',
                          'query_timestamp',
                          'tweet_user',
                          'tweet'
                         ]
            self.writer = csv.DictWriter(self.buffer, 
                                        fieldnames=fieldnames
                                       )
            self.writer.writeheader()
        n_results = len(results['hits']['hits'])
        if n_results > 0:
            print("Writing %s results.\n" % n_results)
            for result in tqdm(results['hits']['hits']):
                row = {'issue': self.issue,
                       'action': self.action,
                       'id': result['_id'],
                       'es_score': result['_score'],
                       'tweet_timestamp': result['_source']['@timestamp'],
                       'query_timestamp': self.querytimestamp,
                       'tweet_user': result['_source']['user'],
                       'tweet': result['_source']['message']
                      }
                
                # skip retweets
                if row['tweet'][:2] == "RT":
                    continue
                self.writer.writerow(row)
                self.line_count += 1
        else:
            print("No results to save!")
            return None
        
    
    def run(self, action_key):
        self.action = action_key
        indices=[index for index in es.indices.get_alias().keys() if self.issue in index]
        self.querytimestamp = datetime.now().isoformat()
        for index_ in indices:
            print("Querying index %s" % index_)
            print(self.query)
            results = es.search(index=index_,
                                q=self.query,
                                size=10000,    # have to manually set this, default is 10
                                request_timeout=30)
            self._write_tweets(results)
        
    def stop(self):
        self.buffer.close()
        print("Uploading saved {line_count} results to s3://{bucket}/{s3_loc}".format(
                line_count = self.line_count,
                bucket = bucket,
                s3_loc = self.s3_loc
                )
             )
        s3.upload_file(self.outfile, bucket, "/".join([self.s3_loc
                                                      ])
                      )
        os.remove(self.outfile)
    

In [78]:
for issue in issues:
    tq = TwitterQueryAction(issue)

    for action in actions.keys():
        tq.run(action)
    tq.stop()

print("Done!")

100%|██████████| 1/1 [00:00<00:00, 2387.20it/s]
100%|██████████| 57/57 [00:00<00:00, 52006.81it/s]
100%|██████████| 17/17 [00:00<00:00, 22635.93it/s]

Querying index civil_right-17.03.18
message: "donat*" OR "give" OR "donate" OR "give support" OR "financial support"
Writing 1 results.

Querying index civil_right-17.03.19
message: "donat*" OR "give" OR "donate" OR "give support" OR "financial support"
Writing 57 results.

Querying index civil_right-17.03.20
message: "donat*" OR "give" OR "donate" OR "give support" OR "financial support"
Writing 17 results.

Querying index civil_right-17.03.18
message: "call" OR "email" OR "reach out" OR "senator" OR "representative" OR "sign petition" OR "petition"
No results to save!
Querying index civil_right-17.03.19
message: "call" OR "email" OR "reach out" OR "senator" OR "representative" OR "sign petition" OR "petition"



100%|██████████| 1585/1585 [00:00<00:00, 381103.64it/s]
100%|██████████| 136/136 [00:00<00:00, 223258.45it/s]
100%|██████████| 7/7 [00:00<00:00, 38887.59it/s]

Writing 1585 results.

Querying index civil_right-17.03.20
message: "call" OR "email" OR "reach out" OR "senator" OR "representative" OR "sign petition" OR "petition"
Writing 136 results.

Querying index civil_right-17.03.18
message: "town hall" OR "open office" OR "town meeting" OR "townhall" OR "virtual townhall"
No results to save!
Querying index civil_right-17.03.19
message: "town hall" OR "open office" OR "town meeting" OR "townhall" OR "virtual townhall"
No results to save!
Querying index civil_right-17.03.20
message: "town hall" OR "open office" OR "town meeting" OR "townhall" OR "virtual townhall"
No results to save!
Querying index civil_right-17.03.18
message: "meetup" OR "huddle" OR "congregate" OR "join us"
No results to save!
Querying index civil_right-17.03.19
message: "meetup" OR "huddle" OR "congregate" OR "join us"
Writing 7 results.




100%|██████████| 2/2 [00:00<00:00, 7250.31it/s]


Querying index civil_right-17.03.20
message: "meetup" OR "huddle" OR "congregate" OR "join us"
Writing 2 results.

Querying index civil_right-17.03.18
message: "petition" OR "sign" OR "call"
No results to save!
Querying index civil_right-17.03.19
message: "petition" OR "sign" OR "call"


100%|██████████| 1588/1588 [00:00<00:00, 341496.86it/s]
100%|██████████| 133/133 [00:00<00:00, 217652.14it/s]
100%|██████████| 1/1 [00:00<00:00, 4064.25it/s]
100%|██████████| 65/65 [00:00<00:00, 65191.24it/s]

Writing 1588 results.

Querying index civil_right-17.03.20
message: "petition" OR "sign" OR "call"
Writing 133 results.

Querying index civil_right-17.03.18
message: "vote" OR "cast your ballot"
Writing 1 results.

Querying index civil_right-17.03.19
message: "vote" OR "cast your ballot"
Writing 65 results.

Querying index civil_right-17.03.20
message: "vote" OR "cast your ballot"



100%|██████████| 26/26 [00:00<00:00, 86686.73it/s]
100%|██████████| 1/1 [00:00<00:00, 7570.95it/s]
100%|██████████| 108/108 [00:00<00:00, 101794.34it/s]
100%|██████████| 17/17 [00:00<00:00, 69632.00it/s]

Writing 26 results.

Querying index civil_right-17.03.18
message: "protest" OR "march"
Writing 1 results.

Querying index civil_right-17.03.19
message: "protest" OR "march"
Writing 108 results.

Querying index civil_right-17.03.20
message: "protest" OR "march"
Writing 17 results.

Querying index civil_right-17.03.18
message: "boycott"
No results to save!
Querying index civil_right-17.03.19
message: "boycott"



100%|██████████| 1/1 [00:00<00:00, 4258.18it/s]


Writing 1 results.

Querying index civil_right-17.03.20
message: "boycott"
No results to save!
Uploading saved 206 results to s3://mids-capstone-rzst/es_queries/2017-03-19/civil_right.csv


100%|██████████| 269/269 [00:00<00:00, 56785.33it/s]

Querying index healthcare-17.03.20
message: "donat*" OR "give" OR "donate" OR "give support" OR "financial support"
Writing 269 results.

Querying index healthcare-17.03.19
message: "donat*" OR "give" OR "donate" OR "give support" OR "financial support"



100%|██████████| 1292/1292 [00:00<00:00, 52911.54it/s]

Writing 1292 results.

Querying index healthcare-17.03.18
message: "donat*" OR "give" OR "donate" OR "give support" OR "financial support"
Writing 3 results.



100%|██████████| 3/3 [00:00<00:00, 15534.46it/s]
100%|██████████| 812/812 [00:00<00:00, 57344.97it/s]



Querying index healthcare-17.03.20
message: "call" OR "email" OR "reach out" OR "senator" OR "representative" OR "sign petition" OR "petition"
Writing 812 results.

Querying index healthcare-17.03.19
message: "call" OR "email" OR "reach out" OR "senator" OR "representative" OR "sign petition" OR "petition"


100%|██████████| 3720/3720 [00:00<00:00, 53753.49it/s]
100%|██████████| 9/9 [00:00<00:00, 20262.34it/s]
100%|██████████| 57/57 [00:00<00:00, 43642.81it/s]

Writing 3720 results.

Querying index healthcare-17.03.18
message: "call" OR "email" OR "reach out" OR "senator" OR "representative" OR "sign petition" OR "petition"
Writing 9 results.

Querying index healthcare-17.03.20
message: "town hall" OR "open office" OR "town meeting" OR "townhall" OR "virtual townhall"
Writing 57 results.

Querying index healthcare-17.03.19
message: "town hall" OR "open office" OR "town meeting" OR "townhall" OR "virtual townhall"



100%|██████████| 495/495 [00:00<00:00, 53838.67it/s]
100%|██████████| 3/3 [00:00<00:00, 10951.19it/s]
100%|██████████| 12/12 [00:00<00:00, 15563.28it/s]
100%|██████████| 50/50 [00:00<00:00, 39338.81it/s]
100%|██████████| 458/458 [00:00<00:00, 53818.32it/s]

Writing 495 results.

Querying index healthcare-17.03.18
message: "town hall" OR "open office" OR "town meeting" OR "townhall" OR "virtual townhall"
Writing 3 results.

Querying index healthcare-17.03.20
message: "meetup" OR "huddle" OR "congregate" OR "join us"
Writing 12 results.

Querying index healthcare-17.03.19
message: "meetup" OR "huddle" OR "congregate" OR "join us"
Writing 50 results.

Querying index healthcare-17.03.18
message: "meetup" OR "huddle" OR "congregate" OR "join us"
No results to save!
Querying index healthcare-17.03.20
message: "petition" OR "sign" OR "call"
Writing 458 results.

Querying index healthcare-17.03.19
message: "petition" OR "sign" OR "call"



100%|██████████| 2519/2519 [00:00<00:00, 55203.78it/s]
100%|██████████| 10/10 [00:00<00:00, 36503.95it/s]
100%|██████████| 797/797 [00:00<00:00, 52942.78it/s]

Writing 2519 results.

Querying index healthcare-17.03.18
message: "petition" OR "sign" OR "call"
Writing 10 results.

Querying index healthcare-17.03.20
message: "vote" OR "cast your ballot"
Writing 797 results.

Querying index healthcare-17.03.19
message: "vote" OR "cast your ballot"



100%|██████████| 2057/2057 [00:00<00:00, 54668.22it/s]
100%|██████████| 10/10 [00:00<00:00, 25763.54it/s]
100%|██████████| 86/86 [00:00<00:00, 47486.85it/s]
100%|██████████| 282/282 [00:00<00:00, 50866.29it/s]

Writing 2057 results.

Querying index healthcare-17.03.18
message: "vote" OR "cast your ballot"
Writing 10 results.

Querying index healthcare-17.03.20
message: "protest" OR "march"
Writing 86 results.

Querying index healthcare-17.03.19
message: "protest" OR "march"
Writing 282 results.

Querying index healthcare-17.03.18
message: "protest" OR "march"



100%|██████████| 4/4 [00:00<00:00, 13888.42it/s]
100%|██████████| 4/4 [00:00<00:00, 21509.25it/s]


No results to save!
Querying index healthcare-17.03.20
message: "boycott"
Writing 4 results.

Querying index healthcare-17.03.19
message: "boycott"
Writing 4 results.

Querying index healthcare-17.03.18
message: "boycott"
No results to save!
Uploading saved 12908 results to s3://mids-capstone-rzst/es_queries/2017-03-19/healthcare.csv
Done!
