# Tweet pre-labeling pipeline 
#### Purpose is to find, label, and predict roads and muni routes/stops in tweets

In [1]:
# External library imports
from bs4 import BeautifulSoup # only needed to scrape street names and muni names
import pandas as pd
import numpy as np

# SpaCy imports
import spacy
from spacy.pipeline import EntityRuler
from spacy import displacy
from spacy.tokens import Span, Doc

# Standard imports
import requests # only needed to scrape street names and muni names
import os
from ast import literal_eval as lit # for decoding string literals

## Strategy and Stages to Train NER

1. Exact match road names and muni lines from external source to make match patterns
2. Format tweets that exactly match the patterns as training data for NER
3. Train NER using training data to get better context for matches to generalize

## Order of operations

1. Load standard NLP model
2. Use NLP model or other methods to clean up road and muni stop/route names
3. Make patterns from road and muni stop/route names
4. Add patterns to EntityRuler() (could use Matcher() pipeline for similar results)
5. Add EntityRuler() to NLP to exact-match patterns
6. Use NLP+ER to examine tweets
7. Extract training tuples from parsed tweets to train the NER properly
8. ...

## To Do List
1. [x] Make match patterns from street names  
2. [x] Make muni patterns from muni stops and lines to be used by `EntityRuler()`
2. [x] Make match pipeline to roughly label tweets
2. [x] Make training tuples for docs that match, including char_start_index, char_end_index, and entity label, to train NER.
4. [ ] Train NER to be more flexible than ridged match patterns
3. ....

URLs to get road names and muni stops:  
road names: https://geographic.org/streetview/usa/ca/san_francisco.html  
muni stops: https://www.sfmta.com/getting-around/muni/routes-stops
  
  
Other useful URLs:  
match patterns: https://spacy.io/usage/rule-based-matching#adding-patterns-attributes  
hashtags and emojis (could apply to mentions as well): https://spacy.io/usage/rule-based-matching#example3  
part-of-speech and other abbrieviations: https://spacy.io/api/annotation

### Load SpaCy NLP model

In [2]:
# load premade model
nlp = spacy.load('en_core_web_sm')

### Load or make patterns from webscraped list of street names - could be streamlined further

In [19]:
# Check if patterns already exist before scraping
if os.path.exists('road_patterns.txt'):
    with open('road_patterns.txt', 'r') as f:
        road_patterns = lit(f.read())
        
# Scrape street name website
else:
    street_url = 'https://geographic.org/streetview/usa/ca/san_francisco.html'
    response = requests.get(street_url)
    print('response:', response.status_code)
    
    # Parse for list items from list of streets
    soup = BeautifulSoup(response.text, 'lxml')
    lis = soup.findAll('li')
    # Remove San Francisco from matching
    lis = [li.text for li in lis if li.text[:6] != 'San Fr']
        
    # Instantiate list for patterns
    road_patterns = []
    # And characters used in matching
    characters = []
    
    # Make lists for coditional statements to test against
    # Accounting for 3 scenarios:
    # 1. Road name is a common word like "High St." or "D Ave."
    #    -> check against alphabet and common_word lists
    # 2. Road name contains weird characters
    #    -> check against list of weird characts, output a list of characters used allowed through
    # 3. Road deisgnator (ex: Street, St., Rd.) is not often used in text (make optional)
    #    -> make optional IF not part of a common_word road name.
    # 4. Road has unnecessary portion after a dash.
    #    -> break loop when (text == '-').
    
    common_words = ['high', 'day', 'still', 'signal', 'lane', 'guy',
                    'fell', 'auto', 'sacramento', 'service', 'clipper',
                    'francisco', 'green', 'cross', 'main', 'white',
                    'short', 'hill', 'front', 'park', 'head', 'bay', 'long']
    bad_chars = ['&', '/', '½', '¿', 'ï', ':']
    road_designators = ['rd', 'pl', 'st', 'ln', 'dr', 'ct', 'loop', 'way', 'cir', 'ave', 'blvd',
                        'road', 'place', 'street', 'drive', 'court', 'circle', 'avenue']
    alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 
               'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    
    for li in lis:
        # Instantiate pattern for each list item (street name)
        pattern = []
        # Instantiate booleans for bad characters test and,
        # boolean streets that require road designator (e.g. St., Ln., Ct., Dr., etc.)
        bad_char = False
        specific_street = False
        
        # Check for common word street names (Note that this method accounts for cases like
        # D Ave AND Avenue D, both of which exist in SF)
        for text in li.split(' '):
            text = text.lower()
            if (text in alphabet) or (text in common_words):
                specific_street = True
        
        # Add street names as a pattern, depending on bad_char, specific_street, and road_designator conditions
        for text in li.split(' '):
            text = text.lower()
            
            # skip blank tokens
            if text == '':
                continue
            
            # If character is a dash, ignore everything to the right of it
            if '-' in text:
                break
                
            # Check for bad characters
            for char in text:
                if char in bad_chars:
                    bad_char = True
                    # break inner loop checking chars
                    break
                elif char not in characters:
                    characters.append(char)
            # break outer loop adding text to pattern matching
            if bad_char:
                break
                
            # If street has a common name (e.g. "A st"), make street designator required
            if text in road_designators:
                if specific_street:
                    pattern.append({'LOWER': text})
                # If street doesn't have common name, make street designator optional
                else:
                    pattern.append({'LOWER': text, 'OP':'?'})
            # If part of road name passes all other tests, part is added to pattern matcher
            else:
                pattern.append({'LOWER': text})
            
        # Add each pattern to the list of patterns, prevent adding blanks
        # Format of dictionary is to meet requirements for EntityRuler pipeline
        if pattern:
            road_patterns.append({'label':'ROAD', 'pattern':pattern})
            
#     # Display change from original text to tokens in match pattern
#     for li, pattern_ in zip(lis[-20:-15], road_patterns[-20:-15]):
#         display(li.split(' '), pattern_)
    
    # Display all characters used in matches
    characters.sort()
    print('\nAll characters included in matching patterns, look out for weird symbols:')
    print(characters)
    
    # Get rid of duplicates
    road_patterns_legit = []
    for pattern in road_patterns:
        if pattern not in road_patterns_legit:
            road_patterns_legit.append(pattern)
    road_patterns = road_patterns_legit
    
    print('\nNumber of road patterns:', len(road_patterns))
    
    # Save patterns for later to avoid scraping over and over on reruns of code.
    with open('road_patterns.txt', 'w+') as f:
        f.write(str(road_patterns))

response: 200

All characters included in matching patterns, look out for weird symbols:
["'", '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

Number of road patterns: 2416


### Scrape for Muni lines

In [20]:
# Check if patterns already exist before scraping
if os.path.exists('muni_patterns.txt'):
    with open('muni_patterns.txt', 'r') as f:
        muni_patterns = lit(f.read())

# If not, scrape sfmta website for muni routes and stops
else:
    # go to muni website
    street_url = 'https://www.sfmta.com/getting-around/muni/routes-stops'
    response = requests.get(street_url)
    print('response:', response.status_code)
    
    # Parse for list items from list of muni lines
    soup = BeautifulSoup(response.text, 'lxml')
    lis = soup.findAll("span", {"class":"field-content"})

    # instantiate patterns list, each pattern is a list of dictionaries describing the match
    muni_patterns = []
    # inst. list for muni numbers, e.g. E, 1AX, KT, etc.

    # -5 index to leave out bad results from web scrape
    for li in lis[:-5]:
        # instantiate list of token match dictionaries
        muni_pattern = []
        # extract tokens
        doc = nlp(li.text)

        # optionally match inbound or outbound keywords
        muni_pattern.append({'LOWER':{'IN':['ob', 'outbound', 'ib', 'inbound']}, 'OP':'?'})
        # first part of muni name ('E Embarcadero') optional with this phrasing
        muni_pattern.append({'LOWER':doc[0].text.lower(), 'OP':'?'})
        
        # Pattern type A examples: 'Embarcadero', 'IB Fulton', 'Outbound 9 San Bruno'
        if len(doc) > 1:
            for token in doc[1:]:
                # Assign optional match patterns depending on part of speech
                if token.pos_ in ['CCONJ', 'CONJ', 'SYM', 'PUNCT']:
                    token_pattern = {'POS':token.pos_, 'OP':'?'}
                # If token doesn't fall into above part of speech options, add as a word (required pattern)
                else:
                    token_pattern = {'LOWER':token.text.lower()}

                # make list of token patterns
                muni_pattern.append(token_pattern)

        # Pattern type B examples: 'The K line', 'OB E line'
        busline_abbr_name = [
            {'POS':'DET', 'OP':'?'},         # conditional match "The", "A", "An"
            {'LOWER':{'IN':['ob', 'outbound', 'ib', 'inbound']}, 'OP':'?'},# inbound or outbound option
            {'LOWER':doc[0].text.lower()},   # match muni line abbriev.
            {'LOWER':{'IN':['line', 'bus']}} # match keyword line or bus
        ]
        # Pattern type C examples: 'The K', 'the OB E', 'a T bus', 'the D car'
        the_abbr_name = [
            {'LOWER':'the'},                                               # match The
            {'LOWER':{'IN':['ob', 'outbound', 'ib', 'inbound']}, 'OP':'?'},# inbound or outbound option
            {'LOWER':doc[0].text.lower()},                                 # match muni line abbriev
            {'LOWER':{'IN':['line', 'bus', 
                            'cable', 'car', 
                            'cablecar', 'train',
                            '']}, 'OP':'?'}                     # conditional match keyword line or bus
        ]

        # add to major list, formatting for the Entity
        for name_pattern in [busline_abbr_name, the_abbr_name]:
            muni_patterns.append({'label':'MUNI', 'pattern':name_pattern})
        muni_patterns.append({'label':'MUNI', 'pattern':muni_pattern})


    print('Number of muni patterns:', len(muni_patterns))
    # Save patterns for later to avoid scraping over and over on reruns of code.
    with open('muni_patterns.txt', 'w+') as f:
        f.write(str(muni_patterns))

In [21]:
web_patterns = []
web_patterns.append({'label':'HTAG','pattern':[{"ORTH": "#"}, {"IS_ASCII": True}]})

# mentions show up as one token: '@whoever', whereas hashtags parse to two tokens: '#', 'whatever'
web_patterns.append({'label':'MENTN','pattern':[{"TEXT":{"REGEX":"@.\S+"}}]})

In [22]:
# show head of muni_patterns and road_patterns
display(muni_patterns[:3])
print('-'*50)
display(road_patterns[:3])

[{'label': 'MUNI',
  'pattern': [{'POS': 'DET', 'OP': '?'},
   {'LOWER': 'e'},
   {'LOWER': {'IN': ['line', 'bus']}}]},
 {'label': 'MUNI',
  'pattern': [{'LOWER': 'the'},
   {'LOWER': {'IN': ['ob', 'outbound', 'ib', 'inbound']}, 'OP': '?'},
   {'LOWER': 'e'},
   {'LOWER': {'IN': ['line', 'bus']}, 'OP': '?'}]},
 {'label': 'MUNI',
  'pattern': [{'LOWER': {'IN': ['ob', 'outbound', 'ib', 'inbound']},
    'OP': '?'},
   {'LOWER': 'e', 'OP': '?'},
   {'LOWER': 'embarcadero'}]}]

--------------------------------------------------


[{'label': 'ROAD',
  'pattern': [{'LOWER': '10th'}, {'LOWER': 'ave', 'OP': '?'}]},
 {'label': 'ROAD', 'pattern': [{'LOWER': '10th'}, {'LOWER': 'st', 'OP': '?'}]},
 {'label': 'ROAD',
  'pattern': [{'LOWER': '11th'}, {'LOWER': 'ave', 'OP': '?'}]}]

In [7]:
# add road and muni patterns to NER
ruler = EntityRuler(nlp, patterns=[*road_patterns, *muni_patterns, *web_patterns], overwrite_ents=True)
nlp.add_pipe(ruler)

## Load tweet data

### Pull in tweets from .txt file

Load raw tweets into list, tweets are formatted as rows of byte-strings in .txt file.  
If loading tweets from api, will need to add a step to extract tweet text for tweets that meet XYZ criterea.

In [10]:
with open('munitweet.txt') as file:
    raw_corpus = file.readlines()
    # lit() interperets byte-strings, then they are decoded
    raw_corpus = [lit(x).decode('utf-8').replace('\n', '') for x in raw_corpus]

In [14]:
# Look at raw corpus, get an example tweet
corpus = []
print('Number of tweets with exact duplicates:',len(raw_corpus))
display(raw_corpus[200])

# Remove duplicates
for tweet in raw_corpus:
    if tweet not in corpus:
        corpus.append(tweet)
print('-'*50)

# Look again after removing dups
print('Number of tweets without exact duplicates:', len(corpus))
display(corpus[200])

Number of tweets with exact duplicates: 500


"Happy Thanksgiving! We're spending ours with some close friends. We hope you are with your tightest connections this Thanksgiving. @VTA @rideact @ACE_train @SFBART @sfmta_muni @Caltrain https://t.co/KziQOt9jmH"

--------------------------------------------------
Number of tweets without exact duplicates: 343


'Just noticed FOUR @sfmta_muni fire inspectors chatting up a storm at West Portal.  Don’t seem to be doing any work so what R they needed for?  More importantly, why 4?  If inspecting trains, numerous trains came and went w/o them getting on!'

## Running NER with exact match patterns
Note that the NER has not been trained, it's looking exactly for the match phrases using the EntityRuler(), and also matching any other pre-trained entities from the 'en_core_web_sm' model.

In [15]:
spacy_docs = list(nlp.pipe(corpus))

In [16]:
# Filter docs for symbols, stop words, punctuation, and urls
spacy_texts = [[token.text.lower() for token in doc 
                if not token.is_stop 
                and not token.is_punct 
                and not token.text.startswith('http')
                and not token.text.startswith('@')
                and not token.text.startswith('#')
                and not token.like_url
               ] 
               for doc in spacy_docs]

# Make docs from text. Check for empty lists of strings.
filtered_docs = [Doc(nlp.vocab, words=words) for words in spacy_texts if words]


In [17]:
filtered_docs[:10]

[sf muni l taraval san francisco ,
 like rate ride feature muni mobile app add option types feedback   example arrival predictions 24 line minutes whack tonight   wondering new metro cars jerky ,
 people live work san francisco proud supporting proposed standard zero emission buses ,
 35 min+ n embarcadero plenty double cars going unacceptable ,
 thing add angled parking city core anytime ’s street project ,
 like eta n 21 mins means 45 mins n ob special place hell muni leadership ,
 unpredictable shitty hate guys ,
 wait 4th&amp;king 20 mins train w/o marked route arrives people jump w/ assumption ’s n ob embarcadero muni says f u switching k. hop k waiting tracks n 10 mins ,
 sooooo njudah difficult track ’s early evening embarcadero station predicted wait time 28 min staff said screens incorrect ’s 10 min ’m waiting y’ crystal ballin ,
 pls send 31 ]

In [18]:
# Using DisplaCy to render entity matches, according to the nlp + added EntityRuler (exact match)
# Choose which docs to use - note: filtered_docs doesn't work at the moment
docs = spacy_docs
# docs = filtered_docs # Not working yet

try:
    num_tweets = int(input('How many tweets do you want to look at? '))
    num_start = int(input('Starting from? '))
    
#     [num_start:num_start+num_tweets]
    # options excludes viewing other entities, comment it out to see all entities.
    ents = ["ROAD", "MUNI", "HTAG", "MENTN"]
    colors = ["yellow", "lightblue", "green", "pink"]
    options = {"ents": ents, 
               "colors": dict(zip(ents, colors))}

    if options:
        for doc in docs[num_start:num_start+num_tweets]:
            displacy.render(doc, style="ent", options=options)
            print('-'*50)
    else:
        for doc in docs[num_start:num_start+num_tweets]:
            displacy.render(doc, style="ent")
            print('-'*50)
except:
    print('Error while displaying results, \nTry entering number as an integer')

How many tweets do you want to look at?  6
Starting from?  210


--------------------------------------------------


--------------------------------------------------


--------------------------------------------------


--------------------------------------------------


--------------------------------------------------


--------------------------------------------------


### Desired format for training data

```
TRAINING_DATA = [
    ("first_string_being_labelled", {"entities": [(start_char_index, end_char_index, "LABEL"), ...]}),
    ("second_string_being_labelled", {"entities": [(start_char_index, end_char_index, "LABEL"), ...]}),
    ...
]
```
Problems:  
* Need to get character start index for recognized entity, not token index
    * Solved! Use doc.to_json to output a json with all necessary information

In [17]:
# Note NLP pipeline needs to have EntityRuler in it at this point.
spacy_docs = nlp.pipe(corpus)
spacy_docs = list(spacy_docs)

In [18]:
TRAINING_DATA = []
for doc in spacy_docs:
    ents = [ent.label_ for ent in doc.ents]
    if 'MUNI' in ents:
        json_doc = doc.to_json()
        training_tuple = (json_doc['text'], {'entities': json_doc['ents']})
    TRAINING_DATA.append(training_tuple)

In [19]:
TRAINING_DATA[:3]

[("I'm at SF MUNI - L Taraval - @sfmta_muni in San Francisco, CA https://t.co/3N1JXENWs4",
  {'entities': [{'start': 7, 'end': 14, 'label': 'ORG'},
    {'start': 17, 'end': 26, 'label': 'MUNI'},
    {'start': 29, 'end': 40, 'label': 'MENTN'},
    {'start': 44, 'end': 57, 'label': 'GPE'}]}),
 ("I'm at SF MUNI - L Taraval - @sfmta_muni in San Francisco, CA https://t.co/3N1JXENWs4",
  {'entities': [{'start': 7, 'end': 14, 'label': 'ORG'},
    {'start': 17, 'end': 26, 'label': 'MUNI'},
    {'start': 29, 'end': 40, 'label': 'MENTN'},
    {'start': 44, 'end': 57, 'label': 'GPE'}]}),
 ('@sfmta_muni I like the Rate My Ride feature in the Muni Mobile app, but can you add an option to give other types of feedback?  For example, arrival predictions on the 24 line are several minutes out of whack tonight.  Or wondering if the new Metro cars will always be so jerky?',
  {'entities': [{'start': 0, 'end': 11, 'label': 'MENTN'},
    {'start': 164, 'end': 175, 'label': 'MUNI'},
    {'start': 180, 'end'