In [1]:
%matplotlib inline

# Imports

In [2]:
from __future__ import print_function
import json
import nltk
import matplotlib

# Load a test tweet

In [3]:
text = u"Whaaaa.... Metro's shiny new silver cars have seats with lumbar support. This is thrilling. @wmata thanks for having my back"

# Get the sentences

In [4]:
for sent in nltk.sent_tokenize(text): 
    print(sent)
    print()

Whaaaa.... Metro's shiny new silver cars have seats with lumbar support.

This is thrilling.

@wmata thanks for having my back



# Get the words

In [5]:
for sent in nltk.sent_tokenize(text):
    print(list(nltk.wordpunct_tokenize(sent)))
    print()

[u'Whaaaa', u'....', u'Metro', u"'", u's', u'shiny', u'new', u'silver', u'cars', u'have', u'seats', u'with', u'lumbar', u'support', u'.']

[u'This', u'is', u'thrilling', u'.']

[u'@', u'wmata', u'thanks', u'for', u'having', u'my', u'back']



# Get the parts of speech

In [6]:
for sent in nltk.sent_tokenize(text):
    print(list(nltk.pos_tag(nltk.word_tokenize(sent))))
    print()

[(u'Whaaaa', 'NNP'), (u'...', ':'), (u'.', '.'), (u'Metro', 'NNP'), (u"'s", 'POS'), (u'shiny', 'JJ'), (u'new', 'JJ'), (u'silver', 'NN'), (u'cars', 'NNS'), (u'have', 'VBP'), (u'seats', 'NNS'), (u'with', 'IN'), (u'lumbar', 'JJ'), (u'support', 'NN'), (u'.', '.')]

[(u'This', 'DT'), (u'is', 'VBZ'), (u'thrilling', 'VBG'), (u'.', '.')]

[(u'@', 'NN'), (u'wmata', 'NNS'), (u'thanks', 'NNS'), (u'for', 'IN'), (u'having', 'VBG'), (u'my', 'PRP$'), (u'back', 'NN')]



In [7]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

text = list(nltk.word_tokenize(text))

snowball = SnowballStemmer('english')
lancaster = LancasterStemmer()
porter = PorterStemmer()

for stemmer in (snowball, lancaster, porter):
    stemmed_text = [stemmer.stem(t) for t in text]
    print(" ".join(stemmed_text))

whaaaa ... . metro 's shini new silver car have seat with lumbar support . this is thrill . @ wmata thank for have my back
whaaa ... . metro 's shiny new silv car hav seat with lumb support . thi is thrilling . @ wmat thank for hav my back
Whaaaa ... . Metro 's shini new silver car have seat with lumbar support . Thi is thrill . @ wmata thank for have my back


In [8]:
from nltk.stem.wordnet import WordNetLemmatizer

# Note: use part of speech tag, we'll see this in machine learning! 
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(t) for t in text]
print(" ".join(lemmas))

Whaaaa ... . Metro 's shiny new silver car have seat with lumbar support . This is thrilling . @ wmata thanks for having my back


In [9]:
print(nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize("Whaaaa ... . Metro 's shiny new silver car have seat with lumbar support . This is thrilling . @ wmata thanks for having my back"))))

(S
  (GPE Whaaaa/NNP)
  .../:
  ./.
  (PERSON Metro/NNP)
  's/POS
  shiny/JJ
  new/JJ
  silver/NN
  car/NN
  have/VBP
  seat/VBN
  with/IN
  lumbar/JJ
  support/NN
  ./.
  This/DT
  is/VBZ
  thrilling/VBG
  ./.
  @/JJ
  wmata/JJ
  thanks/NNS
  for/IN
  having/VBG
  my/PRP$
  back/NN)


# Load all the tweets

In [10]:
anansi = []
PATH   = "data/stream_wmata.json"
with open(PATH, 'r') as data_file:
    for line in data_file:
        tweet = json.loads(line)
        anansi.append((tweet['text'])) # Just want the text field for now

In [11]:
sample = anansi[0:100]
print(sample)

[u"Whaaaa.... Metro's shiny new silver cars have seats with lumbar support. This is thrilling. @wmata thanks for having my back", u'RT @unsuckdcmetro: Weihle-Reston station this morning. Operators are doing this. #wmata https://t.co/QASKjIaZIP', u'crazy how i know the MTA better than WMATA.', u"RT @TriathlonBadBoy: I'm dressed &amp; ready to take WMATA train this afternoon into DC @FixWMATA @Pat_Lewis360 @DeguelloBBQ #wmata #dcmetro ht\u2026", u'crazy how i know the @MTA better than @wmata\U0001f643', u'Yes, he solved it fast. MT @SRRas: @wmata Kudos to driver of train with medical emergency this a.m. professional and courteous throughout.', u'Police Week is coming. Saw police officers with black striped shields last night. https://t.co/NiCdr1wGvX', u"Ever wonder why Metro platform displays (PIDs) do the weird things they do? After today's meeting with #WMATA I know a lot more about them.", u'DC Metro is TOO expensive for me to be paying all this money for escalators that never work or

In [13]:
tagged_corpus = []
for text in sample:
    temp = []
    for sent in nltk.sent_tokenize(text):
        temp.append(list(nltk.pos_tag(nltk.word_tokenize(sent))))
    tagged_corpus.append(temp)

In [14]:
print(tagged_corpus[0])

[[(u'Whaaaa', 'NNP'), (u'...', ':'), (u'.', '.'), (u'Metro', 'NNP'), (u"'s", 'POS'), (u'shiny', 'JJ'), (u'new', 'JJ'), (u'silver', 'NN'), (u'cars', 'NNS'), (u'have', 'VBP'), (u'seats', 'NNS'), (u'with', 'IN'), (u'lumbar', 'JJ'), (u'support', 'NN'), (u'.', '.')], [(u'This', 'DT'), (u'is', 'VBZ'), (u'thrilling', 'VBG'), (u'.', '.')], [(u'@', 'NN'), (u'wmata', 'NNS'), (u'thanks', 'NNS'), (u'for', 'IN'), (u'having', 'VBG'), (u'my', 'PRP$'), (u'back', 'NN')]]


# Add emoticon preprocessing with regex

In [15]:
import re
 
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

In [16]:
with open(PATH, 'r') as f:
    for line in f:
        tweet = json.loads(line)
        tokens = preprocess(tweet['text'])
        print(tokens)

[u'Whaaaa', u'.', u'.', u'.', u'.', u"Metro's", u'shiny', u'new', u'silver', u'cars', u'have', u'seats', u'with', u'lumbar', u'support', u'.', u'This', u'is', u'thrilling', u'.', u'@wmata', u'thanks', u'for', u'having', u'my', u'back']
[u'RT', u'@unsuckdcmetro', u':', u'Weihle-Reston', u'station', u'this', u'morning', u'.', u'Operators', u'are', u'doing', u'this', u'.', u'#wmata', u'https://t.co/QASKjIaZIP']
[u'crazy', u'how', u'i', u'know', u'the', u'MTA', u'better', u'than', u'WMATA', u'.']
[u'RT', u'@TriathlonBadBoy', u':', u"I'm", u'dressed', u'&', u'amp', u';', u'ready', u'to', u'take', u'WMATA', u'train', u'this', u'afternoon', u'into', u'DC', u'@FixWMATA', u'@Pat_Lewis360', u'@DeguelloBBQ', u'#wmata', u'#dcmetro', u'ht', u'\u2026']
[u'crazy', u'how', u'i', u'know', u'the', u'@MTA', u'better', u'than', u'@wmata', u'\ud83d', u'\ude43']
[u'Yes', u',', u'he', u'solved', u'it', u'fast', u'.', u'MT', u'@SRRas', u':', u'@wmata', u'Kudos', u'to', u'driver', u'of', u'train', u'with', u'm