### Work In Progress - Extracting phrases around craft beer tweets

In [1]:
import tweepy

consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

Example of what the raw text from the tweets looks like

In [2]:
searched_tweets = [tweet for tweet in tweepy.Cursor(api.search, q='craftbeer', lang='en').items(10)]
orig_tweets = [w.text for w in searched_tweets]
orig_tweets

['2014 Fluxus in 2018. Yummmmm\n\n#allagashbrewing #instabeer #craftbeer #craftbeerporn #ilovebeer… https://t.co/54hjaaJsVM',
 'Registration is OPEN! Announces a Collab with.',
 'VolleyBristol "The #PremierLeague #Football just keeps on coming. Today why not have a #VolleyRoast, a pint of… https://t.co/tEVJI7PP2b',
 'The #PremierLeague #Football just keeps on coming. Today why not have a #VolleyRoast, a pint of #craftbeer and watc… https://t.co/lozrHVtNg5',
 'RT @BBBrandsUK: With probably the last keg in the UK head over to @taproomse18 to give @phbrew #bountyhunter delicious Coconut Chocolate Ni…',
 'The #PremierLeague #Football just keeps on coming. Today why not have a #VolleyRoast, a pint of #craftbeer and watc… https://t.co/va9sf7go9Y',
 'RT @TheShabbycats: #acoustic duo #TheShabbycats playing #livemusic @TheGroveHudds #Huddersfield today 4-7pm #HuddersfieldIs #realale #Craft…',
 'Check out my latest blog post https://t.co/ovplv4OHIt #tinyrebel @tinyrebelbrewco #craftbeer\xa0#dogf

Function to get cleaned up text

1. Searches for the tweet query and returns the desired number of results
2. Grabs only the text from the tweet
3. Makes all the text lowercase
4. Removes hyperlinks
5. Removes usernames and @ mentions
6. Removes hashtags
7. Removes punctuation and misc. characters
8. Removes extra letters in a word (i.e happpyyyy to happy)
9. Removes emojis
10. Tokenizes the words
11. lemmatization of the words
12. Removes single characters
13. Removes stop words

In [3]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import re
#nltk.download('stopwords')
#nltk.download('wordnet')

def get_clean_tweets(query, max_tweets):
    searched_tweets = [tweet for tweet in tweepy.Cursor(api.search, q=query, lang='en').items(max_tweets)]
    clean_tweets = []   
    for tweet in searched_tweets:
        tweet_text = tweet.text
        lower = tweet_text.lower()
        link = re.sub(r'https\S+', ' ', lower)
        un = re.sub(r"(?:\@)\S+", "", link)
        hash_tag = re.sub(r"(?:\#)\S+", "", un)
        punc = re.sub(r"[,.:;'~‘\"\#\@\|’“”%-?!&$]+\ *", ' ', hash_tag)
        misc = re.sub('rt|…|amp', ' ', punc)
        pattern = re.compile(r"(.)\1{2,}")
        reduce_leng = pattern.sub(r"\1\1", misc)
        emoji_pattern = re.compile("["u"\U0001F600-\U0001F64F" u"\U0001F300-\U0001F5FF" u"\U0001F680-\U0001F6FF" u"\U0001F1E0-\U0001F1FF"  
                           u"\U00002702-\U000027B0" u"\U000024C2-\U0001F251" u"\U0001F900-\U0001F999" "]+", flags=re.UNICODE)
        emoji = emoji_pattern.sub(r'', reduce_leng)
        tokenizer = TweetTokenizer()
        tokens = tokenizer.tokenize(emoji)
        lem = WordNetLemmatizer()
        lemword = [lem.lemmatize(w) for w in tokens]
        single_char = [w for w in lemword if len(w) > 1]
        stop_words = [w for w in single_char if not w in stopwords.words('english')]
        clean_tweets.append(stop_words)
    return clean_tweets

In [None]:
data = get_clean_tweets('craftbeer', 2000)

Examples of the cleaned up text

In [38]:
data[:5]

[['result',
  'main',
  'advantage',
  'ibeernetwork',
  'flexibility',
  'simple',
  'screen'],
 ['criterion'],
 ['depa',
  'ing',
  'great',
  'little',
  'haul',
  'local',
  'fun',
  'imbibing',
  'ahead'],
 ['duo', 'playing', 'today', 'pm'],
 ['spice', 'week', 'tuesday', 'curry', 'night']]

Which single words appear the most

In [35]:
import collections

flat_list = [item for sublist in data for item in sublist]
counter = collections.Counter(flat_list)
sorted_words = sorted(counter.items(), key=lambda x:x[1], reverse=True)
sorted_words[:20]

[('beer', 500),
 ('craft', 155),
 ('day', 136),
 ('today', 131),
 ('ale', 121),
 ('great', 119),
 ('brewery', 112),
 ('ipa', 112),
 ('time', 101),
 ('get', 99),
 ('new', 95),
 ('one', 86),
 ('pm', 84),
 ('come', 82),
 ('brewing', 81),
 ('tap', 77),
 ('good', 63),
 ('weekend', 62),
 ('brew', 57),
 ('saturday', 56)]

Most common bigrams

In [36]:
from nltk import ngrams

bigrams = ngrams(flat_list, 2)
fdist = nltk.FreqDist(bigrams)
sorted_bi = sorted(fdist.items(), key=lambda x:x[1], reverse=True)
sorted_bi[:10]

[(('craft', 'beer'), 109),
 (('po', 'er'), 25),
 (('pale', 'ale'), 24),
 (('barrel', 'aged'), 24),
 (('bourbon', 'barrel'), 23),
 (('time', 'another'), 21),
 (('pm', 'pm'), 21),
 (('beer', 'festival'), 19),
 (('ex', 'le'), 18),
 (('enjoying', 'bbq'), 17)]

Extracting phrases or words that apear together frequently

In [37]:
from gensim.models import Phrases
import warnings
warnings.filterwarnings('ignore')

bigram_model = Phrases(data)
trigram_model = Phrases(bigram_model[data])
trigram_model = list(trigram_model[bigram_model[data]])

Looking at the 20 most common phrases

In [33]:
import collections

phrase_list = []

for item in trigram_model:
    for word in item:
        if '_' in word:
            phrase_list.append(word)

counter = collections.Counter(phrase_list)
phrase_freq = sorted(counter.items(), key=lambda x:x[1], reverse=True)
phrase_freq[:20]

[('craft_beer', 76),
 ('po_er', 25),
 ('pale_ale', 24),
 ('enjoying_bbq_friend_home', 17),
 ('along_sour_favourite_style', 16),
 ('bourbon_barrel_aged_great', 16),
 ('ex_le_style', 16),
 ('pm_pm', 16),
 ('definitely_time_another_cracker', 14),
 ('craft_brewery', 13),
 ('earned_one_best_around', 13),
 ('brewing_co', 12),
 ('double_paired', 12),
 ('perfect_day', 12),
 ('live_music', 11),
 ('brewing_company', 11),
 ('always_soft_spot_glad', 11),
 ('finally_get_hand_evil', 11),
 ('craft_beer_festival', 10),
 ('barrel_room', 10)]