In [1]:
# Load library
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import re

import matplotlib.pyplot as plt
%matplotlib inline  
from  matplotlib import style

from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})

In [2]:
# read the six per-city tweet exports; pd.read_csv already returns a
# DataFrame, so no extra pd.DataFrame(...) wrapping is needed
df_sea = pd.read_csv("tweets/seattle_tweets.csv", encoding="latin1")
df_sad = pd.read_csv("tweets/sandiego_tweets.csv", encoding="latin1")
df_den = pd.read_csv("tweets/denver_tweets.csv", encoding="latin1")
df_chi = pd.read_csv("tweets/chicago_tweets.csv", encoding="latin1")
df_por = pd.read_csv("tweets/portland_tweets.csv", encoding="latin1")
df_aus = pd.read_csv("tweets/austin_tweets.csv", encoding="latin1")

In [3]:
# label every per-city frame with the city its tweets were collected for
for city_frame, city_label in (
    (df_sea, "Seattle"),
    (df_sad, "San Diego"),
    (df_den, "Denver"),
    (df_chi, "Chicago"),
    (df_por, "Portland"),
    (df_aus, "Austin"),
):
    city_frame['city'] = city_label

In [4]:
# stack the six city frames into one frame
frames_cities = (df_sea, df_sad, df_den, df_chi, df_por, df_aus)
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every city's own 0..k labels repeat and label lookups return several rows
df_tweets = pd.concat(frames_cities, ignore_index=True)

In [5]:
# preview the first rows of the combined frame to check the columns loaded as expected
df_tweets.head()

Unnamed: 0,name,time,tweet,geo,id,source,city
0,1030995577981435906,2018-08-19 01:51:15,RT @flyingbikecoop: 3rd Anniversary Party toda...,,1030995577981435906,Twitter for iPhone,Seattle
1,1030962792415866880,2018-08-18 23:40:58,Mystery keg #1 is tapped! Pitchfork is our wil...,"{'type': 'Point', 'coordinates': [47.69208753,...",1030962792415866880,Instagram,Seattle
2,1030958589467406337,2018-08-18 23:24:16,3 years old!! #proudmemberowner - Drinking a S...,"{'type': 'Point', 'coordinates': [47.6921, -12...",1030958589467406337,Untappd,Seattle
3,1030953374148165633,2018-08-18 23:03:33,#MemberDrivenBeer celebrating 3 years! - Drink...,,1030953374148165633,Untappd,Seattle
4,1030944029788332032,2018-08-18 22:26:25,#MemberDrivenBeer Stout Float :-D - Drinking a...,,1030944029788332032,Untappd,Seattle


In [6]:
# (rows, columns) of the combined frame, before any cleaning
df_tweets.shape

(6280, 7)

In [7]:
#import other python packages and libraries to do wordclouds, wordcounts, and remove stopwords
from os import path
from scipy.misc import imread
import random
from wordcloud import WordCloud, STOPWORDS

In [8]:
#import other python packages and libraries to do wordclouds, wordcounts, and remove stopwords
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [9]:
# define a function that cleans the tweets
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def preprocess(raw_text, stopword_set=None):
    """Reduce one tweet to a space-separated string of lowercase words.

    Every character that is not an ASCII letter is treated as a separator,
    the remaining words are lower-cased, and stop words are removed.

    Parameters
    ----------
    raw_text : str
        The tweet text. Any non-string value (e.g. None or NaN from a
        missing cell) yields an empty string instead of raising.
    stopword_set : collection of str, optional
        Words to drop. Defaults to the NLTK English stop-word list, which is
        loaded once and reused across calls rather than re-read per tweet.

    Returns
    -------
    str
        The cleaned words joined by single spaces ("" if nothing is left).
    """
    # a missing cell has no text to clean
    if not isinstance(raw_text, str):
        return ""

    if stopword_set is None:
        stopword_set = _english_stopwords()

    # keep only letters; digits, punctuation and symbols become spaces
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)

    # convert to lower case and split on whitespace
    words = letters_only_text.lower().split()

    # drop stop words and join what remains back into one string
    return " ".join(w for w in words if w not in stopword_set)

In [10]:
# run preprocess over every tweet and keep the cleaned text in its own column
df_tweets['clean_tweet'] = df_tweets['tweet'].apply(preprocess)

In [11]:
# character count of every raw tweet, via the vectorised string accessor
# (unlike apply(len), .str.len() yields NaN for a missing tweet instead of raising)
df_tweets['tweet_len'] = df_tweets.tweet.str.len()

In [12]:
# flag retweets: look for the "RT @user" marker rather than the bare letters
# "RT", which also occur inside upper-case words such as "PORTER" or "ART"
df_tweets['retweet'] = df_tweets.tweet.str.contains(r"\bRT @")

In [13]:
# boolean column: True when the raw tweet contains an @ symbol
df_tweets['at_brewery'] = df_tweets['tweet'].str.contains("@")

In [14]:
# boolean column: True when the cleaned tweet contains the word drinking
df_tweets['drinking'] = df_tweets['clean_tweet'].str.contains("drinking")

In [15]:
# remove the geo and name columns from the frame
df_tweets = df_tweets.drop(['geo', 'name'], axis=1)

In [16]:
# remove rows that are exact duplicates across all columns, keeping the first occurrence
df_tweets = df_tweets.drop_duplicates(keep='first')

In [17]:
# show the first 200 rows (by position) to eyeball the new columns
df_tweets.iloc[:200]

Unnamed: 0,time,tweet,id,source,city,clean_tweet,tweet_len,retweet,at_brewery,drinking
0,2018-08-19 01:51:15,RT @flyingbikecoop: 3rd Anniversary Party toda...,1030995577981435906,Twitter for iPhone,Seattle,rt flyingbikecoop rd anniversary party today b...,107,True,True,False
1,2018-08-18 23:40:58,Mystery keg #1 is tapped! Pitchfork is our wil...,1030962792415866880,Instagram,Seattle,mystery keg tapped pitchfork wild golden stron...,145,False,False,False
2,2018-08-18 23:24:16,3 years old!! #proudmemberowner - Drinking a S...,1030958589467406337,Untappd,Seattle,years old proudmemberowner drinking storm cycl...,130,False,True,True
3,2018-08-18 23:03:33,#MemberDrivenBeer celebrating 3 years! - Drink...,1030953374148165633,Untappd,Seattle,memberdrivenbeer celebrating years drinking cu...,134,False,True,True
4,2018-08-18 22:26:25,#MemberDrivenBeer Stout Float :-D - Drinking a...,1030944029788332032,Untappd,Seattle,memberdrivenbeer stout float drinking mocha mi...,133,False,True,True
5,2018-08-18 22:25:49,See you soon for our Third Birthday Party! \n4...,1030943879732899840,Instagram,Seattle,see soon third birthday party pm mystery keg t...,145,False,False,False
6,2018-08-18 21:21:14,#MemberDrivenBeer 3rd Anniversary - Drinking a...,1030927628293873664,Untappd,Seattle,memberdrivenbeer rd anniversary drinking ulmar...,140,False,True,True
7,2018-08-18 21:03:07,3rd Anniversary Party today! Bring all your fr...,1030923065188708352,Twitter for iPhone,Seattle,rd anniversary party today bring friends famil...,87,False,False,False
8,2018-08-17 18:22:05,Updated Beer List! Come have some fun with us ...,1030520154109698048,Instagram,Seattle,updated beer list come fun us tonight enjoy ra...,150,False,False,False
9,2018-08-17 17:19:23,Come have some fun with us tonight and enjoy t...,1030504376903688192,Instagram,Seattle,come fun us tonight enjoy rad infusions cooked...,150,False,False,False


In [22]:
# create function to find the phrase 'Drinking a' and return the text from there up to the word 'by'
_BY_WORD = re.compile(r"\sby(?:\s|$)")


def findcraft(beer_text):
    """Extract the "Drinking a <beer> " fragment from a tweet.

    Parameters
    ----------
    beer_text : str
        Raw tweet text. Non-string values (e.g. None/NaN) yield ''.

    Returns
    -------
    str
        The text starting at 'Drinking a' and ending just before the first
        standalone word 'by' that FOLLOWS it (the whitespace in front of
        'by' is kept). If no such 'by' exists, everything from 'Drinking a'
        to the end of the tweet is returned. '' when 'Drinking a' is absent.
    """
    marker = 'Drinking a'
    if not isinstance(beer_text, str):
        return ''

    start = beer_text.find(marker)
    if start == -1:
        # str.find's -1 must not be used as a slice index
        return ''

    # Search only after the marker and only for 'by' as its own word, so a
    # 'by' earlier in the tweet or inside a name ("Ruby") is not mistaken
    # for the separator.
    by_match = _BY_WORD.search(beer_text, start + len(marker))
    if by_match is None:
        # no separator: keep the whole remainder instead of slicing to -1,
        # which would silently drop the last character
        return beer_text[start:]

    # +1 keeps the whitespace character in front of 'by', so the fragment
    # ends the same way a plain slice up to 'by' would
    return beer_text[start:by_match.start() + 1]

In [23]:
# run findcraft over every raw tweet and store the extracted fragment
df_tweets['craft_beer'] = df_tweets['tweet'].apply(findcraft)

In [24]:
# preview the first rows to check the new craft_beer column
df_tweets.head()

Unnamed: 0,time,tweet,id,source,city,clean_tweet,tweet_len,retweet,at_brewery,drinking,craft_beer
0,2018-08-19 01:51:15,RT @flyingbikecoop: 3rd Anniversary Party toda...,1030995577981435906,Twitter for iPhone,Seattle,rt flyingbikecoop rd anniversary party today b...,107,True,True,False,
1,2018-08-18 23:40:58,Mystery keg #1 is tapped! Pitchfork is our wil...,1030962792415866880,Instagram,Seattle,mystery keg tapped pitchfork wild golden stron...,145,False,False,False,
2,2018-08-18 23:24:16,3 years old!! #proudmemberowner - Drinking a S...,1030958589467406337,Untappd,Seattle,years old proudmemberowner drinking storm cycl...,130,False,True,True,Drinking a Storm Cycle CDA
3,2018-08-18 23:03:33,#MemberDrivenBeer celebrating 3 years! - Drink...,1030953374148165633,Untappd,Seattle,memberdrivenbeer celebrating years drinking cu...,134,False,True,True,Drinking a Curry Porter
4,2018-08-18 22:26:25,#MemberDrivenBeer Stout Float :-D - Drinking a...,1030944029788332032,Untappd,Seattle,memberdrivenbeer stout float drinking mocha mi...,133,False,True,True,Drinking a Mocha Milk Stout


In [25]:
# remove the text 'Drinking a' from craft_beer (replaced with an empty string)
df_tweets['craft_beer'] = df_tweets['craft_beer'].str.replace('Drinking a', '')

In [26]:
# remove the stray "n " left at the START of the value when the tweet said
# "Drinking an ..."; anchoring with ^ keeps the "n " inside beer names intact
# (an unanchored replace turns "Golden Ale" into "GoldeAle")
df_tweets['craft_beer'] = df_tweets.craft_beer.str.replace(r'^n ', '', regex=True)

In [27]:
# write a function that returns 'no beer found' when no beer name was extracted
def replace_na(text):
    """Substitute a placeholder for an empty or missing beer name.

    Parameters
    ----------
    text : str or None
        The extracted beer name. May be '' (nothing extracted), None, or a
        float NaN (missing cell).

    Returns
    -------
    The string "no beer found" when `text` is None, NaN or has length 0;
    otherwise `text` unchanged.
    """
    # NaN is the only float that is not equal to itself; checking for it (and
    # None) first avoids calling len() on a value that has no length
    is_missing = text is None or (isinstance(text, float) and text != text)

    if is_missing or len(text) == 0:
        return "no beer found"
    return text

In [28]:
# run replace_na over craft_beer so empty values get the placeholder text
df_tweets['craft_beer'] = df_tweets['craft_beer'].apply(replace_na)

In [29]:
# preview the first rows to confirm craft_beer now holds a beer name or 'no beer found'
df_tweets.head()

Unnamed: 0,time,tweet,id,source,city,clean_tweet,tweet_len,retweet,at_brewery,drinking,craft_beer
0,2018-08-19 01:51:15,RT @flyingbikecoop: 3rd Anniversary Party toda...,1030995577981435906,Twitter for iPhone,Seattle,rt flyingbikecoop rd anniversary party today b...,107,True,True,False,no beer found
1,2018-08-18 23:40:58,Mystery keg #1 is tapped! Pitchfork is our wil...,1030962792415866880,Instagram,Seattle,mystery keg tapped pitchfork wild golden stron...,145,False,False,False,no beer found
2,2018-08-18 23:24:16,3 years old!! #proudmemberowner - Drinking a S...,1030958589467406337,Untappd,Seattle,years old proudmemberowner drinking storm cycl...,130,False,True,True,Storm Cycle CDA
3,2018-08-18 23:03:33,#MemberDrivenBeer celebrating 3 years! - Drink...,1030953374148165633,Untappd,Seattle,memberdrivenbeer celebrating years drinking cu...,134,False,True,True,Curry Porter
4,2018-08-18 22:26:25,#MemberDrivenBeer Stout Float :-D - Drinking a...,1030944029788332032,Untappd,Seattle,memberdrivenbeer stout float drinking mocha mi...,133,False,True,True,Mocha Milk Stout


In [31]:
# (rows, columns) of the processed frame
df_tweets.shape

(6018, 11)

In [30]:
# save the processed tweets to a local csv (disabled; uncomment the next line to write the file)
#df_tweets.to_csv("df_tweets.csv", index=False)