In [1]:
import tweepy as tw
import pandas as pd
import geocoder
import json
import nltk
from nltk import bigrams
from nltk.corpus import stopwords
import itertools
import collections
import networkx as nx
from datetime import datetime
from textblob import TextBlob #For Sentiment Analysis
import matplotlib.pyplot as plt #For Graphing the Data
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import re
import os

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', -1)  # or 199

1. get tweets in usa about pollen allergy, get for USA only
2. map the tweets location for positive/negative/neutral using plotly
3. create a timeline, timeseries analysis
4. host model on azure ml (use azure api)
5. host project on github with real time analysis

In [2]:
consumer_key = 'get keyconsumer_secret = 'get secret'
access_token = 'get token'
access_token_secret = 'get token secret'

In [3]:
auth = tw.OAuthHandler(consumer_key, consumer_secret) #Fill these in
auth.set_access_token(access_token, access_token_secret)  #Fill these in
api = tw.API(auth, wait_on_rate_limit=True)

In [4]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\spunna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
search_word = 'allergy'
close_words =  [search_word, 'allergies', 'amp']
stop_words.update(close_words)

In [7]:
usa_latitude = 40    # geographical centre of search
usa_longitude = -100    # geographical centre of search
max_range = 1000           # search range in miles
t = tw.Cursor(api.search,
                    q='allergy -filter:retweets',
                    tweet_mode='extended',
                    geocode = "%f,%f,%dkm" % (usa_latitude, usa_longitude, max_range), 
                    lang="en").items(1)

In [10]:
def clean_tweet(tweet): 
        ''' 
        Utility function to clean tweet text by removing links, special characters 
        using simple regex statements. 
        '''
        return " ".join(re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", tweet).split()).lower()

def clean_tweet_text(tweet): 
    return clean_tweet(tweet).lower()
    
def get_tweet_sentiment(tweet): 
    ''' 
    Utility function to classify sentiment of passed tweet using textblob's sentiment method 
    '''
    # create TextBlob object of passed tweet text 
    cleaned_tweet = clean_tweet(tweet)
    analysis = TextBlob(cleaned_tweet) 
    sentiment = 0
    # set sentiment 
    if analysis.sentiment.polarity > 0: 
        sentiment = 1
    elif analysis.sentiment.polarity == 0: 
        sentiment = 0
    else: 
        sentiment = -1
    return cleaned_tweet, sentiment

def get_location(location):
    cordinates = geocoder.arcgis(location)
    return cordinates.x, cordinates.y

def get_tweets(query, count = 10): 
    # remove retweets
    fetched_count = 0
    new_search = query + " -filter:retweets"
    tweets_columns = ['created_at', 'location', 'sentiment', 'text', 'lat', 'long']
    tweets_df = pd.DataFrame(columns = tweets_columns)
    try: 
        #TODO - MAXID
        while (fetched_count < count):
            fetched_tweets = tw.Cursor(api.search,
                       q=new_search,
                       geocode = "%f,%f,%dmi" % (usa_latitude, usa_longitude, max_range), 
                       tweet_mode='extended',
                       lang="en").items(count)
            
            for tweet in fetched_tweets:
                if (tweet.user.location != ''):
                    text, sentiment = get_tweet_sentiment(tweet.full_text)
                    long, lat = get_location(tweet.user.location)
                    tweets_df = tweets_df.append({'created_at':tweet.created_at, 
                                      'location':tweet.user.location,
                                      'sentiment':sentiment,
                                      'text':text,
                                      'lat':lat,
                                      'long':long},
                                     ignore_index=True)
                    fetched_count = fetched_count + 1
            print(fetched_count, tweets_df.shape, sentiment)
                 
       # tweets_df = fetched_count, pd.DataFrame(data = tweets, columns=tweets_columns)            
        return fetched_count, tweets_df
    except tweepy.TweepError as e: 
        print("Error : " + str(e)) 

In [11]:
%%time
fetched_count, tweets = get_tweets(query = '#allergy', count = 1000) 
if  ( fetched_count > 0):
    print('Total tweets fetched for allergy:', fetched_count)
    allergyTweets = tweets

192 (192, 6) 1
384 (384, 6) 1
576 (576, 6) 1
768 (768, 6) 1
960 (960, 6) 1
1152 (1152, 6) 1
Total tweets fetched for allergy: 1152
Wall time: 11min 27s


In [None]:
allergyTweets.head()

Common Words Found in Tweets (Without Stop or Collection Words)

check # of positive, negative and neutral tweets


In [None]:
plt.show();
## declare the variables for the pie chart, using the Counter variables for “sizes”
sentiments = pd.Series(data=allergyTweets['sentiment'].value_counts(), index=[-1,0,1])
labels = 'Positive', 'Negative', 'Neutral'
sizes = [sentiments[1], sentiments[-1], sentiments[0]]
colors = ['green', 'red', 'yellow']

fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%',shadow=False, startangle=90)
ax1.axis('equal')
plt.title('Sentiment of {} tweets about allergy'.format(fetched_count))
plt.show()

In [None]:
tweet_words = ' '.join(allergyTweets.text).split()
tweets_unsw = [[word for word in tweet_words if not word in stop_words]][0]
counts_nsw_nc= collections.Counter(tweets_unsw)
clean_tweets_ncw = pd.DataFrame(counts_nsw_nc.most_common(15),
                             columns=['words', 'count'])

fig, ax = plt.subplots(figsize=(6,5))
# Plot horizontal bar graph
clean_tweets_ncw.sort_values(by='count').plot.barh(x='words',
                      y='count',
                      ax=ax,
                      color="blue")
ax.set_title("Common words found in tweets out of total word count: {}".format(len(tweets_unsw)))
plt.show();

In [None]:
terms_bigram = list(bigrams(tweets_unsw))
bigram_counts = collections.Counter(terms_bigram)
bigram_df = pd.DataFrame(bigram_counts.most_common(20),
                             columns=['bigram', 'count'])


Visualizing network of bigrams

In [None]:
# Create dictionary of bigrams and their counts
d = bigram_df.set_index('bigram').T.to_dict('records')
# Create network plot 
G = nx.Graph()

# Create connections between nodes
for k, v in d[0].items():
    G.add_edge(k[0], k[1], weight=(v * 10))

G.add_node("allergy", weight=100)
fig, ax = plt.subplots(figsize=(10, 8))

pos = nx.spring_layout(G, k=1)

# Plot networks
nx.draw_networkx(G, pos,
                 font_size=16,
                 width=3,
                 edge_color='grey',
                 node_color='purple',
                 with_labels = False,
                 ax=ax)

# Create offset labels
for key, value in pos.items():
    x, y = value[0]+.135, value[1]+.045
    ax.text(x, y,
            s=key,
            bbox=dict(facecolor='red', alpha=0.25),
            horizontalalignment='center', fontsize=13)
    
plt.show()

In [None]:
def SetColor(x):
    if(x == 1):
        return "green"
    elif(x == 0):
        return "yellow"
    else:
        return "red"
    
data = [ go.Scattergeo(
        locationmode = 'USA-states',
        lon = allergyTweets['long'],
        lat = allergyTweets['lat'],
        text = allergyTweets['text'],
        mode = 'markers',
        marker = dict( 
            size = 8, 
            opacity = 0.8,
            symbol = 'circle',
            line = dict(
                width=1,
                color='rgba(102, 102, 102)'
            ),
            cmin = 0,
            color=list(map(SetColor, allergyTweets['sentiment']))
           
        ))]

layout = dict(
        title = 'Location of Twitter users who talked about allergy', 
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = "rgb(250, 250, 250)",
            subunitcolor = "rgb(217, 217, 217)",
            countrycolor = "rgb(217, 217, 217)",
            countrywidth = 0.5,
            subunitwidth = 0.5        
        ),
    )

fig = go.Figure(data=data, layout=layout )
py.iplot(fig, filename='usa-twitter-allergy' )

In [13]:
"""
Weather API Python sample code

Copyright 2019 Oath Inc. Licensed under the terms of the zLib license see https://opensource.org/licenses/Zlib for terms.

$ python --version
Python 2.7.10

"""
APP_ID="vWaHUN5c"
apikey="dj0yJmk9TTFjR0JXcEhCYmF4JnM9Y29uc3VtZXJzZWNyZXQmc3Y9MCZ4PWE0"
apisecret="d713f97b5ec5f2b4430cda140088f5a4e9cfde17"

import time, uuid, urllib, urllib3
import hmac, hashlib
from base64 import b64encode

"""
Basic info
"""
url = 'https://weather-ydn-yql.media.yahoo.com/forecastrss'
method = 'GET'
app_id = APP_ID
consumer_key = apikey
consumer_secret = apisecret
concat = '&'
query = {'location': 'sunnyvale,ca', 'format': 'json'}
oauth = {
    'oauth_consumer_key': consumer_key,
    'oauth_nonce': uuid.uuid4().hex,
    'oauth_signature_method': 'HMAC-SHA1',
    'oauth_timestamp': str(int(time.time())),
    'oauth_version': '1.0'
}

"""
Prepare signature string (merge all params and SORT them)
"""
merged_params = query.copy()
merged_params.update(oauth)
sorted_params = [k + '=' + urllib.quote(merged_params[k], safe='') for k in sorted(merged_params.keys())]
signature_base_str =  method + concat + urllib.quote(url, safe='') + concat + urllib.quote(concat.join(sorted_params), safe='')

"""
Generate signature
"""
composite_key = urllib.quote(consumer_secret, safe='') + concat
oauth_signature = b64encode(hmac.new(composite_key, signature_base_str, hashlib.sha1).digest())

"""
Prepare Authorization header
"""
oauth['oauth_signature'] = oauth_signature
auth_header = 'OAuth ' + ', '.join(['{}="{}"'.format(k,v) for k,v in oauth.iteritems()])

"""
Send request
"""
url = url + '?' + urllib.urlencode(query)
request = urllib3.Request(url)
request.add_header('Authorization', auth_header)
request.add_header('X-Yahoo-App-Id', app_id)
response = urllib3.urlopen(request).read()
print(response)

AttributeError: module 'urllib' has no attribute 'quote'

In [14]:
!pip install requests_oauthlib 



You are using pip version 18.1, however version 19.0.3 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [15]:
from requests_oauthlib import OAuth1Session


In [17]:
import time, uuid, urllib
from json import loads
import hmac, hashlib
from base64 import b64encode


def _generate_signature(key, data):
    key_bytes= bytes(key , 'utf-8')
    data_bytes = bytes(data, 'utf-8') 
    signature =  hmac.new(
        key_bytes,
        data_bytes,
        hashlib.sha1
    ).digest()
    return b64encode(signature).decode()



def get_yahoo_weather(
    location,
    url='https://weather-ydn-yql.media.yahoo.com/forecastrss'
):
    app_id = APP_ID
    consumer_key = apikey
    consumer_secret = apisecret
    # Basic info
    method = 'GET'
    concat = '&'
    query = {
        'location': location,
        'format': 'json'
    }
    oauth = {
        'oauth_consumer_key': consumer_key,
        'oauth_nonce': uuid.uuid4().hex,
        'oauth_signature_method': 'HMAC-SHA1',
        'oauth_timestamp': str(int(time.time())),
        'oauth_version': '1.0'
    }

    # Prepare signature string (merge all params and SORT them)
    merged_params = query.copy()
    merged_params.update(oauth)
    sorted_params = [
        k + '=' + urllib.parse.quote(merged_params[k], safe='')
        for k in sorted(merged_params.keys())
    ]
    signature_base_str = (
        method + 
        concat + 
        urllib.parse.quote(
            url,
            safe=''
        ) +
        concat + 
        urllib.parse.quote(concat.join(sorted_params), safe='')
    )

    # Generate signature
    composite_key = urllib.parse.quote(
        consumer_secret,
        safe=''
    ) + concat
    oauth_signature = _generate_signature(
        composite_key,
        signature_base_str
    )

    # Prepare Authorization header
    oauth['oauth_signature'] = oauth_signature
    auth_header = (
        'OAuth ' + 
        ', '.join(
            [
                '{}="{}"'.format(k,v) 
                for k,v in oauth.items()
            ]
        )
    )

    # Send request
    url = url + '?' + urllib.parse.urlencode(query)
    request = urllib.request.Request(url)
    request.add_header('Authorization', auth_header)
    request.add_header('X-Yahoo-App-Id', app_id)
    response = urllib.request.urlopen(request).read()
    return loads(response)

In [18]:
get_yahoo_weather('Raleigh, NC')

{'current_observation': {'astronomy': {'sunrise': '7:13 am',
   'sunset': '7:30 pm'},
  'atmosphere': {'humidity': 85,
   'pressure': 29.94,
   'rising': 0,
   'visibility': 10.0},
  'condition': {'code': 30, 'temperature': 34, 'text': 'Partly Cloudy'},
  'pubDate': 1553428800,
  'wind': {'chill': 34, 'direction': 0, 'speed': 0.62}},
 'forecasts': [{'code': 30,
   'date': 1553400000,
   'day': 'Sun',
   'high': 69,
   'low': 33,
   'text': 'Partly Cloudy'},
  {'code': 11,
   'date': 1553486400,
   'day': 'Mon',
   'high': 69,
   'low': 52,
   'text': 'Showers'},
  {'code': 11,
   'date': 1553572800,
   'day': 'Tue',
   'high': 55,
   'low': 40,
   'text': 'Showers'},
  {'code': 34,
   'date': 1553659200,
   'day': 'Wed',
   'high': 53,
   'low': 35,
   'text': 'Mostly Sunny'},
  {'code': 32,
   'date': 1553745600,
   'day': 'Thu',
   'high': 61,
   'low': 33,
   'text': 'Sunny'},
  {'code': 34,
   'date': 1553832000,
   'day': 'Fri',
   'high': 70,
   'low': 42,
   'text': 'Mostly Sunn