In [82]:
import json
import os
import pickle
import re
import string
import itertools
import datetime
import time
import pytz
from collections import Counter
from tqdm import tqdm
from nltk import bigrams as nltk_bigrams
from nltk.stem.snowball import SnowballStemmer
from __future__ import print_function
from sklearn.feature_extraction import text
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import plotly.plotly as py
import plotly.graph_objs as go

AttributeError: 'module' object has no attribute 'offline'

In [29]:
def read_data(filename,num_tweets):
    data = []
    tweet_time = []
    with open(filename,'r') as fp:
        for index,line in tqdm(enumerate(fp),total=num_tweets):
            entry = json.loads(line)
            data.append(entry['title'])
            tweet_time.append(datetime.datetime.fromtimestamp(entry['firstpost_date']))
    return data,tweet_time

def preprocessing(tweets):
    snowball_stemmer = SnowballStemmer("english")
    processed_tweets = []
    for tweet in tweets:
        tweet = re.sub('[-.,></?;:(){}!$%^&*_=~`]', ' ', tweet)
        tweet = ''.join(ch for ch in tweet if ch not in string.punctuation)
        tweet = ''.join(ch for ch in tweet if ord(ch) < 128)  # remove non-ascii characters
        words = [word for word in tweet.lower().split() \
                           if word not in text.ENGLISH_STOP_WORDS]
        processed_tweets.append(' '.join(tweet))
    return processed_tweets

def get_data(precomputed=False):
    csv_filename = os.path.join('data','train','part3','tweets.csv')
    if(not precomputed):
        if not os.path.exists(os.path.join('data','train','part3')):
            os.makedirs(os.path.join('data','train','part3'))
        filenames = {
            'tweets_#gohawks' : 188136,
            'tweets_#nfl' : 259024,
            'tweets_#sb49' : 826951,
            'tweets_#gopatriots' : 26232,
            'tweets_#patriots' : 489713,
            'tweets_#superbowl' : 1348767
        }
        tweet=[]
        tweet_time=[]
        for key,num in filenames.items():
            print('Loading',key,'....')
            sub_tweets, sub_time = read_data(os.path.join('data','train',key+'.txt'),num)
            tweet += sub_tweets
            tweet_time += sub_time
        dataset = pd.DataFrame({'tweet_time':tweet_time,'tweet':tweet})
        dataset.to_csv(csv_filename,encoding='utf-8',index=False)
    else:
        dataset = pd.read_csv(csv_filename,encoding='utf-8',engine='python')
        dataset['tweet_time'].replace(u'', np.nan, inplace=True)
        dataset.dropna(subset=['tweet_time'], inplace=True)
        dataset['tweet_time'] = dataset['tweet_time'].map(lambda x: datetime.datetime.strptime(str(x),"%Y-%m-%d %H:%M:%S"))
    return dataset

In [30]:
df = get_data(precomputed=False)
print('Number of tweets =',len(df))

Loading tweets_#patriots ....


100%|█████████████████████████████████████████████████| 489713/489713 [04:08<00:00, 1971.66it/s]


Loading tweets_#gopatriots ....


100%|███████████████████████████████████████████████████| 26232/26232 [00:13<00:00, 2007.96it/s]


Loading tweets_#superbowl ....


100%|███████████████████████████████████████████████| 1348767/1348767 [10:55<00:00, 2057.19it/s]


Loading tweets_#nfl ....


100%|█████████████████████████████████████████████████| 259024/259024 [02:48<00:00, 1532.76it/s]


Loading tweets_#gohawks ....


100%|█████████████████████████████████████████████████| 188136/188136 [01:35<00:00, 1967.17it/s]


Loading tweets_#sb49 ....


100%|█████████████████████████████████████████████████| 826951/826951 [07:04<00:00, 1947.28it/s]


Number of tweets = 3138823


In [31]:
int_period_start = datetime.datetime(2015,2,1,14,0,0)
int_period_end = datetime.datetime(2015,2,1,21,0,0)
df = df[df.tweet_time.apply(lambda x : x > int_period_start)]
df = df[df.tweet_time.apply(lambda x : x < int_period_end)]

In [32]:
grouped_data = df.set_index('tweet_time').groupby(pd.TimeGrouper(freq='15Min'))

In [33]:
def preprocess_data(group_tweets):
    all_hashtags = Counter()
    bigrams_counter = Counter()
    all_words = Counter() # words without #hashtags and @mentions
    hash_tag_str = re.compile(r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)")
    for tweet in group_tweets:
        hashtags = map(lambda x : str(x.lower()),hash_tag_str.findall(tweet))
        all_hashtags.update(hashtags)
        tweet = re.sub('[,.-:/()?{}*$&]', ' ', tweet)
        tweet = ''.join(ch for ch in tweet if ch not in string.punctuation)
        tweet = ''.join(ch for ch in tweet if ord(ch) < 128)  # remove non-ascii characters
        words = [str(word) for word in tweet.lower().split() \
                           if word not in text.ENGLISH_STOP_WORDS]

        regular_words = [w for w in words if not w.startswith(('#', '@'))]
        all_words.update(regular_words)
        bigrams_counter.update(nltk_bigrams(regular_words))

    return all_hashtags, all_words, bigrams_counter

#### Trending Ads

In [149]:
class Advertisement():
    def __init__(self,company,taglines,bigram_name=''):
        self.name = company
        self.text = re.sub('[- \']','',company).lower()
        self.taglines = self.filterTaglines(taglines)
        self.bigram_name = bigram_name.lower()
        self.count=0
        self.timeframe_count = []
        
    def filterTaglines(self,taglines):
        filtered_tag = []
        for tag in taglines:
            tag = re.sub("[-.,\'></?;:#(){}!$%^&*_=~ `]", '', tag)
            tag = ''.join(ch for ch in tag if ch not in string.punctuation)
            tag = '#'+tag
            filtered_tag.append(tag.lower())
        return filtered_tag
    
    def addTagLine(self,tagline):
        self.taglines.append(tagline)
        
    def updateCount(self,value):
        self.count+=value
    
    def updateTimeFrameCount(self):
        self.timeframe_count.append(self.count)
        self.count = 0

def create_adclasses():
    tmobile = Advertisement('T-Mobile',['One Upped','#Kim\'s Data Stash'])
    budweiser = Advertisement('Budweiser',['Lost Dog','Brewed the Hard Way'])
    bmw = Advertisement('BMW',['Newfangled Idea'])
    cocacola = Advertisement('Coca Cola',['Make It Happy'],'Coca Cola')
    doritos = Advertisement('Doritos',['When Pigs Fly','Middle Seat'])
    esurance = Advertisement('Esurance',['Sorta Pharmacist'])
    loctite = Advertisement('Loctite',['Positive Feelings'])
    mcd = Advertisement('McDonald\'s',['Pay With Lovin'],'McDonald s')
    snickers = Advertisement('Snickers',['Very Brady'])
    toyota = Advertisement('Toyota',['How Great I Am','My Great Dad'])
    return [\
            tmobile,budweiser,\
            bmw,cocacola,\
            doritos,esurance,\
            loctite,mcd,\
            snickers,toyota\
           ]

class Person():
    def __init__(self,name):
        self.name = name
        self.fname = name.split()[0].lower()
        self.lname = name.split()[1].lower()
        self.count=0
        self.timeframe_count = []

    def updateCount(self,value):
        self.count+=value
    
    def updateTimeFrameCount(self):
        self.timeframe_count.append(self.count)
        self.count = 0
        
def create_celebclasses():
    celebs = ['Mark Wahlberg','Katy Perry','Lenny Kravitz',
              'Missy Elliott','Idina Menzel','Kevin Hart']
    classes = []
    for p in celebs:
        classes.append(Person(p))

    return classes

def create_hawksclasses():
    players = ['Russell Wilson','Jermaine Kearse','Marshawn Lynch',
              'Steven Hauschka','KJ Wright','Bobby Wagner',
               'Ricardo Lockette','Earl Thomas','Kam Chancellor',
               'Chris Matthews']
    classes = []
    for p in players:
        classes.append(Person(p))

    return classes

def create_patriotsclasses():
    players = ['Tom Brady','Rob Gronkowski','Brandon LaFell',
               'Julian Edelman','Shane Vereen','Stephen Gostkowski',
               'Danny Amendola','Malcolm Butler','Jamie Collins',
               'LeGarrette Blount']
    classes = []
    for p in players:
        classes.append(Person(p))

    return classes

def get_ad_information(companies,hashtags,keywords,bigram_words):
    total_ad_count = 0
    for ad in companies:
        for count, word in enumerate(keywords.keys()):
            if word == ad.text:
                count = keywords.get(word)
                ad.updateCount(count)
                total_ad_count += count

        for count, htag in enumerate(hashtags.keys()):
            if ad.text in htag or htag == ad.taglines:
                count = hashtags.get(htag)
                ad.updateCount(count)
                total_ad_count += count

        for count, bg_pair in enumerate(bigram_words.keys()):
            bg_word = ' '.join(x for x in bg_pair)
            try:
                if bg_word in ad.bigram_name:
                    count = bigram_words.get(bg_pair)
                    ad.updateCount(count)
                    total_ad_count += count
            except:
                print("BG_WORD",bg_word)
        ad.updateTimeFrameCount()
    return total_ad_count

def get_person_information(persons, hash_tags, key_words):
    total_person_count = 0
    for p in persons:
        for count, word in enumerate(key_words.keys()):
            if word == p.fname or word == p.lname:
                count = keywords.get(word)
                p.updateCount(count)
                total_person_count += count
                
        for count, htag in enumerate(hash_tags.keys()):
            #Checking like this because lname can be Brady, and htag can be #BradyIsKillingIt
            if p.fname in htag or p.lname in htag:
                count = hash_tags.get(htag)
                p.updateCount(count)
                total_person_count += count        
        p.updateTimeFrameCount()
    return total_person_count

def get_timeseries(entities,timestamps,normalize):
    length = len(entities[0].timeframe_count)
    ads = pd.DataFrame({'time':timestamps[:length]})
    for entity in entities:
        ads[entity.name] = entity.timeframe_count
    ads = ads.set_index('time')
    if(normalize=='columns'):
        ads = (ads-ads.mean())/(ads.max()-ads.min())  # Normalizing the columns
        ads[ads<0] = 0
    if(normalize=='rows'):
        ads = ads.div(ads.sum(axis=1), axis=0)
        ads[ads<0] = 0
    match_data = dict(ads) 
    all_matches = pd.DataFrame(match_data)
    all_matches[all_matches < 0] = 0
    return all_matches

def plot_graph(results,filename):
    lines = []
    for name in results.columns:
        trace = go.Scatter(
            y = results[name],
            x = result.index,
            mode = 'lines+markers',
            name = name
        )
        lines.append(trace)
    layout = go.Layout(
        xaxis=dict(title='Time'),
        yaxis=dict(title='Tweets')
    )
    fig = go.Figure(data=lines, layout=layout)
    py.iplot(fig, filename=filename)
    py.image.save_as(fig, filename=os.path.join('graphs',filename+'.png'))

In [150]:
companies = create_adclasses()
celebs = create_celebclasses()
seahawks = create_hawksclasses()
patriots = create_patriotsclasses()
for i, group in sorted(grouped_data):
    print(i)
    hashtags,keywords,bigram_words = preprocess_data(group.tweet)
    ads_count = get_ad_information(companies,hashtags,keywords,bigram_words)
    celeb_count = get_person_information(celebs,hashtags,keywords)
    seahawks_count = get_person_information(seahawks,hashtags,keywords)
    patriots_count = get_person_information(patriots,hashtags,keywords)
    print(ads_count,celeb_count,seahawks_count,patriots_count)

2015-02-01 14:00:00
57 380 405 942
2015-02-01 14:15:00
48 354 438 883
2015-02-01 14:30:00
96 425 614 1271
2015-02-01 14:45:00
105 979 825 1631
2015-02-01 15:00:00
187 1655 1691 4661
2015-02-01 15:15:00
453 7648 1915 3936
2015-02-01 15:30:00
2159 2367 2887 4449
2015-02-01 15:45:00
5162 2426 2059 12391
2015-02-01 16:00:00
11888 2305 1826 10675
2015-02-01 16:15:00
6405 2782 2537 6380
2015-02-01 16:30:00
8685 4319 9260 2262
2015-02-01 16:45:00
1810 5259 3961 9347
2015-02-01 17:00:00
2117 31006 5128 2367
2015-02-01 17:15:00
467 100894 860 1013
2015-02-01 17:30:00
703 23014 9595 1516
2015-02-01 17:45:00
1433 7981 9212 7859
2015-02-01 18:00:00
2001 5259 2670 3456
2015-02-01 18:15:00
3124 4049 2439 8942
2015-02-01 18:30:00
2386 3121 1314 6846
2015-02-01 18:45:00
460 2156 5329 12335
2015-02-01 19:00:00
227 1438 6336 14641
2015-02-01 19:15:00
234 1542 2647 10697
2015-02-01 19:30:00
159 1423 1583 5753
2015-02-01 19:45:00
76 679 722 2447


#### Trending Ads

In [151]:
result = get_timeseries(companies,sorted(grouped_data.groups.keys()),normalize=None)
plot_graph(result,'ad-all')
result2 = get_timeseries(companies,sorted(grouped_data.groups.keys()),normalize='columns')
plot_graph(result2,'ad-normalized-column')
result3 = get_timeseries(companies,sorted(grouped_data.groups.keys()),normalize='rows')
plot_graph(result3,'ad-normalized-rows')

#### Trending Celebrities

In [152]:
result = get_timeseries(celebs,sorted(grouped_data.groups.keys()),normalize=None)
plot_graph(result,'celebs-all')
result2 = get_timeseries(celebs,sorted(grouped_data.groups.keys()),normalize='columns')
plot_graph(result2,'celebs-normalized-column')
result3 = get_timeseries(celebs,sorted(grouped_data.groups.keys()),normalize='rows')
plot_graph(result3,'celebs-normalized-rows')

#### Trending Players - Seattle Seahawks

In [153]:
result = get_timeseries(seahawks,sorted(grouped_data.groups.keys()),normalize=None)
plot_graph(result,'seahawks-all')
result2 = get_timeseries(seahawks,sorted(grouped_data.groups.keys()),normalize='columns')
plot_graph(result2,'seahawks-normalized-column')
result3 = get_timeseries(seahawks,sorted(grouped_data.groups.keys()),normalize='rows')
plot_graph(result3,'seahawks-normalized-rows')

#### Trending Players - New England Patriots

In [154]:
result = get_timeseries(patriots,sorted(grouped_data.groups.keys()),normalize=None)
plot_graph(result,'patriots-all')
result2 = get_timeseries(patriots,sorted(grouped_data.groups.keys()),normalize='columns')
plot_graph(result2,'patriots-normalized-column')
result3 = get_timeseries(patriots,sorted(grouped_data.groups.keys()),normalize='rows')
plot_graph(result3,'patriots-normalized-rows')