In [63]:
import json
import os
import pickle
import re
import string
import itertools
import datetime
import time
import pytz
from collections import Counter
from tqdm import tqdm
from nltk import bigrams as nltk_bigrams
from nltk.stem.snowball import SnowballStemmer
from __future__ import print_function
from sklearn.feature_extraction import text
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

plt.style.use('dark_background')

In [163]:
def read_data(filename,num_tweets):
    data = []
    tweet_time = []
    with open(filename,'r') as fp:
        for index,line in tqdm(enumerate(fp),total=num_tweets):
            entry = json.loads(line)
            data.append(entry['title'])
            tweet_time.append(datetime.datetime.fromtimestamp(entry['firstpost_date']))
    return data,tweet_time

def preprocessing(tweets):
    snowball_stemmer = SnowballStemmer("english")
    processed_tweets = []
    for tweet in tweets:
        tweet = re.sub('[-.,></?;:(){}!$%^&*_=~`]', ' ', tweet)
        tweet = ''.join(ch for ch in tweet if ch not in string.punctuation)
        tweet = ''.join(ch for ch in tweet if ord(ch) < 128)  # remove non-ascii characters
        words = [word for word in tweet.lower().split() \
                           if word not in text.ENGLISH_STOP_WORDS]
        processed_tweets.append(' '.join(tweet))
    return processed_tweets

def get_data(precomputed=False):
    csv_filename = os.path.join('data','train','part3','tweets.csv')
    if(not precomputed):
        if not os.path.exists(os.path.join('data','train','part3')):
            os.makedirs(os.path.join('data','train','part3'))
        filenames = {
            #'tweets_#gohawks' : 188136,
            #'tweets_#nfl' : 259024,
            #'tweets_#sb49' : 826951,
            #'tweets_#gopatriots' : 26232,
            #'tweets_#patriots' : 489713,
            'tweets_#superbowl' : 1348767
        }
        tweet=[]
        tweet_time=[]
        for key,num in filenames.items():
            print('Loading',key,'....')
            sub_tweets, sub_time = read_data(os.path.join('data','train',key+'.txt'),num)
            tweet += sub_tweets
            tweet_time += sub_time
        dataset = pd.DataFrame({'tweet_time':tweet_time,'tweet':tweet})
        dataset.to_csv(csv_filename,encoding='utf-8',index=False)
    else:
        dataset = pd.read_csv(csv_filename,encoding='utf-8',engine='python')
        dataset['tweet_time'].replace(u'', np.nan, inplace=True)
        dataset.dropna(subset=['tweet_time'], inplace=True)
        dataset['tweet_time'] = dataset['tweet_time'].map(lambda x: datetime.datetime.strptime(str(x),"%Y-%m-%d %H:%M:%S"))
    return dataset

In [164]:
df = get_data(precomputed=True)
print('Number of tweets =',len(df))

Loading tweets_#superbowl ....


100%|███████████████████████████████████████████████| 1348767/1348767 [10:39<00:00, 2110.53it/s]


Number of tweets = 1348767


In [165]:
int_period_start = datetime.datetime(2015,2,1,14,0,0)
int_period_end = datetime.datetime(2015,2,1,20,0,0)
df = df[df.tweet_time.apply(lambda x : x > int_period_start)]
df = df[df.tweet_time.apply(lambda x : x < int_period_end)]

In [166]:
grouped_data = df.set_index('tweet_time').groupby(pd.TimeGrouper(freq='10Min'))

In [167]:
def preprocess_data(group_tweets):
    all_hashtags = Counter()
    bigrams_counter = Counter()
    all_words = Counter() # words without #hashtags and @mentions
    hash_tag_str = re.compile(r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)")
    for tweet in group_tweets:
        hashtags = map(lambda x : str(x.lower()),hash_tag_str.findall(tweet))
        all_hashtags.update(hashtags)
        tweet = re.sub('[,.-:/()?{}*$&]', ' ', tweet)
        tweet = ''.join(ch for ch in tweet if ch not in string.punctuation)
        tweet = ''.join(ch for ch in tweet if ord(ch) < 128)  # remove non-ascii characters
        words = [str(word) for word in tweet.lower().split() \
                           if word not in text.ENGLISH_STOP_WORDS]

        regular_words = [w for w in words if not w.startswith(('#', '@'))]
        all_words.update(regular_words)
        bigrams_counter.update(nltk_bigrams(regular_words))

    return all_hashtags, all_words, bigrams_counter

#### Trending Ads

In [186]:
class Advertisement():
    def __init__(self,company,taglines,bigram_name=''):
        self.company = company
        self.name = re.sub('[- \']','',company).lower()
        self.taglines = self.filterTaglines(taglines)
        self.bigram_name = bigram_name.lower()
        self.count=0
        self.timeframe_count = []
        
    def filterTaglines(self,taglines):
        filtered_tag = []
        for tag in taglines:
            tag = re.sub("[-.,\'></?;:#(){}!$%^&*_=~ `]", '', tag)
            tag = ''.join(ch for ch in tag if ch not in string.punctuation)
            tag = '#'+tag
            filtered_tag.append(tag.lower())
        return filtered_tag
    
    def addTagLine(self,tagline):
        self.taglines.append(tagline)
        
    def updateCount(self,value):
        self.count+=value
    
    def updateTimeFrameCount(self):
        self.timeframe_count.append(self.count)
        self.count = 0

def create_adclasses():
    tmobile = Advertisement('T-Mobile',['One Upped','#Kim\'s Data Stash'])
    budweiser = Advertisement('Budweiser',['Lost Dog','Brewed the Hard Way'])
    bmw = Advertisement('BMW',['Newfangled Idea'])
    cocacola = Advertisement('Coca Cola',['Make It Happy'],'Coca Cola')
    doritos = Advertisement('Doritos',['When Pigs Fly','Middle Seat'])
    esurance = Advertisement('Esurance',['Sorta Pharmacist'])
    loctite = Advertisement('Loctite',['Positive Feelings'])
    mcd = Advertisement('McDonald\'s',['Pay With Lovin'],'McDonald s')
    snickers = Advertisement('Snickers',['Very Brady'])
    toyota = Advertisement('Toyota',['How Great I Am','My Great Dad'])
    victoria = Advertisement('Victoria\'s Secret',['Let the Real Games Begin'],'Victoria s Secret')
    return [\
            tmobile,budweiser,\
            bmw,cocacola,\
            doritos,esurance,\
            loctite,mcd,\
            snickers,supercell,\
            toyota,victoria\
           ]

def get_ad_information(companies,hashtags,keywords,bigram_words):
    total_ad_count = 0
    for ad in companies:
        for count, word in enumerate(keywords.keys()):
            if word == ad.name:
                count = keywords.get(word)
                ad.updateCount(count)
                total_ad_count += count

        for count, htag in enumerate(hashtags.keys()):
            if ad.name in htag or htag == ad.taglines:
                count = hashtags.get(htag)
                ad.updateCount(count)
                total_ad_count += count

        for count, bg_pair in enumerate(bigram_words.keys()):
            bg_word = ' '.join(x for x in bg_pair)
            if bg_word in ad.bigram_name:
                count = bigram_words.get(bg_pair)
                ad.updateCount(count)
                total_ad_count += count
        ad.updateTimeFrameCount()
    return total_ad_count

In [182]:
companies = create_adclasses()
count=0
for i, group in grouped_data:
    #print(group.tweet)
    hashtags,keywords,bigram_words = preprocess_data(group.tweet)
    #break
    ads_count = get_ad_information(companies,hashtags,keywords,bigram_words)
    count+=1
    print(ads_count)
    if(count==10):
        break
    #celeb_df = get_celebrities(hashtags,keywords)

15
24
11
28
29
43
87
87
259
804


In [185]:
for ad in companies:
    print(ad.bigram_name,ad.timeframe_count)

 [0, 2, 0, 0, 1, 0, 0, 0, 11, 7]
 [3, 7, 5, 10, 13, 14, 38, 30, 29, 22]
 [2, 2, 0, 0, 7, 6, 8, 5, 6, 0]
Coca Cola [0, 0, 0, 4, 0, 1, 3, 11, 7, 8]
 [4, 8, 4, 8, 8, 13, 15, 21, 29, 35]
 [0, 0, 0, 0, 0, 1, 1, 0, 17, 401]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
McDonald s [4, 0, 0, 2, 0, 2, 7, 10, 148, 24]
 [1, 0, 0, 4, 0, 6, 6, 1, 7, 8]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
 [0, 1, 0, 0, 0, 0, 3, 7, 3, 293]
Victoria s Secret [1, 4, 2, 0, 0, 0, 6, 2, 2, 6]


#### Trending Players

#### Trending Celebrities

In [62]:
celeb_first = ["John", "Idina", "Katy", "Lenny", "Missy", "Nina", "Josh"]
celeb_last = ["Legend", "Menzel", "Perry", "Kravitz", "Elliott", "Dobrev", "Duhamel"]

def get_celebrities(hash_tags, key_words):
    local_celeb_count = 0
    celeb_count = np.zeros(len(celeb_first))

    for count, tweet in enumerate(key_words.keys()):
        for i in range(len(celeb_count)):
            if tweet.find(celeb_first[i].lower()) > -1 or tweet.find(celeb_last[i].lower()) > -1:
                celeb_count[i] += key_words.get(tweet)
                local_celeb_count += key_words.get(tweet)

    for count, tweet in enumerate(hash_tags.keys()):
        for i in range(len(celeb_count)):
            if tweet.find(celeb_first[i].lower()) > -1 or tweet.find(celeb_last[i].lower()) > -1:
                celeb_count[i] += hash_tags.get(tweet)
                local_celeb_count += hash_tags.get(tweet)

    hour_status.append(celeb_count)
    return local_celeb_count