In [1]:
# necessary outside things
import csv
import numpy as np

from data_loader import Data_loader
option = 'word'
max_len = 20
vocab_size = 30000
dl = Data_loader(vocab_size=vocab_size, max_len=max_len, option=option)

from represent_tweet_level import TweetLevel
tl_w2v = TweetLevel(word_level='../data/w2v_word_s300_w5_mc5_it20.bin', wl_file_type='w2v')
tl_splex = TweetLevel(word_level='../data/splex_standard_svd_word_s300_seeds_hc.pkl', wl_file_type='pkl')


Loading vocabulary ...
30000 vocab is considered.
Loading tweets ...
Processing tweets ...
Data loader initialization finishes
Initializing TweetLevel...
Number of embeddings in ../data/w2v_word_s300_w5_mc5_it20.bin: 23417
Sample tweet_dict item: (740043438788345856, [2, 254, 440, 192, 94, 57, 72, 77])
Size of tweet_dict: 1033655
Initializing TweetLevel...
Number of embeddings in ../data/splex_standard_svd_word_s300_seeds_hc.pkl: 20000
Sample tweet_dict item: (740043438788345856, [2, 254, 440, 192, 94, 57, 72, 77])
Size of tweet_dict: 1033655


In [None]:
tweet_id = 905716028390481921
w_emb = tl_w2v.get_representation(tweet_id, mode='avg')
sp_emb =  tl_splex.get_representation(tweet_id, mode='avg')
full_emb = np.concatenate([w_emb, sp_emb])
print(len(full_emb))

In [56]:
class Contextifier:
    '''
        Creates the context for tweets.
    '''
    def __init__(self, context_size=1, context_combine='avg', use_rt_user=False, 
                 use_mentions=False, use_rt_mentions=False, context_hl=1.0):
        '''
        Create it!
        Args:
            context_size (int): Number of days to look back
            context_combine (str): Method of combining tweet embeddings of tweets in context
            use_rt_user (bool): User A retweets User B's tweet -- if true,
                    this tweet will be counted in User A and User B's context
            use_mentions (bool): User A tweets, mentioning User B -- if true, 
                    this tweet will be in User A and User B's context
            use_rt_mentions (bool): User A retweets User B's tweet, which mentioned User C -- if true,
                    this tweet will counted in User A and User C's history
            context_hl (int): Half life of context, in days. Tweet embeddings will be weighed according to
                    (self.decay_rate)^(t/context_hl) where t is the number of days the previous tweet is 
                    from the current one.
        '''
        
        # need data loader eventually?
        
        self.context_size = context_size
        self.context_combine = context_combine
        self.use_rt_user = use_rt_user
        self.use_mentions = use_mentions
        self.use_rt_mentions = use_rt_mentions
        self.context_hl = context_hl
        self.decay_rate = 0.5 # hardcoded!
        
        self.user_ct_tweets = {}
        self.all_data = dl.all_data()
        
        # Map from tweet id to tuple of (user, idx in sorted list)
        # Note that "user" is user_post, the user who posted the tweet
        self.id_to_location = {}
        
        # Tweet to context embedding
        self.tweet_to_ct = {}
        
        # Cache for calculated tweet embeddings
        self.tweet_emb_cache = {}
        
        # Hardcoding dimension to 303 (300 w2v, 3 splex) -- remove later
        self.embeddings_dim = 300 + 3
    
    
    def create_user_context_tweets(self):
        ''' Describe! '''
        
        # For every tweet in the dataset (labled and unlabeled)
        for tweet in self.all_data:
            incl_users = set()
            # Always include poster
            incl_users.add(tweet['user_post'])
            # Check if tweet is a retweet
            if 'user_retweet' in tweet:
                # Include retweeted user
                if self.use_rt_user:
                    incl_users.add(tweet['user_retweet'])
                # Include users mentioned in retweet
                if use_rt_mentions:
                    incl_users.union(tweet['user_mentions'])
            # Include mentioned users (non-retweet case)
            elif use_mentions:
                incl_users.union(tweet['user_mentions'])
            
            # Add tweets to users' context tweets
            for u in incl_users:
                if u in self.user_ct_tweets:
                    self.user_ct_tweets[u].append(tweet)
                else:
                    self.user_ct_tweets[u] = [tweet]
        
        # Sort context tweets chronologically
        for u in self.user_ct_tweets:
            self.user_ct_tweets[u] = sorted(self.user_ct_tweets[u], key=lambda t: t['created_at'])
            
        # Go through the tweets to save their location
        for u, tweets in self.user_ct_tweets.items():
            for idx, t in enumerate(tweets):
                if u == t['user_post']:
                    self.id_to_location[t['tweet_id']] = (u, idx)
    
    
    def get_tweet_embedding(self, tweet_id):
        '''
        Get the tweet embedding for the given tweet.
        Args:
            tweet_id (int): the id of the tweet, according to twitter's ID system
        Returns:
            the tweet embedding
        '''
        if tweet_id in self.tweet_emb_cache: # Check cache for embedding
            return self.tweet_emb_cache[tweet_id]
        else:
            w_emb = tl_w2v.get_representation(tweet_id, mode='avg')
            sp_emb =  tl_splex.get_representation(tweet_id, mode='avg')
            full_emb = np.concatenate([w_emb, sp_emb])
            self.tweet_emb_cache[tweet_id] = full_emb # Save embedding to cache
            return full_emb
    
    
    def create_context_embedding(self, user_id, tweet_idx):
        '''
        Get the context embedding for the given tweet, determined by user and index.
        Args:
            user_id (int): the id of the user, according to data_loader's user ids
            tweet_idx (int): the index of the tweet in self.user_ct_tweets[user_id]
        '''
        # Check if context embedding is in the cache
        tweet_id = self.user_ct_tweets[user_id][tweet_idx]['tweet_id']
        if tweet_id in self.tweet_to_ct:
            return self.tweet_to_ct[tweet_id]
        
        # Return difference in days, as a float
        def days_diff(d1, d2):
            return (d1 - d2).seconds/60/60/24
        
        tweet_embs = []
        
        today = self.user_ct_tweets[user_id][tweet_idx]['created_at']
        i = tweet_idx-1
        while i >= 0 and days_diff(today, self.user_ct_tweets[user_id][i]['created_at']) \
                                     < self.context_size:
            # Get embedding -- may need to change
            emb = self.get_tweet_embedding(self.user_ct_tweets[user_id][i]['tweet_id'])
            # Weigh embedding
            diff = days_diff(today, self.user_ct_tweets[user_id][i]['created_at'])
            weight = self.decay_rate ** (diff/self.context_hl)
            emb = emb * weight
            # Save
            tweet_embs.append(emb)
            i -= 1
        
        result = None
        if len(tweet_embs) == 0:
            result = np.zeros(self.embeddings_dim, )
        else:
            if self.context_combine == 'avg':
                result = np.mean(np.array(tweet_embs), axis=0)
            elif self.context_combine == 'sum':
                result = sum(tweet_embs)
            elif self.context_combine == 'max':
                result = np.max(np.array(tweet_embs), axis=0)
            else:
                raise ValueError('Unknown settting for context_combine:', context_combine)
        
        # Cache the result
        self.tweet_to_ct[tweet_id] = result
        return result
    
    
    def create_context_embeddings(self):
        '''
        Create the context embeddings for the tweets.
        '''
        self.tweet_to_ct = {} # Reset embeddings
        
        for fold_idx in range(0, 5): # change to 5
            tr, val, test = dl.cv_data(fold_idx)
            all_tweets = [t for l in [tr, val, test] for t in l ]
            print(len(all_tweets))
            for tweet in all_tweets: 
                self.tweet_to_ct[tweet['tweet_id']] = self.create_context_embedding(
                    *self.id_to_location[tweet['tweet_id']])
            print('done with:', fold_idx)
    
    
    def get_context_embedding(self, tweet_id):
        '''
        Get the context embedding for the specified tweet, determined by tweet_id
        Args:
            tweet_id (int): the id of the tweet, according to the twitter tweet ids
        Returns:
            (np.array(int)): the context embedding 
        '''
        if len(self.tweet_to_ct) == 0:
            raise ValueError('Context embeddings have not been created yet. Call create_context_embeddings().')
        if tweet_id not in self.tweet_to_ct:
            raise ValueError('No calcualted context embedding for given tweet_id:', tweet_id)
        
        return self.tweet_to_ct[tweet_id]

    
    def from_file(self, in_file):
        '''
        Reads the context embeddings in from a file.
        Args:
            in_file (str): the path to the file, in csv format, <tweet_id>, <embedding>
        Returns:
            None
        '''
        with open(in_file, newline='') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                self.tweet_to_ct[int(row['tweet_id'])] = np.fromstring(row['context_embedding'],
                                                                    dtype=float, sep=' ')
        

    
    def write_context_embeddings(self, out_file=None):
        '''
        Writes the embeddings to a file.
        Args:
            out_file (str): the path of the file to write to
        Returns:
            None
        '''
        if not out_file:
            out_file = 'context_emb_{0}_{1}_rt{2}_men{3}_rtmen{4}_hl{5}.csv' \
                        .format(self.context_size, self.context_combine, self.use_rt_user, 
                                self.use_mentions, self.use_rt_mentions, self.context_hl)
        with open(out_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            writer.writerow(['tweet_id', 'context_embedding'])
            for tweet_id, ct_emb in self.tweet_to_ct.items():
                ct_emb_str = ' '.join([str(x) for x in ct_emb])
                writer.writerow([tweet_id, ct_emb_str])
                
                

    
    
            
            
        
            
                

In [25]:
# Tester/usage

# number of days
context_size = 5 

# method of combining tweet embeddings
context_combine = 'avg' 

# User A retweets User B's tweet -- if true: the retweet will be counted in both User A and User B's context
use_rt_user = False

# User A tweets, mentioning User B -- if true: this tweet will be in User A and User B's context
use_mentions = True

# User A retweets User B's tweet, which originally mentioned User C -- if true: counted in A, B and C's history
use_rt_mentions = False

# the data loader
data_loader = dl

# will eventually take params, but for now all we need is the data loader
contextifier = Contextifier(context_size, context_combine, use_rt_user, use_mentions, use_rt_mentions, context_hl=1.0)


In [17]:
contextifier.create_user_context_tweets()

In [18]:
contextifier.create_context_embeddings()

7842
done with: 0
7842
done with: 1
7842
done with: 2
7842
done with: 3
7842
done with: 4


In [48]:
print(len(contextifier.tweet_to_ct))

[{'created_at': datetime.datetime(2012, 9, 10, 18, 31, 39), 'user_mentions': [6083], 'user_post': 25, 'tweet_id': 245228056422252544, 'int_arr': [2, 183], 'padded_int_arr': [2, 183, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, {'created_at': datetime.datetime(2012, 10, 17, 20, 13, 41), 'user_mentions': [7831], 'user_post': 25, 'tweet_id': 258662084089368576, 'int_arr': [2, 16, 60, 10, 219, 259, 16, 142, 538], 'padded_int_arr': [2, 16, 60, 10, 219, 259, 16, 142, 538, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}]


In [None]:
contextifier.get_context_embedding(245228056422252544)

In [10]:
contextifier.write_context_embeddings()

In [58]:
contextifier2 = Contextifier()
contextifier2.from_file('context_emb_5_avg_rtFalse_menTrue_rtmenFalse_hl1.0.csv')

In [59]:
contextifier2.get_context_embedding(832351449069846528)

array([-2.29765159e-01, -7.57820561e-02, -2.33109747e-02, -7.06422932e-02,
       -2.27849730e-02, -1.07100463e-01, -1.65977761e-02,  1.44108103e-01,
       -3.63823417e-02,  5.68953571e-02,  1.14796710e-01,  1.33025157e-01,
        2.92387255e-02, -3.38103699e-02, -7.10422493e-02, -1.16686263e-01,
       -4.57345906e-02, -1.24062378e-01,  3.16525381e-02,  5.46709085e-02,
        2.09999887e-01,  4.17215812e-02,  2.01643577e-01, -5.36820738e-03,
        9.94055704e-02,  4.52494678e-02,  1.11036305e-01,  1.02455396e-01,
        3.17793973e-02,  1.68900885e-01, -2.67782665e-01, -6.24724609e-02,
       -1.28560938e-02, -1.03814518e-01,  1.24277295e-01, -9.12616807e-03,
        8.69174185e-02,  7.29920363e-02, -5.31217368e-02, -1.70796777e-01,
       -1.01166716e-01, -9.64308399e-02, -7.73154602e-02,  3.18801045e-02,
        1.39264479e-02, -7.16256419e-02, -4.30586126e-02,  3.28289078e-02,
        1.23525254e-01,  2.21787181e-02, -4.53831046e-02, -3.42465475e-02,
        2.02838960e-02,  