In [3]:
# import the data loader
from data_loader import Data_loader

In [4]:
# initialization
# word level tokenization
option = 'word'
max_len = 20
vocab_size = 30000
dl = Data_loader(vocab_size=vocab_size, max_len=max_len, option=option)

Loading vocabulary ...
30000 vocab is considered.
Loading user information finished
Loading tweets ...
Processing tweets ...
Data loader initialization finishes


In [4]:
from preprocess import extract_mentioned_user_name
print(extract_mentioned_user_name('RT @TyquanAssassin: im waiting @otheruser @help @noway !'))

{'help', 'otheruser', 'noway'}


In [1]:
from preprocess import extract_user_rt
print(extract_user_rt('RT @TyquanAssassin: im waiting @otheruser !'))

tyquanassassi


In [5]:
print(dl.all_data()[0])

{'user_id': 2955996447, 'user_post': 27, 'tweet_id': 740043438788345856, 'user_mentions': [3613], 'created_at': datetime.datetime(2016, 6, 7, 4, 51, 19), 'int_arr': [2, 254, 440, 192, 94, 57, 72, 77], 'padded_int_arr': [2, 254, 440, 192, 94, 57, 72, 77, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [15]:
user_to_tweets = {}
for tweet in dl.all_data():
    if tweet['user_post'] in user_to_tweets:
        user_to_tweets[tweet['user_post']].append(tweet)
    else:
        user_to_tweets[tweet['user_post']] = [tweet]

In [20]:
print(len(user_to_tweets[27]))

8712


In [21]:
print(dl.id2user_name(27))

freekerryhoe


In [22]:
for u in user_to_tweets:
    user_to_tweets[u] = sorted(user_to_tweets[u], key=lambda t: t['created_at'])

In [5]:
tweets = dl.tweets_by_user(27)

In [6]:
print(len(tweets))

8712


In [31]:
# necessary outside things
import csv
import numpy as np

from data_loader import Data_loader
option = 'word'
max_len = 20
vocab_size = 30000
dl = Data_loader(vocab_size=vocab_size, max_len=max_len, option=option)

#wv = KeyedVectors.load_word2vec_format(fname, binary=True)


class Contextifier:
    '''
        Creates the context for tweets.
    '''
    def __init__(self, context_size, context_combine, use_rt_user, use_mentions, use_rt_mentions, context_hl=1.0):
        '''
        Create it!
        Args:
            context_size (int): Number of days to look back
            context_combine (str): Method of combining tweet embeddings of tweets in context
            use_rt_user (bool): User A retweets User B's tweet -- if true,
                    this tweet will be counted in User A and User B's context
            use_mentions (bool): User A tweets, mentioning User B -- if true, 
                    this tweet will be in User A and User B's context
            use_rt_mentions (bool): User A retweets User B's tweet, which mentioned User C -- if true,
                    this tweet will counted in User A and User C's history
            context_hl (int): Half life of context, in days. Tweet embeddings will be weighed according to
                    (self.decay_rate)^(t/context_hl) where t is the number of days the previous tweet is 
                    from the current one.
        '''
        
        # need data loader eventually?
        
        self.context_size = context_size
        self.context_combine = context_combine
        self.use_rt_user = use_rt_user
        self.use_mentions = use_mentions
        self.use_rt_mentions = use_rt_mentions
        self.context_hl = context_hl
        self.decay_rate = 0.5 # hardcoded!
        
        self.user_ct_tweets = {}
        self.all_data = dl.all_data()
        
        # Tweet to context embedding
        self.tweet_to_ct = {}
        
        # Hardcoding dimension to 300 -- remove later
        self.embeddings_dim = 300
    
    
    def create_user_context_tweets(self):
        ''' Describe! '''
        
        # For every tweet in the dataset (labled and unlabeled)
        for tweet in self.all_data:
            incl_users = set()
            # Always include poster
            incl_users.add(tweet['user_post'])
            # Check if tweet is a retweet
            if 'user_retweet' in tweet:
                # Include retweeted user
                if self.use_rt_user:
                    incl_users.add(tweet['user_retweet'])
                # Include users mentioned in retweet
                if use_rt_mentions:
                    incl_users.union(tweet['user_mentions'])
            # Include mentioned users (non-retweet case)
            elif use_mentions:
                incl_users.union(tweet['user_mentions'])
            
            # Add to users' context tweets
            for u in incl_users:
                if u in self.user_ct_tweets:
                    self.user_ct_tweets[u].append(tweet)
                else:
                    self.user_ct_tweets[u] = [tweet]
        
        # Sort context tweets chronologically
        for u in self.user_ct_tweets:
            self.user_ct_tweets[u] = sorted(self.user_ct_tweets[u], key=lambda t: t['created_at'])
    
    
    def get_tweet_embedding(self, tweet_id):
        '''
        Get the tweet embedding for the given tweet.
        Args:
            tweet_id (int): the id of the tweet, according to twitter's ID system
        Returns:
            the tweet embedding
        '''
        return np.zeros(self.embeddings_dim, )
    
    
    def create_context_embedding(self, user_id, tweet_idx):
        '''
        Get the context embedding for the given tweet, determined by user and index.
        Args:
            user_id (int): the id of the user, according to data_loader's user ids
            tweet_idx (int): the index of the tweet in self.user_ct_tweets[user_id]
        '''
        # Return difference in days, as a float
        def days_diff(d1, d2):
            return (d1 - d2).seconds/60/60/24
        
        tweet_embs = []
        
        today = self.user_ct_tweets[user_id][tweet_idx]['created_at']
        i = tweet_idx-1
        while i >= 0 and days_diff(today, self.user_ct_tweets[user_id][i]['created_at']) \
                                     < self.context_size:
            # Get embedding -- may need to change
            emb = self.get_tweet_embedding(self.user_ct_tweets[user_id][i]['tweet_id'])
            # Weigh embedding
            diff = days_diff(today, self.user_ct_tweets[user_id][i]['created_at'])
            weight = self.decay_rate ** (diff/self.context_hl)
            emb = emb * weight
            # Save
            tweet_embs.append(emb)
            i -= 1
        
        result = None
        if len(tweet_embs) == 0:
            result = np.zeros(self.embeddings_dim, )
        else:
            if self.context_combine == 'avg':
                result = np.mean(np.array(tweet_embs), axis=0)
            elif self.context_combine == 'sum':
                result = sum(tweet_embs)
            elif self.context_combine == 'max':
                result = np.max(np.array(tweet_embs), axis=0)
            else:
                raise ValueError('Unknown settting for context_combine:', context_combine)
        return result
    
    
    def create_context_embeddings(self):
        '''
        Create the context embeddings for the tweets.
        '''
        self.tweet_to_ct = {} # Reset embeddings
        # For now, go through all tweets and only pick out the labled ones
        for u in self.user_ct_tweets:
            for idx, t in enumerate(self.user_ct_tweets[u]):
                if 'label' in t:
                    # Save the context embedding
                    self.tweet_to_ct[t['tweet_id']] = self.create_context_embedding(t['user_post'], idx)
            print(u, 'done')
    
    
    def get_context_embedding(self, tweet_id):
        '''
        Get the context embedding for the specified tweet, determined by tweet_id
        Args:
            tweet_id (int): the id of the tweet, according to the twitter tweet ids
        Returns:
            (np.array(int)): the context embedding 
        '''
        if len(self.tweet_to_ct) == 0:
            raise ValueError('Context embeddings have not been created yet. Call create_context_embeddings().')
        if tweet_id not in self.tweet_to_ct:
            raise valueError('No calcualted context embedding for given tweet_id:', tweet_id)
        
        return self.tweet_to_ct[tweet_id]

    
    def write_context_embeddings(self, out_file=None):
        '''
        Writes the embeddings to a file.
        Args:
            out_file (str): the path of the file to write to
        Returns:
            None
        '''
        if not out_file:
            out_file = 'context_emb_{0}_{1}_rt{2}_men{3}_rtmen{4}_hl{5}.csv' \
                        .format(self.context_size, self.context_combine, self.use_rt_user, 
                                self.use_mentions, self.use_rt_mentions, self.context_hl)
        with open(out_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            writer.writerow(['tweet_id', 'context_embedding'])
            for tweet_id, ct_emb in self.tweet_to_ct.items():
                writer.writerow([tweet_id, ct_emb])
                
                

    
    
            
            
        
            
                

Loading vocabulary ...
30000 vocab is considered.
Loading user information finished
Loading tweets ...
Processing tweets ...
Data loader initialization finishes


In [32]:
# Tester/usage

# number of days
context_size = 5 

# method of combining tweet embeddings
context_combine = 'avg' 

# User A retweets User B's tweet -- if true: the retweet will be counted in both User A and User B's context
use_rt_user = False

# User A tweets, mentioning User B -- if true: this tweet will be in User A and User B's context
use_mentions = True

# User A retweets User B's tweet, which originally mentioned User C -- if true: counted in A, B and C's history
use_rt_mentions = False

# the data loader
data_loader = dl

# will eventually take params, but for now all we need is the data loader
contextifier = Contextifier(context_size, context_combine, use_rt_user, use_mentions, use_rt_mentions, context_hl=1.0)


In [33]:
contextifier.create_user_context_tweets()

In [34]:
contextifier.create_context_embeddings()

27 done
59 done
245 done
15 done
93 done
42 done
158 done
2984 done
125 done
41 done
167 done
243 done
157 done
75 done
50 done
45 done
214 done
378 done
3001 done
100 done
194 done
2995 done
111 done
196 done
12 done
188 done
252 done
218 done
273 done
200 done
148 done
162 done
49 done
5 done
124 done
262 done
549 done
3013 done
204 done
147 done
23 done
295 done
71 done
265 done
77 done
61 done
189 done
420 done
2999 done
90 done
253 done
2989 done
215 done
83 done
206 done
244 done
36 done
184 done
81 done
170 done
213 done
104 done
137 done
8 done
187 done
57 done
4 done
17 done
112 done
727 done
208 done
220 done
209 done
55 done
1147 done
379 done
171 done
146 done
64 done
2985 done
203 done
357 done
72 done
47 done
117 done
3003 done
195 done
150 done
20 done
31 done
25 done
145 done
149 done
30 done
116 done
52 done
179 done
177 done
2998 done
256 done
3012 done
216 done
231 done
2996 done
726 done
21 done
2988 done
85 done
207 done
236 done
19 done
96 done
68 done
190 done
3 

In [35]:
print(len(contextifier.user_ct_tweets))

480


In [36]:
contextifier.get_context_embedding(631443130563330048)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [37]:
contextifier.write_context_embeddings()