In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.svm
import sklearn.metrics as skm
from scipy.sparse import csr_matrix, hstack
import numpy as np

In [100]:
import csv
import numpy as np
from data_loader import Data_loader
from represent_tweet_level import TweetLevel

class Contextifier:
    '''
        Creates the context for tweets.

        A tweet's "context" is built from the tweets that appear earlier in the
        same user's chronologically sorted list and fall inside a look-back
        window of `context_size` days. The embeddings of those tweets are
        (optionally) decayed by age and combined into one context vector.

        Typical use:
            contextifier = Contextifier(data_loader)
            contextifier.create_user_context_tweets()   # once
            contextifier.get_context_embedding(tweet_id)
    '''
    # Magic strings to determine relationship between user and tweet
    SELF = 'SELF'
    RETWEET = 'RETWEET'
    MENTION = 'MENTION'
    RETWEET_MENTION = 'RETWEET_MENTION'

    # Sizes of the zero vectors returned when a context window is empty.
    # They are still fixed values (as before), but now live in one place.
    WORD_EMB_DIM = 300
    SPLEX_EMB_DIM = 3


    def __init__(self, data_loader, context_size=1, use_rt_user=False, 
                 use_mentions=False, use_rt_mentions=False, context_hl_ratio=1.0,
                 word_emb_file='../data/w2v_word_s300_w5_mc5_it20.bin',
                 word_emb_type='w2v',
                 word_emb_mode='avg',
                 use_word_ct=True,
                 splex_emb_file='../data/splex_standard_svd_word_s300_seeds_hc.pkl',
                 splex_emb_mode='sum',
                 use_splex_ct=True,
                 keep_stats=False
                 ):
        '''
        Create it!
        Args:
            data_loader (Data_loader): an instance of the Data_loader class, from which to obtain data.
            context_size (float): Number of days to look back
            use_rt_user (bool): User A retweets User B's tweet -- if true,
                    this tweet will be counted in User A and User B's context
            use_mentions (bool): User A tweets, mentioning User B -- if true, 
                    this tweet will be in User A and User B's context
            use_rt_mentions (bool): User A retweets User B's tweet, which mentioned User C -- if true,
                    this tweet will counted in User A and User C's history
            context_hl_ratio (float): Ratio of half life to context size. Tweet embeddings will be weighed according to
                    (self.decay_rate)^(t/x) where t is the number of days the previous tweet is 
                    from the current one, and x is context_size * context_hl_ratio. Set to 0 for no weighting/decay.
            word_emb_file (str): the path to the file to saved word embeddings
            word_emb_type (str): the type of the word embedding file, e.g. 'w2v'. See TweetLevel for more info.
            word_emb_mode (str): the mode to use when combining word embeddings at TweetLevel, e.g. 'avg'
            use_word_ct (bool): if true, word embeddings of --context-- will be used
            splex_emb_file (str): the pickle file that contains the splex embeddings.
            splex_emb_mode (str): the mode to use when combining splex scores at TweetLevel, e.g. 'sum'
            use_splex_ct (bool): if true, splex embeddings of --context-- will be used
            keep_stats (bool): if true, keep stats (e.g. what tweets are in a context window)
        '''
        # Save variables
        self.set_context_size(context_size)
        # The three flags must exist before the first setter runs, because
        # every setter calls _set_post_types(), which reads all of them.
        self.use_rt_user = None
        self.use_mentions = None
        self.use_rt_mentions = None
        self.set_use_rt_user(use_rt_user)
        self.set_use_mentions(use_mentions)
        self.set_use_rt_mentions(use_rt_mentions)
        self.set_context_hl_ratio(context_hl_ratio)

        # Load data. The loader itself is kept as well because
        # create_context_embeddings() needs its cv_data() folds.
        self.dl = data_loader
        self.all_data = data_loader.all_data()

        # Filled by create_user_context_tweets(); initialised here so that
        # the "contexts not created yet" checks work on a fresh instance.
        self.user_ct_tweets = {}
        self.id_to_location = {}

        # Tweet to context embedding cache
        self.tweet_to_ct = {}

        # Initializing tools to get tweet-level embeddings
        self.word_emb_file = None
        self.splex_emb_file = None
        self.set_embeddings(word_emb_file, word_emb_type, word_emb_mode, use_word_ct, 
                            splex_emb_file, splex_emb_mode, use_splex_ct)

        # Keeping stats
        self.set_keep_stats(keep_stats)


    def set_context_size(self, context_size):
        '''Set the look-back window (in days); clears cached contexts and stats.'''
        self.context_size = context_size
        self.reset_context_embeddings()
        self.reset_stats()

    def set_use_rt_user(self, use_rt_user):
        '''Toggle RETWEET posts in the context; clears cached contexts and stats.'''
        self.use_rt_user = use_rt_user
        self._set_post_types() # update post types

    def set_use_mentions(self, use_mentions):
        '''Toggle MENTION posts in the context; clears cached contexts and stats.'''
        self.use_mentions = use_mentions
        self._set_post_types() # update post types

    def set_use_rt_mentions(self, use_rt_mentions):
        '''Toggle RETWEET_MENTION posts in the context; clears cached contexts and stats.'''
        self.use_rt_mentions = use_rt_mentions
        self._set_post_types() # update post types


    def set_context_hl_ratio(self, context_hl_ratio):
        '''Set the half-life ratio (0 disables decay); clears cached contexts.'''
        self.context_hl_ratio = context_hl_ratio
        self.decay_rate = 0.5 # hardcoded!
        self.reset_context_embeddings()

    def set_embeddings(self, word_emb_file, word_emb_type, word_emb_mode, use_word_ct,
                       splex_emb_file, splex_emb_mode, use_splex_ct):
        '''
        Configure the word / splex embedding sources and combination modes.
        A TweetLevel object is only (re)built when the corresponding source is
        enabled for the context AND its file differs from the one already loaded.
        All embedding caches and cached contexts are cleared.
        '''
        # Cache for combined tweet embeddings
        self.tweet_emb_cache = {}

        self.use_word_ct = use_word_ct
        # Initializing tools to get tweet-level embeddings
        if self.use_word_ct and self.word_emb_file != word_emb_file: # don't reload if hasn't changed
            self.word_emb_file = word_emb_file
            self.tl_word = TweetLevel(word_level=word_emb_file, wl_file_type=word_emb_type)
        self.word_emb_mode = word_emb_mode
        # Cache for calculated tweet-level word embeddings
        self.tweet_word_cache = {}

        self.use_splex_ct = use_splex_ct
        if self.use_splex_ct and self.splex_emb_file != splex_emb_file: # don't reload if hasn't changed
            self.splex_emb_file = splex_emb_file
            self.tl_splex = TweetLevel(word_level=splex_emb_file, wl_file_type='pkl')
        self.splex_emb_mode = splex_emb_mode
        # Cache for calculated tweet-level splex embeddings
        self.tweet_splex_cache = {}

        # Hardcoding embedding size -- unsure how to change this
        self.embeddings_dim = self.WORD_EMB_DIM + self.SPLEX_EMB_DIM

        self.reset_context_embeddings()


    def set_keep_stats(self, keep_stats):
        '''Enable/disable recording which tweet ids fall in each context window.'''
        self.keep_stats = keep_stats
        if self.keep_stats:
            # Tweet id to tweet ids in context window
            self.tweet_to_ct_tweets = {}
        self.reset_context_embeddings()


    def _set_post_types(self):
        '''Rebuild self.post_types from the use_* flags; clears caches and stats.'''
        # Update post types to use in context
        self.post_types = set()
        self.post_types.add(self.SELF) # Always include self posts
        if self.use_rt_user:
            self.post_types.add(self.RETWEET)
        if self.use_rt_mentions:
            self.post_types.add(self.RETWEET_MENTION)
        if self.use_mentions:
            self.post_types.add(self.MENTION)
        self.reset_context_embeddings() # Reset cache
        self.reset_stats() # Reset stats


    def create_user_context_tweets(self):
        '''
        Sorts the tweets into self.user_ct_tweets, a map from user to a
        chronologically sorted list of (tweet, post_type) pairs. Every
        relationship (SELF, RETWEET, MENTION, RETWEET_MENTION) is stored here;
        filtering by self.use_rt_user, self.use_rt_mentions and
        self.use_mentions happens later, when a context embedding is built.
        Also fills self.id_to_location.
        '''
        # Tweets in a user's "context"
        self.user_ct_tweets = {}

        # Map from tweet id to tuple of (user, idx in sorted list)
        # Note that "user" is user_post, the user who posted the tweet
        self.id_to_location = {}

        # For every tweet in the dataset (labled and unlabeled)
        for tweet in self.all_data:
            # Always include poster
            incl_users = [(tweet['user_post'], self.SELF)]
            # Check if tweet is a retweet
            if 'user_retweet' in tweet:
                incl_users.append((tweet['user_retweet'], self.RETWEET))
                # Include users mentioned in retweet
                mention_type = self.RETWEET_MENTION
            else:
                # Include users mentioned (not retweet)
                mention_type = self.MENTION
            incl_users.extend((u, mention_type) for u in tweet['user_mentions'])

            # Add tweets to users' context tweets
            for u, post_type in incl_users:
                self.user_ct_tweets.setdefault(u, []).append((tweet, post_type))

        # Sort context tweets chronologically
        for u in self.user_ct_tweets:
            self.user_ct_tweets[u] = sorted(self.user_ct_tweets[u], key=lambda t: t[0]['created_at'])

        # Go through the tweets to save their location.
        # Only the SELF entry is recorded: a poster who mentions (or retweets)
        # themselves also has a second entry for the same tweet, and pointing
        # at that one would put the tweet inside its own context window.
        for u, tweets in self.user_ct_tweets.items():
            for idx, (tweet, post_type) in enumerate(tweets):
                if post_type == self.SELF:
                    self.id_to_location[tweet['tweet_id']] = (u, idx)


    def get_tweet_embedding(self, tweet_id):
        '''
        Get the tweet embedding for the given tweet (word embedding
        concatenated with splex embedding). Requires both self.tl_word and
        self.tl_splex to have been loaded by set_embeddings().
        Args:
            tweet_id (int): the id of the tweet, according to twitter's ID system
        Returns:
            the tweet embedding
        '''
        if tweet_id in self.tweet_emb_cache: # Check cache for embedding
            return self.tweet_emb_cache[tweet_id]
        else:
            w_emb = self.tl_word.get_representation(tweet_id, mode=self.word_emb_mode)
            sp_emb =  self.tl_splex.get_representation(tweet_id, mode=self.splex_emb_mode)
            full_emb = np.concatenate([w_emb, sp_emb])
            self.tweet_emb_cache[tweet_id] = full_emb # Save embedding to cache
            return full_emb


    def get_word_embedding(self, tweet_id):
        '''Return the (cached) tweet-level word embedding for tweet_id.'''
        if tweet_id in self.tweet_word_cache:
            return self.tweet_word_cache[tweet_id]
        else:
            res = self.tl_word.get_representation(tweet_id, mode=self.word_emb_mode)
            self.tweet_word_cache[tweet_id] = res
            return res


    def get_splex_embedding(self, tweet_id):
        '''Return the (cached) tweet-level splex embedding for tweet_id.'''
        if tweet_id in self.tweet_splex_cache:
            return self.tweet_splex_cache[tweet_id]
        else:
            res = self.tl_splex.get_representation(tweet_id, mode=self.splex_emb_mode)
            self.tweet_splex_cache[tweet_id] = res
            return res


    def combine_embeddings(self, embeddings, mode):
        '''
        Combine a list of equally sized embeddings element-wise.
        Args:
            embeddings (list): the embeddings to combine
            mode (str): 'avg', 'sum' or 'max'
        Returns:
            the combined embedding
        Raises:
            ValueError: if mode is not one of the above
        '''
        result = None
        if mode == 'avg':
            result = np.mean(np.array(embeddings), axis=0)
        elif mode == 'sum':
            result = sum(embeddings)
        elif mode == 'max':
            result = np.max(np.array(embeddings), axis=0)
        else:
            raise ValueError('Unknown combination method:', mode)
        return result


    def create_context_embedding(self, user_id, tweet_idx):
        '''
        Get the context embedding for the given tweet, determined by user and index.
        Walks backwards from tweet_idx through the user's sorted tweets and
        stops at the first tweet that is context_size days old or older.
        Only embeddings enabled via use_word_ct / use_splex_ct are computed.
        Args:
            user_id (int): the id of the user, according to data_loader's user ids
            tweet_idx (int): the index of the tweet in self.user_ct_tweets[user_id]
        Returns:
            the context embedding (also stored in self.tweet_to_ct)
        '''
        user_tweets = self.user_ct_tweets[user_id]
        target = user_tweets[tweet_idx][0]

        # Check if context embedding is in the cache
        tweet_id = target['tweet_id']
        if tweet_id in self.tweet_to_ct:
            return self.tweet_to_ct[tweet_id]

        # Return difference in days, as a float
        def days_diff(d1, d2):
            return (d1 - d2).total_seconds() / 60 / 60 / 24

        w_embs = [] # word embeddings
        splex_embs = [] # splex embeddings
        tweet_ids = [] # for stats
        context_hl = self.context_size * self.context_hl_ratio # set half life
        today = target['created_at']

        for i in range(tweet_idx - 1, -1, -1):
            ct_tweet, post_type = user_tweets[i]
            diff = days_diff(today, ct_tweet['created_at'])
            if not diff < self.context_size:
                break # list is sorted, so everything earlier is outside the window too

            # Confirm post type is one we want to include
            if post_type not in self.post_types:
                continue
            ct_id = ct_tweet['tweet_id']
            # A tweet is never part of its own context (it can appear more
            # than once in a user's list, e.g. through a self-mention).
            if ct_id == tweet_id:
                continue

            # Save tweet ids
            if self.keep_stats:
                tweet_ids.append(ct_id)

            # Weigh embedding (a half life of 0 means "no decay")
            weight = None
            if context_hl != 0:
                weight = self.decay_rate ** (diff/context_hl)

            # Only fetch what will actually be used; the other TweetLevel
            # may not even be loaded (see set_embeddings).
            if self.use_word_ct:
                w_emb = self.get_word_embedding(ct_id)
                w_embs.append(w_emb if weight is None else w_emb * weight)
            if self.use_splex_ct:
                splex_emb = self.get_splex_embedding(ct_id)
                splex_embs.append(splex_emb if weight is None else splex_emb * weight)

        # Save stats
        if self.keep_stats:
            self.tweet_to_ct_tweets[tweet_id] = tweet_ids

        # Combine the enabled embeddings; an empty window yields zeros
        to_use = []
        if self.use_word_ct:
            if len(w_embs) == 0:
                to_use.append(np.zeros(self.WORD_EMB_DIM, ))
            else:
                to_use.append(self.combine_embeddings(w_embs, self.word_emb_mode))
        if self.use_splex_ct:
            if len(splex_embs) == 0:
                to_use.append(np.zeros(self.SPLEX_EMB_DIM, ))
            else:
                to_use.append(self.combine_embeddings(splex_embs, self.splex_emb_mode))

        # Concatenate to get result
        result = np.concatenate(to_use)

        # Cache the result
        self.tweet_to_ct[tweet_id] = result
        return result

    def reset_context_embeddings(self):
        '''Drop every cached context embedding.'''
        self.tweet_to_ct = {} # Reset embeddings

    def reset_stats(self):
        '''Drop the recorded context-window tweet ids.'''
        self.tweet_to_ct_tweets = {} # Reset stats


    def create_context_embeddings(self):
        '''
        Create the context embeddings for the tweets in all five
        cross-validation folds of the data loader.
        Raises:
            ValueError: if create_user_context_tweets() has not been called
        '''
        if len(self.user_ct_tweets) == 0:
            raise ValueError('User contexts have not been created. First call .create_user_context_tweets().')
        for fold_idx in range(0, 5):
            tr, val, test = self.dl.cv_data(fold_idx)
            all_tweets = [t for l in [tr, val, test] for t in l ]
            for tweet in all_tweets: 
                self.tweet_to_ct[tweet['tweet_id']] = self.create_context_embedding(
                    *self.id_to_location[tweet['tweet_id']])


    def get_context_embedding(self, tweet_id):
        '''
        Get the context embedding for the specified tweet, determined by tweet_id.
        The cache is consulted first, so embeddings loaded with from_file()
        can be served without building the user contexts.
        Args:
            tweet_id (int): the id of the tweet, according to the twitter tweet ids
        Returns:
            (np.array): the context embedding 
        Raises:
            ValueError: if the embedding is not cached and
                    create_user_context_tweets() has not been called
        '''
        if tweet_id in self.tweet_to_ct:
            return self.tweet_to_ct[tweet_id]
        if len(self.user_ct_tweets) == 0:
            raise ValueError('User contexts have not been created. First call .create_user_context_tweets().')
        return self.create_context_embedding(*self.id_to_location[tweet_id])


    def get_context_tweets(self, tweet_id):
        '''
        Return the ids of the tweets in the context window of tweet_id.
        Only available for tweets whose context was computed while
        keep_stats was true.
        Raises:
            ValueError: if nothing has been recorded for tweet_id
        '''
        if tweet_id in self.tweet_to_ct_tweets:
            return self.tweet_to_ct_tweets[tweet_id]
        else:
            raise ValueError('no calculated tweet ids in context') # fix this


    def from_file(self, in_file):
        '''
        Reads the context embeddings in from a file.
        Args:
            in_file (str): the path to the file, in csv format, <tweet_id>, <embedding>
        Returns:
            None
        '''
        with open(in_file, newline='') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                self.tweet_to_ct[int(row['tweet_id'])] = np.fromstring(row['context_embedding'],
                                                                    dtype=float, sep=' ')


    def _default_out_file(self):
        '''Build the default csv file name from the current settings.'''
        # The combination modes of both embedding kinds identify how the
        # context was combined.
        combine = '{0}-{1}'.format(self.word_emb_mode, self.splex_emb_mode)
        return 'context_emb_{0}_{1}_rt{2}_men{3}_rtmen{4}_hlr{5}_.csv' \
                    .format(self.context_size, combine, self.use_rt_user, 
                            self.use_mentions, self.use_rt_mentions, self.context_hl_ratio)


    def write_context_embeddings(self, out_file=None):
        '''
        Writes the embeddings to a file.
        Args:
            out_file (str): the path of the file to write to; if omitted, a
                    name derived from the current settings is used
        Returns:
            None
        '''
        if not out_file:
            out_file = self._default_out_file()
        with open(out_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            writer.writerow(['tweet_id', 'context_embedding'])
            for tweet_id, ct_emb in self.tweet_to_ct.items():
                ct_emb_str = ' '.join([str(x) for x in ct_emb])
                writer.writerow([tweet_id, ct_emb_str])

In [42]:
from data_loader import Data_loader

# Settings handed to Data_loader when building the shared `dl` instance
# used by the cells below.
option, max_len, vocab_size = 'word', 53, 30000
dl = Data_loader(option=option, max_len=max_len, vocab_size=vocab_size)

Loading vocabulary ...
30000 vocab is considered.
Loading tweets ...
Processing tweets ...
Data loader initialization finishes


In [101]:
# Build the contextifier on top of the shared data loader, using the
# constructor defaults for every other setting.
contextifier = Contextifier(data_loader=dl)

# Group and sort every tweet per user; this only has to happen once,
# the setters used later do not invalidate it.
contextifier.create_user_context_tweets()
print('Done!')

Initializing TweetLevel...
Number of embeddings in ../data/w2v_word_s300_w5_mc5_it20.bin: 23417
Sample tweet_dict item: (740043438788345856, [2, 254, 440, 192, 94, 57, 72, 77])
Size of tweet_dict: 1033655
Initializing TweetLevel...
Number of embeddings in ../data/splex_standard_svd_word_s300_seeds_hc.pkl: 20000
Sample tweet_dict item: (740043438788345856, [2, 254, 440, 192, 94, 57, 72, 77])
Size of tweet_dict: 1033655
Done!


In [106]:
from sklearn.model_selection import ParameterGrid

# Full sweep, kept for reference:
# param_grid = {'context_size': [0.1, 0.25, 0.5, 2, 7, 14],
#               'use_rt_user': [True, False],
#               'use_mentions': [True, False],
#               'use_rt_mentions': [True, False],
#               'context_hl_ratio': [0, 0.1, 0.25, 0.5], # relative to size
#               'word_emb_file': ['../data/w2v_word_s300_w5_mc5_it20.bin'],
#               'word_emb_type': ['w2v'],
#               'word_emb_mode': ['avg'],
#               'use_word_ct': [False],
#               'splex_emb_file': ['../data/splex_minmax_svd_word_s300_seeds_hc.pkl'],
#               'splex_emb_mode':['sum'],
#               'use_splex_ct': [True],
#               'keep_stats': [True]
#              }

# Single configuration currently being evaluated
param_grid = {'context_size': [1000],
              'use_rt_user': [False],
              'use_mentions': [False],
              'use_rt_mentions': [False],
              'context_hl_ratio': [0], # relative to size
              'word_emb_file': ['../data/w2v_word_s300_w5_mc5_it20.bin'],
              'word_emb_type': ['w2v'],
              'word_emb_mode': ['avg'],
              'use_word_ct': [False],
              'splex_emb_file': ['../data/splex_minmax_svd_word_s300_seeds_hc.pkl'],
              'splex_emb_mode':['sum'],
              'use_splex_ct': [True],
              'keep_stats': [True]
             }

grid = ParameterGrid(param_grid)

N_FOLDS = 5  # number of cross-validation folds requested from dl.cv_data()

# Per-class weights handed to LinearSVC (same for every configuration)
class_weight = {
    'Loss' : 0.35,
    'Aggression': 0.5,
    'Other': 0.15
}


def build_features(tweets, context_sizes):
    '''
    Build the design matrix and labels for a list of tweets.
    Each row is the tweet embedding followed by its context embedding.
    Args:
        tweets (list): tweet dicts with 'tweet_id' and 'label'
        context_sizes (dict): updated in place with tweet_id -> number of
                tweets in the context window (only when stats are kept)
    Returns:
        (X, y): sparse feature matrix and list of labels
    '''
    tweet_embs, context_embs = [], []
    for tweet in tweets:
        t_id = tweet['tweet_id']
        tweet_embs.append(contextifier.get_tweet_embedding(t_id))
        context_embs.append(contextifier.get_context_embedding(t_id))
        # Window contents are only recorded when keep_stats is on
        if contextifier.keep_stats:
            context_sizes[t_id] = len(contextifier.get_context_tweets(t_id))
    X = hstack([csr_matrix(np.array(tweet_embs)), csr_matrix(np.array(context_embs))])
    y = [tweet['label'] for tweet in tweets]
    return X, y


best_f = 0
best_params = None
best_context = 0

for params in grid:
    # Apply every parameter of this configuration to the contextifier.
    contextifier.set_context_size(params['context_size'])
    contextifier.set_use_rt_user(params['use_rt_user'])
    # use_mentions is part of the grid and must be applied as well,
    # otherwise that dimension of the sweep is silently ignored.
    contextifier.set_use_mentions(params['use_mentions'])
    contextifier.set_use_rt_mentions(params['use_rt_mentions'])
    contextifier.set_context_hl_ratio(params['context_hl_ratio'])
    contextifier.set_embeddings(params['word_emb_file'], 
                                params['word_emb_type'], 
                                params['word_emb_mode'],
                                params['use_word_ct'],
                                params['splex_emb_file'], 
                                params['splex_emb_mode'],
                                params['use_splex_ct'])
    contextifier.set_keep_stats(params['keep_stats'])

    total_f = 0
    context_sizes = {}

    for fold_idx in range(N_FOLDS):
        tr, val, test = dl.cv_data(fold_idx)

        clf = sklearn.svm.LinearSVC(class_weight=class_weight) # with class weights

        # Training on both TR and VAL -- maybe a good idea?
        all_train_tweets = [t for l in [tr, val] for t in l ]

        # Train
        X_train, y_train = build_features(all_train_tweets, context_sizes)
        clf.fit(X_train, y_train)

        # Test
        X_test, y_test = build_features(test, context_sizes)
        y_predicted = clf.predict(X_test)

        # Results
        _, _, f, _ = skm.precision_recall_fscore_support(y_test, y_predicted, average='macro')
        total_f += f

    avg_f = total_f / N_FOLDS
    # No sizes are collected when stats are off; report NaN instead of failing
    avg_context = sum(context_sizes.values())/len(context_sizes) if context_sizes else float('nan')

    print('Avg F-score:', avg_f)
    print('Avg number of context tweets in window:', avg_context)
    print(params)

    if avg_f > best_f:
        best_f = avg_f
        best_params = params
        best_context = avg_context


print('BEST F:', best_f)
print('BEST CONTEXT:', best_context)
print('BEST PARAMS:', best_params)

Avg F-score: 0.49028847364223027
Avg number of context tweets in window: 2766.3819178780923
{'context_hl_ratio': 0, 'context_size': 1000, 'keep_stats': True, 'splex_emb_file': '../data/splex_minmax_svd_word_s300_seeds_hc.pkl', 'splex_emb_mode': 'sum', 'use_mentions': False, 'use_rt_mentions': False, 'use_rt_user': False, 'use_splex_ct': True, 'use_word_ct': False, 'word_emb_file': '../data/w2v_word_s300_w5_mc5_it20.bin', 'word_emb_mode': 'avg', 'word_emb_type': 'w2v'}
BEST F: 0.49028847364223027
BEST CONTEXT: 2766.3819178780923
BEST PARAMS: {'context_hl_ratio': 0, 'context_size': 1000, 'keep_stats': True, 'splex_emb_file': '../data/splex_minmax_svd_word_s300_seeds_hc.pkl', 'splex_emb_mode': 'sum', 'use_mentions': False, 'use_rt_mentions': False, 'use_rt_user': False, 'use_splex_ct': True, 'use_word_ct': False, 'word_emb_file': '../data/w2v_word_s300_w5_mc5_it20.bin', 'word_emb_mode': 'avg', 'word_emb_type': 'w2v'}


In [None]:
# Sanity check: length of the context embedding for one sample tweet
# (depends on which of use_word_ct / use_splex_ct are currently enabled).
sample_tweet_id = 832351449069846528
len(contextifier.get_context_embedding(sample_tweet_id))