In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.svm
import sklearn.metrics as skm
from scipy.sparse import csr_matrix, hstack
import numpy as np

In [2]:
from data_loader import Data_loader
# Data-loading configuration. The three settings are kept as module-level
# names so they remain available to other cells.
#   option     -- passed straight to Data_loader (presumably selects
#                 word-level tokenisation; confirm in data_loader.py)
#   max_len    -- passed straight to Data_loader (presumably the maximum
#                 tweet length in tokens; confirm in data_loader.py)
#   vocab_size -- number of vocabulary entries the loader considers
option, max_len, vocab_size = 'word', 53, 30000

# Shared loader instance used by the cells below (dl.all_data, dl.cv_data).
dl = Data_loader(option=option, max_len=max_len, vocab_size=vocab_size)

Data loader ...
Loading vocabulary ...
30000 vocab is considered.
Loading tweets ...
Processing tweets ...
Data loader initialization finishes


In [8]:
from represent_context import Contextifier
from represent_tweet_level import TweetLevel
# Tweet-level representations, one per embedding file. The SPLex one is
# also handed to the Contextifier below; both are queried directly when
# features are built later in the notebook.
tl_splex = TweetLevel('../data/splex_minmax_svd_word_s300_seeds_hc.pkl')
tl_word = TweetLevel('../data/w2v_word_s300_w5_mc5_it20.bin')

# Kinds of posts that may contribute to a tweet's context.
post_types = [
    Contextifier.SELF,
    Contextifier.RETWEET,
    Contextifier.MENTION,
    Contextifier.RETWEET_MENTION,
]

# Context window settings. Units/semantics are defined by Contextifier
# (context_size is presumably in days and context_hl_ratio a half-life
# expressed relative to that size -- TODO confirm in represent_context.py).
context_size, context_hl_ratio = 2, 0.5

# How context tweets are combined ('avg') and how each tweet's own
# representation is combined ('sum').
context_combine, tl_combine = 'avg', 'sum'

# The contextifier is fed the SPLex representation, so the context
# embeddings it produces are SPLex-based.
contextifier = Contextifier(
    tl_splex,
    post_types,
    context_size,
    context_hl_ratio,
    context_combine,
    tl_combine,
)

# Assemble the context from every tweet the loader knows about and
# register it with the contextifier.
context = contextifier.assemble_context(dl.all_data())
contextifier.set_context(*context)

# Smoke test: fetch the context embedding of a single, known tweet id.
emb = contextifier.get_context_embedding(832351449069846528)
print('Done!')

Initializing TweetLevel...
Number of word vectors in ../data/splex_minmax_svd_word_s300_seeds_hc.pkl: 20000
Built tweet_dict. Sample tweet_dict item: (740043438788345856, [2, 254, 440, 192, 94, 57, 72, 77])
Size of tweet_dict: 1033655
Initializing TweetLevel...
Number of word vectors in ../data/w2v_word_s300_w5_mc5_it20.bin: 23417
Built tweet_dict. Sample tweet_dict item: (740043438788345856, [2, 254, 440, 192, 94, 57, 72, 77])
Size of tweet_dict: 1033655
Done!


In [23]:
from sklearn.model_selection import ParameterGrid

# Legacy search space, kept for reference only. It is NOT active, and its
# keys differ from those of the active `param_grid` defined right below.
#
# param_grid = {'context_size': [0.1, 0.25, 0.5, 2, 7, 14],
#               'use_rt_user': [True, False],
#               'use_mentions': [True, False],
#               'use_rt_mentions': [True, False],
#               'context_hl_ratio': [0, 0.1, 0.25, 0.5], # relative to size
#               'word_emb_file': ['../data/w2v_word_s300_w5_mc5_it20.bin'],
#               'word_emb_type': ['w2v'],
#               'word_emb_mode': ['avg'],
#               'use_word_ct': [False],
#               'splex_emb_file': ['../data/splex_minmax_svd_word_s300_seeds_hc.pkl'],
#               'splex_emb_mode':['sum'],
#               'use_splex_ct': [True],
#               'keep_stats': [True]
#              }

# Active search space: every key maps to the list of values to try.
# With one value per key the grid contains exactly one combination.
param_grid = dict(
    context_size=[2],
    post_types=[[Contextifier.SELF]],
    context_hl_ratio=[1],
    context_combine=['sum'],
    tl_combine=['sum'],
)

# Iterable over all combinations of the values above.
grid = ParameterGrid(param_grid)

# Running record of the best combination seen so far: its average F-score,
# the parameter dict itself, and its average context size.
best_f, best_params, best_context = 0, None, 0

# ---------------------------------------------------------------------------
# Grid search. For every parameter combination: reconfigure the shared
# contextifier, run N_FOLDS-fold cross-validation with a class-weighted
# LinearSVC, and remember the combination with the best average macro F-score.
# ---------------------------------------------------------------------------

# Number of folds requested from dl.cv_data; also the divisor of the average.
N_FOLDS = 5

# When False, the feature matrix holds only the word and SPLex tweet-level
# embeddings; context embeddings are still computed so that context sizes can
# be reported. Set to True to append the context embedding as a third block.
# NOTE(review): while this is False the contextifier settings being searched
# cannot influence the F-score, only the reported context size.
USE_CONTEXT_FEATURES = False

# Per-class weights handed to LinearSVC; keys are the label values found in
# t['label']. Pass class_weight=None to LinearSVC for an unweighted baseline.
class_weight = {
    'Loss': 0.35,
    'Aggression': 0.5,
    'Other': 0.15,
}


def _build_features(tweets, context_sizes, use_context=False):
    """Turn a list of tweet dicts into a sparse feature matrix and labels.

    Args:
        tweets: iterable of dicts carrying at least 'tweet_id' and 'label'.
        context_sizes: dict updated in place with
            tweet_id -> number of context tweets found for that tweet.
        use_context: if True, append the context embedding to the features.

    Returns:
        (X, y): X is the horizontally stacked sparse matrix of the averaged
        word embedding and the summed SPLex embedding (plus the context
        embedding when requested); y is the list of labels in the same order.
    """
    word_embs, splex_embs, context_embs = [], [], []
    labels = []
    for tweet in tweets:
        t_id = tweet['tweet_id']
        labels.append(tweet['label'])
        word_embs.append(tl_word.get_representation(t_id, 'avg'))
        splex_embs.append(tl_splex.get_representation(t_id, 'sum'))
        # keep_stats=True makes the contextifier also return the context
        # tweets it used, which is what the context-size statistic counts.
        emb, ct_tweets = contextifier.get_context_embedding(t_id, keep_stats=True)
        context_embs.append(emb)
        context_sizes[t_id] = len(ct_tweets)

    blocks = [csr_matrix(np.array(word_embs)), csr_matrix(np.array(splex_embs))]
    if use_context:
        blocks.append(csr_matrix(np.array(context_embs)))
    return hstack(blocks), labels


for params in grid:
    # Apply this combination to the shared contextifier.
    contextifier.set_context_size(params['context_size'])
    contextifier.set_post_types(params['post_types'])
    contextifier.set_context_hl_ratio(params['context_hl_ratio'])
    contextifier.set_context_combine(params['context_combine'])
    contextifier.set_tl_combine(params['tl_combine'])

    total_f = 0
    context_sizes = {}  # tweet_id -> number of context tweets, across folds

    for fold_idx in range(N_FOLDS):
        tr, val, test = dl.cv_data(fold_idx)

        clf = sklearn.svm.LinearSVC(class_weight=class_weight)

        # Train on TR and VAL together.
        # NOTE(review): the best parameters are then picked by F-score on the
        # test folds, so the reported best score is optimistically biased --
        # consider selecting on VAL instead.
        all_train_tweets = [*tr, *val]
        X_train, y_train = _build_features(
            all_train_tweets, context_sizes, USE_CONTEXT_FEATURES)
        clf.fit(X_train, y_train)

        # Evaluate on the held-out test split of this fold.
        X_test, y_test = _build_features(
            test, context_sizes, USE_CONTEXT_FEATURES)
        y_predicted = clf.predict(X_test)

        # Only the macro-averaged F-score is accumulated.
        _, _, f, _ = skm.precision_recall_fscore_support(
            y_test, y_predicted, average='macro')
        total_f += f

    avg_f = total_f / N_FOLDS
    # Guard against an empty dict (no tweets at all in any fold).
    avg_context = (sum(context_sizes.values()) / len(context_sizes)
                   if context_sizes else 0)

    print('Avg F-score:', avg_f)
    print('Avg number of context tweets in window:', avg_context)
    print(params)

    if avg_f > best_f:
        best_f = avg_f
        best_params = params
        best_context = avg_context


print('BEST F:', best_f)
print('BEST CONTEXT:', best_context)
print('BEST PARAMS:', best_params)

TypeError: __init__() got an unexpected keyword argument 'vocab_size'

In [None]:
# Sanity check: length of the context embedding returned for one known
# tweet id (called without keep_stats, so only the embedding comes back).
sample_context_emb = contextifier.get_context_embedding(832351449069846528)
len(sample_context_emb)