In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.svm
import sklearn.metrics as skm
from scipy.sparse import csr_matrix, hstack
import numpy as np

In [4]:
from data_loader import Data_loader

# Settings forwarded verbatim to Data_loader; kept as module-level names so
# later cells can refer to them.
vocab_size = 30000
max_len = 20
option = 'word'

# Shared loader instance used by the cells below (cv_data, item lookup).
dl = Data_loader(option=option, max_len=max_len, vocab_size=vocab_size)

Loading vocabulary ...
30000 vocab is considered.
Loading tweets ...
Processing tweets ...
Data loader initialization finishes


In [1]:
# Set up the Contextifier
from represent_context import Contextifier

# Context settings (exact units/semantics are defined by Contextifier -- TODO confirm).
context_size = 14
context_combine = 'avg'
context_hl = 0

# Flags selecting which related tweets feed the context (see Contextifier).
use_rt_user = True
use_mentions = True
use_rt_mentions = True

# Word-embedding source.
word_emb_file = '../data/w2v_word_s300_w5_mc5_it20.bin'
word_emb_type = 'w2v'
word_emb_mode = 'avg'

# SPLex embedding source.
splex_emb_file = '../data/splex_standard_svd_word_s300_seeds_hc.pkl'
splex_emb_mode = 'sum'

keep_stats = True

# Arguments are positional; the order below must match Contextifier's signature.
contextifier = Contextifier(
    context_size,
    context_combine,
    use_rt_user,
    use_mentions,
    use_rt_mentions,
    context_hl,
    word_emb_file,
    word_emb_type,
    word_emb_mode,
    splex_emb_file,
    splex_emb_mode,
    keep_stats,
)

# Create the contexts
contextifier.create_user_context_tweets()

# Alternatively, read from a file.
# contextifier.from_file('context_emb_2_avg_rtTrue_menTrue_rtmenTrue_hl2_.csv')

Loading vocabulary ...
30000 vocab is considered.
Loading tweets ...
Processing tweets ...
Data loader initialization finishes
Initializing TweetLevel...
Number of embeddings in ../data/w2v_word_s300_w5_mc5_it20.bin: 23417
Sample tweet_dict item: (740043438788345856, [2, 254, 440, 192, 94, 57, 72, 77])
Size of tweet_dict: 1033655
Initializing TweetLevel...
Number of embeddings in ../data/splex_standard_svd_word_s300_seeds_hc.pkl: 20000
Sample tweet_dict item: (740043438788345856, [2, 254, 440, 192, 94, 57, 72, 77])
Size of tweet_dict: 1033655


In [None]:
# Cross-validation of a linear SVM over unigram token-id counts concatenated
# with tweet embeddings and context embeddings. Prints the macro F-score of
# each fold, then the mean F-score and the mean context size.
N_FOLDS = 5  # fold indices 0..N_FOLDS-1 are requested from dl.cv_data

total_f = 0
context_sizes = {}  # tweet_id -> number of context tweets reported by the contextifier

class_weight = {
    'Loss' : 0.35,
    'Aggression': 0.5,
    'Other': 0.15
}


def tweets_to_texts(tweets):
    """Render each tweet's token ids as one space-separated string.

    Every element of ``t['int_arr']`` is stringified individually so that a
    whitespace tokenizer yields exactly one token per id. Joining
    ``str(t['int_arr'])`` instead would iterate over the characters of the
    sequence's repr and produce digit/bracket/comma tokens.

    Args:
        tweets: iterable of dict-like tweets exposing an ``'int_arr'`` sequence.

    Returns:
        list[str]: one string per tweet, in input order.
    """
    return [' '.join(str(tok) for tok in t['int_arr']) for t in tweets]


def embedding_features(tweet_ids):
    """Fetch tweet and context embeddings for ``tweet_ids`` from ``contextifier``.

    Side effect: stores ``len(contextifier.get_context_tweets(t_id))`` for each
    id in the module-level ``context_sizes`` dict.

    Args:
        tweet_ids: iterable of tweet ids known to ``contextifier``.

    Returns:
        tuple[list, list]: ``(tweet_embs, context_embs)``, aligned with ``tweet_ids``.
    """
    tweet_embs, context_embs = [], []
    for t_id in tweet_ids:
        # Same call order as before: tweet embedding, context embedding, context tweets.
        tweet_embs.append(contextifier.get_tweet_embedding(t_id))
        context_embs.append(contextifier.get_context_embedding(t_id))
        context_sizes[t_id] = len(contextifier.get_context_tweets(t_id))  # context size
    return tweet_embs, context_embs


for fold_idx in range(N_FOLDS):
    print('Fold:', fold_idx)
    tr, val, test = dl.cv_data(fold_idx)

    # Set up: a fresh classifier and vectorizer per fold so nothing leaks across folds.
    clf = sklearn.svm.SVC(kernel='linear', class_weight=class_weight)
    vectorizer = CountVectorizer(ngram_range=(1, 1), tokenizer=lambda s: s.split(' '))

    # Training on both TR and VAL -- maybe a good idea?
    all_train_tweets = [t for l in [tr, val] for t in l]

    # Train
    print('Training...')
    train_ids = [t['tweet_id'] for t in all_train_tweets]
    train_texts = tweets_to_texts(all_train_tweets)  # treat as texts of numbers
    X_train = vectorizer.fit_transform(train_texts)
    y_train = [t['label'] for t in all_train_tweets]
    tweet_embs, context_embs = embedding_features(train_ids)
    # Ablation without context: hstack([X_train, csr_matrix(np.array(tweet_embs))])
    X_train = hstack([X_train, csr_matrix(np.array(tweet_embs)), csr_matrix(np.array(context_embs))])
    clf.fit(X_train, y_train)

    # Test
    print('Testing...')
    test_ids = [t['tweet_id'] for t in test]
    test_texts = tweets_to_texts(test)  # treat as texts of numbers
    X_test = vectorizer.transform(test_texts)  # reuse the vocabulary fitted on the training texts
    y_test = [t['label'] for t in test]
    tweet_embs, context_embs = embedding_features(test_ids)
    # Ablation without context: hstack([X_test, csr_matrix(np.array(tweet_embs))])
    X_test = hstack([X_test, csr_matrix(np.array(tweet_embs)), csr_matrix(np.array(context_embs))])
    y_predicted = clf.predict(X_test)

    # Results (class_rep / conf_matrix are kept for interactive inspection; only f is printed)
    class_rep = skm.classification_report(y_test, y_predicted)
    conf_matrix = skm.confusion_matrix(y_test, y_predicted)
    p, r, f, _ = skm.precision_recall_fscore_support(y_test, y_predicted, average='macro')
    total_f += f
    print(f)

avg_f = total_f / N_FOLDS

print('Avg F-score:', avg_f)
print('Avg number of context tweets in window:', sum(context_sizes.values())/len(context_sizes))

Fold: 0
Training...


In [None]:
# Spot-check of context sizes for the first id in `train_ids`.
# NOTE: `train_ids` is whatever the cross-validation cell left behind on its
# last executed fold, so this cell must run after that one.
sizes = []
for t_id in train_ids[:1]:
    # The embedding is requested before the context tweets, as in the CV loop;
    # its value is not used here.
    temp = contextifier.get_context_embedding(t_id)
    n_context = len(contextifier.get_context_tweets(t_id))  # context size
    sizes.append(n_context)

# Mean context size, then each individual size on its own line.
print(sum(sizes) / len(sizes))
print(*sizes, sep='\n')

# Timestamp of the probed tweet followed by the timestamps of its context tweets.
first_id = train_ids[0]
print(dl[first_id]['created_at'])
for t_id in contextifier.get_context_tweets(first_id):
    print(dl[t_id]['created_at'])

In [2]:
# Length of the context embedding for one hard-coded tweet id.
sample_tweet_id = 832351449069846528
len(contextifier.get_context_embedding(sample_tweet_id))

303