In [2]:
from __future__ import print_function
from __future__ import division
import collections
import nltk
import numpy as np
from nltk.tokenize.casual import TweetTokenizer
from w266_common import utils, vocabulary
import re
np.random.seed(266)

In [3]:
import csv
tokenizer = TweetTokenizer()
x_data = []
labels = []
sentences = []
     
  
with open('dta/merged_data_v3.csv', 'r') as csvfile:
    linereader = csv.reader(csvfile, delimiter = '|')
    for i, row in enumerate(linereader):
        if i == 0:
            continue
        sentence, _, sarcasm = row
        sentence = re.sub("RT @[^\s]+:", "retweet", sentence)
        #sentences.append(sentence)
        tokenized_words = tokenizer.tokenize(sentence)
        x_tokens = utils.canonicalize_words(tokenized_words, hashtags =True)
        index = np.argwhere(x_tokens=="sarcasm")
        x_tokens = np.delete(x_tokens, index)
        index = np.argwhere(x_tokens=="sarcastic")
        x_tokens = np.delete(x_tokens, index)
        x_data.append(x_tokens)
        labels.append(int(sarcasm))


shuffle_indices = np.random.permutation(np.arange(len(labels)))
train_split_idx = int(0.7 * len(labels))
test_split_idx  = int(0.9 * len(labels))

train_indices = shuffle_indices[:train_split_idx]
validation_indices = shuffle_indices[train_split_idx:test_split_idx]
test_indices = shuffle_indices[test_split_idx:]

x_data = np.array(x_data)
labels = np.array(labels)
train_sentences = x_data[train_indices]
train_labels= labels[train_indices] 
validation_sentences = x_data[validation_indices]
validation_labels = labels[validation_indices]
test_sentences = x_data[test_indices]  
test_labels = labels[test_indices]  



In [4]:
vocab = vocabulary.Vocabulary(utils.flatten(train_sentences),5000)
x_ids = vocab.words_to_ids(train_sentences[0])
print("x_ids =", vocab.word_to_id)
print(x_ids)

[10, 71, 67, 8, 12, 74, 54, 24, 3, 3, 75, 22, 3, 78, 70, 3, 16, 76, 72, 49, 13]


In [5]:
list_of_train_ids = list(map(vocab.words_to_ids, train_sentences))
print(list_of_train_ids[0:20])


list_of_validation_ids = list(map(vocab.words_to_ids, validation_sentences))

list_of_test_ids = list(map(vocab.words_to_ids, test_sentences))


[[10, 71, 67, 8, 12, 74, 54, 24, 3, 3, 75, 22, 3, 78, 70, 3, 16, 76, 72, 49, 13], [10, 3610, 15, 2, 905, 330, 156, 311, 529, 3611, 2781, 5, 3, 3, 3, 77, 589, 13], [10, 3612, 306, 7, 2267, 733, 9, 29, 26, 48, 117, 21, 2, 396, 396, 396, 366, 42, 3, 3, 3, 3, 77, 5], [10, 906, 169, 2268, 194, 19, 2269, 53, 1922, 21, 7, 125, 6, 839, 342, 8, 331, 704, 11, 377, 165, 3613, 11, 3], [2782, 2270, 629, 3, 336, 231, 1325, 12, 786, 11, 5, 3, 3, 3, 66, 8, 734, 12, 107, 2783, 356, 21, 983, 38, 529, 3], [984, 14, 7, 125, 2784, 2785, 3, 3, 3, 3, 3, 3, 5], [20, 557, 92, 7, 225, 787, 36, 170, 8, 32, 19, 3, 3], [10, 3, 3614, 343, 3, 64, 3615, 3, 5, 5], [10, 666, 3, 2271, 38, 630, 3616, 2, 6, 3616, 16, 12, 2786, 3617, 15, 3, 2, 18, 3, 9, 630, 14, 3, 13], [10, 2, 8, 56, 7, 3, 16, 985, 9, 213, 631, 357, 3618, 83, 47, 1326, 289, 1652, 25, 256, 15, 122, 2, 7, 2, 13], [5, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 351, 632, 12, 104, 46, 840, 43, 788, 4, 14, 1197, 3, 3, 3,

In [6]:
collections.Counter(list_of_train_ids[0])

Counter({3: 4,
         8: 1,
         10: 1,
         12: 1,
         13: 1,
         16: 1,
         22: 1,
         24: 1,
         49: 1,
         54: 1,
         67: 1,
         70: 1,
         71: 1,
         72: 1,
         74: 1,
         75: 1,
         76: 1,
         78: 1})

In [7]:
from scipy.sparse import csr_matrix 
def sparsify_data(list_of_ids):
    row_indices = []
    col_indices = []
    values = []
    rows = len(list_of_ids)

    for row, x_ids in enumerate(list_of_ids):
        x_fdict = collections.Counter(x_ids)
        for wordid, count in x_fdict.items():
            row_indices.append(row)       
            col_indices.append(wordid)  # column is word id
            values.append(count)        # value is count
    x_sparse = csr_matrix((values, (row_indices, col_indices)),
                              shape=[rows, vocab.size])
    return x_sparse

In [8]:
x_train_sparse = sparsify_data(list_of_train_ids)
x_validation_sparse = sparsify_data(list_of_validation_ids)
x_test_sparse = sparsify_data(list_of_test_ids)


In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

nb = MultinomialNB(alpha =  1)
nb.fit(x_train_sparse, train_labels)
y_pred = nb.predict(x_test_sparse)





f1 = f1_score(test_labels, y_pred)
acc = accuracy_score(test_labels, y_pred)
recall = recall_score(test_labels, y_pred)
precision = precision_score(test_labels, y_pred)
print(" acc: {:.02%}, recall: {:.02%}, precision: {:.02%}, f1: {:.02%},".format(acc, recall, precision, f1))


 acc: 86.14%, recall: 87.76%, precision: 84.82%, f1: 86.27%,


In [10]:
linear_weights = nb.feature_log_prob_[1,:] - nb.feature_log_prob_[0,:]  # populate this with actual values

top_negative_features = np.argsort(linear_weights)[0:40]
top_positive_features = np.argsort(-linear_weights)[0:40]


In [11]:
print("Most negative features:")
for idx in top_negative_features:
    print("  {:s} ({:.02f})".format(vocab.id_to_word[idx], 
                                    linear_weights[idx]))
print("")
print("Most positive features:")
for idx in top_positive_features:
    print("  {:s} ({:.02f})".format(vocab.id_to_word[idx], 
                                    linear_weights[idx]))
    


Most negative features:
  🔥 (-5.02)
  ↴ (-4.66)
  ⭐ (-4.46)
  🕰 (-4.16)
  🎮 (-4.11)
  💎 (-4.07)
  🖥 (-4.05)
  registration (-4.02)
  tokkens (-3.98)
  cams (-3.98)
  giveaway (-3.94)
  token (-3.92)
  airdrop (-3.87)
  blockchain (-3.81)
  crypto (-3.71)
  rt (-3.68)
  enter (-3.66)
  telegram (-3.66)
  es (-3.66)
  summer (-3.51)
  ico (-3.48)
  latest (-3.46)
  📢 (-3.41)
  bitcoin (-3.30)
  camp (-3.26)
  ✔ (-3.18)
  digital (-3.14)
  🚀 (-3.14)
  easter (-3.14)
  international (-3.14)
  coin (-3.09)
  ➡ (-2.99)
  recommend (-2.99)
  DGDGDGDGDG (-2.95)
  💰 (-2.94)
  technology (-2.88)
  tokens (-2.88)
  fee (-2.82)
  usd (-2.82)
  DGDG:DGDG (-2.82)

Most positive features:
  straight (5.47)
  nicer (5.46)
  ex (5.46)
  couldnt (4.77)
  walking (4.76)
  agent (4.75)
  believes (4.07)
  sarcasm (3.94)
  happen (3.89)
  surprise (3.87)
  marr (3.72)
  previ (3.69)
  oh (3.57)
  guy (3.53)
  form (3.39)
  lol (3.15)
  🤣 (3.13)
  redbubble (3.00)
  wonder (3.00)
  quotes (3.00)
  shot (3.0

###### 

In [12]:
predictions = nb.predict(x_test_sparse)
wrong = predictions != test_labels
logits = nb.predict_log_proba(x_test_sparse)

def incorrect_confidence(wrong, logits, predictions):
    indeces = np.where(wrong)
    wrong_predictions = predictions[indeces]
    wrong_logits = logits[indeces]
    
    return [[wrong_logits[i][value] - wrong_logits[i][1-value], indeces[0][i]] for i, value in enumerate(wrong_predictions)]

sorted(incorrect_confidence(wrong, logits, predictions), key = lambda logit: -logit[0])



#[a] -  nb.predict_log_proba(x_test_sparse)[wrong][1-a]

[[22.88759193778526, 428],
 [19.719630660407091, 122],
 [17.705800035489261, 21],
 [17.647464772685197, 499],
 [17.079760841568572, 526],
 [15.607175595868057, 342],
 [15.239575608006248, 500],
 [13.465934798743831, 847],
 [12.911635022491254, 775],
 [12.731874485355803, 270],
 [12.731874485355803, 290],
 [12.613876708571979, 409],
 [10.59898655166063, 208],
 [10.563503535385053, 621],
 [10.169012968955258, 540],
 [9.8198183489286635, 410],
 [9.6347684301714764, 740],
 [9.2541991695932779, 406],
 [9.0379504096087544, 84],
 [8.9631887529136236, 115],
 [8.6155393906980748, 314],
 [8.5684120460569204, 858],
 [8.2805447054170713, 493],
 [7.9076596361223892, 585],
 [7.7955209563144194, 271],
 [7.7150847668723088, 395],
 [7.1347360482703266, 466],
 [7.1323109556846873, 53],
 [7.0786912380987133, 19],
 [6.7755616870452116, 450],
 [6.6448351452550867, 281],
 [6.3489216615514295, 855],
 [6.0588688585214641, 640],
 [5.8794771696242449, 473],
 [5.2927659784418779, 678],
 [5.2088330576929422, 423]

In [14]:
index = 428
print(test_sentences[index])
print(test_labels[index])
print(predictions[index])

['hangover' '!' '🍺' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' '💋' 'LINK']
1
0


In [15]:
index = 122
print(test_sentences[index])
print(test_labels[index])
print(predictions[index])

['this' 'one' 'enter' 'gan' '!' '!' '!' '-' '-' '-' 'follow' 'ACCOUNT' '-'
 '-' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'LINK']
1
0


In [16]:
index =21
print(test_sentences[index])
print(test_labels[index])
print(predictions[index])

['LINK' '…' '…' '…' '…' '…' '…' '…' '…' '…' '…' '…' '…' '…' '…' '…' '…' '…'
 '…' 'once' 'upon' 'a' 'time' '(' 'spin' ')' 'featuring' 'ACCOUNT' 'of'
 'd12' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' '#' 'hiphop' 'HASHTAG']
1
0


In [18]:
index = 270
print(test_sentences[index])
print(test_labels[index])
print(predictions[index])

['meme' 'war' '!' 'a' 'massive' 'collection' '.' 'LINK' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG']
1
0


In [20]:
index = 208
print(test_sentences[index])
print(test_labels[index])
print(predictions[index])

['wide' 'awake' 'for' 'once' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' ':/' '/' 't.co/fuubiehpxu']
1
0


In [None]:
# No hashtag model has a big problem when a message is made up of many hashtags