In [None]:
#run if using colab
#uninstall if problem during bert tokenization then reinstall below
!pip uninstall bert-for-tf2

In [None]:
# Mount Google Drive at /content/drive (Colab only); the %cd cell below
# changes into a project folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install bert-for-tf2; the tokenization cell uses `bert.bert_tokenization`
# (presumably provided by this package — TODO confirm).
!pip install bert-for-tf2

In [None]:
# Install sentencepiece. It is not imported directly in this notebook;
# presumably a dependency of the BERT tokenization stack — TODO confirm.
!pip install sentencepiece

In [None]:
# Install/upgrade the emoji package. It is not imported directly in this
# notebook; presumably used by the local preprocessing module — TODO confirm.
!pip install emoji --upgrade

In [None]:
# Change into the project folder on the mounted Drive so that the local
# preprocessing module imported below and the relative ./data paths resolve.
# NOTE(review): the path is specific to one Drive layout; adjust as needed.
%cd /content/drive/MyDrive/Colab\ Notebooks/cs410/CourseProject 

In [None]:
import tensorflow as tf 
import tensorflow_hub as hub 
from tensorflow.keras import layers
import bert
import numpy as np 
import pandas as pd 
import json
import re
import random
import math
#from TEXT_MODEL import TEXT_MODEL
from TEXT_PREPROCESSING_01 import preprocess_text

In [None]:

# LOADING DATA
# train.jsonl holds one JSON record per line; the "response" and "label"
# columns are the ones consumed below.
categorized_tweets = pd.read_json('./data/train.jsonl', lines = True)
# Report missing values instead of computing the check and discarding it.
if categorized_tweets.isnull().values.any():
    print("warning: train.jsonl contains missing values")
print(categorized_tweets)

# PREPROCESSING DATA
data = list(categorized_tweets["response"])
print(data[0])
# Clean every response with the project's preprocess_text helper.
tweets = [preprocess_text(d) for d in data]

# Encode labels as integers: 1 for "SARCASM", 0 for anything else.
y = np.array([1 if label == "SARCASM" else 0 for label in categorized_tweets["label"]])



            label  ...                                            context
0         SARCASM  ...  [A minor child deserves privacy and should be ...
1         SARCASM  ...  [@USER @USER Why is he a loser ? He's just a P...
2         SARCASM  ...  [Donald J . Trump is guilty as charged . The e...
3         SARCASM  ...  [Jamie Raskin tanked Doug Collins . Collins lo...
4         SARCASM  ...  [Man ... y ’ all gone “ both sides ” the apoca...
...           ...  ...                                                ...
4995  NOT_SARCASM  ...  [@USER Apologies for the inconvenience you fac...
4996  NOT_SARCASM  ...  [@USER 🤔 idk tho , I think I ’ m #hungry . But...
4997  NOT_SARCASM  ...  [@USER @USER @USER Peace to you , and two coun...
4998  NOT_SARCASM  ...  [Bernie Sanders told Elizabeth Warren in priva...
4999  NOT_SARCASM  ...  [PDP PROTEST BRAINSTORMING SESSION Deji : We n...

[5000 rows x 3 columns]
@USER @USER @USER I don't get this .. obviously you do care or you would've moved right

In [None]:
# TOKENIZING DATA

BertTokenizer = bert.bert_tokenization.FullTokenizer

# Frozen BERT layer from TF Hub; this cell only reads its bundled vocabulary
# file and its lower-casing flag to configure the tokenizer.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3", trainable=False)
resolved = bert_layer.resolved_object
vocabulary_file = resolved.vocab_file.asset_path.numpy()
to_lower_case = resolved.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)


def tokenize_tweets(data):
    """Tokenize one preprocessed tweet and return its vocabulary ids.

    Uses the module-level ``tokenizer``: the text is first split into tokens,
    then each token is mapped to its integer id.
    """
    tokens = tokenizer.tokenize(data)
    return tokenizer.convert_tokens_to_ids(tokens)


# vectorized tweets: one list of token ids per tweet
tokenized_tweets = list(map(tokenize_tweets, tweets))

# tokenized example
example_tweet = tweets[9]
print(example_tweet)
print(tokenizer.tokenize(example_tweet))


@USER @USER @USER responds to facts by tossing out frantic insults , then accuses others of being " triggered by facts " :rolling_on_the_floor_laughing: :face_with_tears_of_joy: :rolling_on_the_floor_laughing:
['@', 'user', '@', 'user', '@', 'user', 'responds', 'to', 'facts', 'by', 'tossing', 'out', 'frantic', 'insults', ',', 'then', 'accuse', '##s', 'others', 'of', 'being', '"', 'triggered', 'by', 'facts', '"', ':', 'rolling', '_', 'on', '_', 'the', '_', 'floor', '_', 'laughing', ':', ':', 'face', '_', 'with', '_', 'tears', '_', 'of', '_', 'joy', ':', ':', 'rolling', '_', 'on', '_', 'the', '_', 'floor', '_', 'laughing', ':']


In [None]:
# Length of the longest tokenized tweet (0 when there are none); shorter
# tweets are zero-padded up to this length in the next cell.
maxlength = max(map(len, tokenized_tweets), default=0)

maxlength

120

In [None]:
#pad the input vector, tweets, so all observations have the same length
# Import from tensorflow.keras rather than standalone keras: the model is
# built with tensorflow.keras layers, and mixing the two packages can break
# (some standalone keras releases no longer expose this import path).
from tensorflow.keras.preprocessing.sequence import pad_sequences
# padding='post' appends zeros after the real token ids.
tokenized_tweets_padded = pad_sequences(tokenized_tweets, maxlen=maxlength, padding = 'post')
tokenized_tweets_padded[0]

array([ 1030,  5310,  1030,  5310,  1030,  5310,  1045,  2123,  1005,
        1056,  2131,  2023,  1012,  1012,  5525,  2017,  2079,  2729,
        2030,  2017,  2052,  1005,  2310,  2333,  2157,  2247,  1012,
        1012,  2612,  2017,  2787,  2000,  2729,  1998, 18792,  2014,
        1012,  1012,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0], dtype=int32)

In [None]:
# Sanity check: show the container type returned by pad_sequences.
type(tokenized_tweets_padded)

numpy.ndarray

In [None]:
# Wrap the padded id matrix in a DataFrame (one row per tweet, one column per
# token position) and display its shape.
df_x = pd.DataFrame(tokenized_tweets_padded)
df_x.shape

(5000, 120)

In [None]:
# Preview the first rows of padded token ids.
df_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119
0,1030,5310,1030,5310,1030,5310,1045,2123,1005,1056,2131,2023,1012,1012,5525,2017,2079,2729,2030,2017,2052,1005,2310,2333,2157,2247,1012,1012,2612,2017,2787,2000,2729,1998,18792,2014,1012,1012,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1030,5310,1030,5310,2667,2000,6186,2055,1012,3331,2055,2032,1998,2010,10873,1998,2027,3830,3209,1059,24475,2515,2008,2191,7861,1029,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1030,5310,1030,5310,1030,5310,2002,3084,2019,9577,2055,1997,2769,2013,1996,5691,1010,15313,999,1001,4553,14406,24138,27268,6633,9316,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1030,5310,1030,5310,5564,8398,2180,1005,1056,2130,2713,2010,2938,7644,1998,2010,24249,12655,2056,2002,2001,1996,12873,4355,3076,2027,1005,2310,2412,4036,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1030,5310,1030,5310,3492,2469,1996,3424,1011,5367,4306,3555,2008,1000,7072,2001,2006,1996,10428,1000,1999,7313,1010,2205,1012,2027,2245,5367,2001,1000,27246,1000,1012,1001,2175,2361,1001,2283,11253,4115,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Tokenizer vocabulary size; the model cell reuses it as the embedding
# layer's input dimension.
print(len(tokenizer.vocab))

30522


In [None]:
# Shuffle the rows and hold out 20% of them as the test set; the fixed
# random_state keeps the split repeatable between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_x,
    y,
    test_size=0.20,
    shuffle=True,
    random_state=33,
)

In [None]:
# Confirm the training split size (rows, token positions).
X_train.shape


(4000, 120)

In [None]:
# Model hyper-parameters derived from the data and the tokenizer.
input_length = X_train.shape[1]        # padded tweet length (token positions)
VOCAB_LENGTH = len(tokenizer.vocab)    # number of distinct token ids
EMB_DIM = 200                          # size of each learned token embedding

# Architecture: embedding -> 1D convolution -> max pooling -> flatten ->
# two dense blocks -> single sigmoid unit for the binary sarcasm label.
model = tf.keras.Sequential()
model.add(layers.Embedding(VOCAB_LENGTH, EMB_DIM, input_length=input_length))
model.add(layers.Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(layers.MaxPooling1D(pool_size=2))
model.add(layers.Flatten())

# Each dense block is Dense -> Dropout -> ReLU: the activation is a separate
# layer so that dropout is applied to the pre-activation values.
model.add(layers.Dense(128))
model.add(layers.Dropout(rate= 0.5))
model.add(layers.Activation('relu'))

model.add(layers.Dense(64))
model.add(layers.Dropout(rate= 0.5))
model.add(layers.Activation('relu'))

model.add(layers.Dense(1, activation='sigmoid'))

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()



Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 120, 200)          6104400   
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 113, 32)           51232     
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 56, 32)            0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 1792)              0         
_________________________________________________________________
dense_98 (Dense)             (None, 128)               229504    
_________________________________________________________________
dropout_46 (Dropout)         (None, 128)               0         
_________________________________________________________________
activation_47 (Activation)   (None, 128)              

In [None]:
# Experimental AdamW setup driven by a piecewise-constant schedule.
# NOTE(review): `optimizer` built here is not referenced by the compile cell
# below, which creates its own Adam instance (`opt`). The directly imported
# `AdamW` name is also unused; the class is reached via `tfa.optimizers`.
from tensorflow_addons.optimizers import AdamW
import tensorflow_addons as tfa
# Step counter passed to the schedule; no visible cell increments it.
step = tf.Variable(0, trainable=False)
# Boundaries [10000, 15000] paired with the values [1e-0, 1e-1, 1e-2].
schedule = tf.optimizers.schedules.PiecewiseConstantDecay(
    [10000, 15000], [1e-0, 1e-1, 1e-2])
# lr and wd can be a function or a tensor
# Here lr is computed once from the schedule, while wd is a callable.
lr = 1e-1 * schedule(step)
wd = lambda: 1e-4 * schedule(step)

# ...

optimizer = tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd)

In [None]:
# compile network
from tensorflow.keras import optimizers

# Adam with a small fixed learning rate; betas and amsgrad are spelled out.
opt = optimizers.Adam(
    learning_rate=.00001,
    beta_1=0.9,
    beta_2=0.999,
    amsgrad=False,
)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

# fit network -- the held-out test split doubles as per-epoch validation data
model.fit(
    X_train,
    y_train,
    batch_size=32,
    validation_data=(X_test, y_test),
    epochs=10,
    verbose=1,
)

# evaluate on the same held-out split and report accuracy as a percentage
loss, acc = model.evaluate(X_test, y_test, verbose=1)
print('Test Accuracy: %f' % (acc*100))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 78.100002


In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Threshold the model's outputs at 0.5 to get hard 0/1 labels for the
# held-out split.
y_prediction = model.predict(X_test)
y_pred = [1 if score >= 0.5 else 0 for score in y_prediction]

# Macro-averaged precision, recall and F-score; the fourth returned value
# (n) is not used.
p, r, f, n = precision_recall_fscore_support(y_test, y_pred, average='macro')
print('p = ',p, ', r = ', r, ', f = ', f)



p =  0.7811278749439426 , r =  0.7808680868086808 , f =  0.78090337838987


In [None]:
#use if you want to save the model
#model.save("/content/drive/My Drive/Colab Notebooks/cs410/network02c")

In [None]:
# Predict using model
# Load the unlabeled tweets and run them through the same preprocessing and
# tokenization helpers that were applied to the training data.
uncat_tweets = pd.read_json('./data/test.jsonl', lines = True)
uncat_data = list(uncat_tweets["response"])
un_tweets = [preprocess_text(d) for d in uncat_data]
tokenized_un_tweets = list(map(tokenize_tweets, un_tweets))
print(str(len(un_tweets)))

In [None]:
#perform check of input lengths between test and train data
# Longest tokenized test tweet (0 when there are none), compared against the
# padded length the model was trained with.
count = max(map(len, tokenized_un_tweets), default=0)
if count > maxlength:
    print('error: input of test data input len greater than train data- need to fix')
else:
    print("ok to proceed")

In [None]:
# Pad the test tweets to the training length so they match the model's
# input width, then show the first padded row.
# NOTE(review): sequences longer than maxlength are presumably truncated by
# pad_sequences rather than rejected — confirm which side is cut.
tokenized_untweets_padded = pad_sequences(tokenized_un_tweets, maxlen=maxlength, padding = 'post')
tokenized_untweets_padded[0]

In [None]:


predictions = model.predict(tokenized_untweets_padded)

# Write one "twitter_<n>,<LABEL>" line per test tweet (ids start at 1) and
# tally how many tweets fall into each class.
s_c = 0
ns_c = 0
with open('answer.txt', 'w') as f:
    for c, p in enumerate(predictions, start=1):
        if p >= .5:
            label_line = "SARCASM\n"
            s_c += 1
        else:
            label_line = "NOT_SARCASM\n"
            ns_c += 1
        f.write("twitter_" + str(c) + "," + label_line)
print("# sarcasm: " + str(s_c))
print("# not sarcasm: " + str(ns_c))


In [None]:
# Set the global git identity used by the commit cell below.
# NOTE(review): a personal email and username are hardcoded here; consider
# supplying them outside the notebook before sharing it.
! git config --global user.email "susc@colorado.edu"
! git config --global user.name "steve303"

In [None]:
# Stash uncommitted changes in the working tree.
# NOTE(review): if answer.txt is already tracked, this presumably shelves the
# file that was just written, leaving nothing new for the add/commit cells
# below — confirm this ordering is intended.
!git stash

In [None]:
# Stage the predictions file written by the prediction cell above.
!git add answer.txt


In [None]:
# Commit whatever is staged, using "network_04a" as the commit message.
!git commit -m "network_04a"
