In [None]:
#Run on colab only
!pip install transformers

In [None]:
#Run on colab only
!pip install emoji --upgrade

In [None]:
#Run on colab only
#cd to folder which contain required external files
# Mounts Google Drive at /content/drive and makes the course-project folder the working
# directory, so the relative paths used later (./data/train.jsonl, ./data/test.jsonl,
# answer.txt) and the local TEXT_PREPROCESSING import resolve against that folder.
from google.colab import drive
drive.mount('/content/drive')
# The backslash escapes the space in "Colab Notebooks" for the %cd magic.
%cd /content/drive/MyDrive/Colab\ Notebooks/cs410/CourseProject 

In [None]:
import json
import math
import random
import re

import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tqdm import tqdm
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification

from TEXT_PREPROCESSING import preprocess_text

In [None]:
# LOADING DATA
# train.jsonl holds one JSON object per line; the "response" and "label" fields are used below.
categorized_tweets = pd.read_json('./data/train.jsonl', lines = True)
# Report the missing-value check explicitly: as a bare expression in the middle of a
# cell its result is discarded and never displayed.
has_missing_values = categorized_tweets.isnull().values.any()
print("Missing values present: " + str(has_missing_values))
print(categorized_tweets.head())

# PREPROCESSING DATA
# Clean every response with the project's preprocess_text helper.
data = list(categorized_tweets["response"])
if data:
    print(data[0])  # raw example, before preprocessing
tweets = [preprocess_text(d) for d in data]

# Encode labels as integers: 1 for "SARCASM", 0 for anything else.
y = (categorized_tweets["label"] == "SARCASM").to_numpy().astype(int)
# Show one preprocessed example (index 9, or the last one if there are fewer than 10).
if tweets:
    print(tweets[min(9, len(tweets) - 1)])

In [None]:
# TOKENIZING DATA
# Hold out 20% of the examples for validation; the fixed random_state keeps the split
# reproducible across runs. train_test_split is imported explicitly because a bare
# `import sklearn` does not guarantee that the sklearn.model_selection submodule is loaded.
train_data, val_data, train_lab, val_lab = train_test_split(tweets, y, test_size=0.2, random_state=34)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize each split separately, with truncation and padding enabled so that every
# sequence within a split has the same length.
train_tok = tokenizer(train_data, truncation=True, padding=True)
val_tok = tokenizer(val_data, truncation=True, padding=True)

In [None]:
# Create dataset object
# Each split becomes a tf.data.Dataset of (features, label) pairs, where the features
# are the tokenizer output converted to a plain dict.
train_features = dict(train_tok)
train_data = tf.data.Dataset.from_tensor_slices((train_features, train_lab))

val_features = dict(val_tok)
val_data = tf.data.Dataset.from_tensor_slices((val_features, val_lab))

In [None]:
# Build Model
# Pretrained DistilBERT with a 2-label sequence-classification head.
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
opt = tf.keras.optimizers.Adam(learning_rate=.0001)
# Use an explicit Keras loss instead of the version-specific `model.compute_loss` method.
# from_logits=True because the model's first output is un-normalised scores (softmax is
# applied separately at prediction time); labels are the integer classes 0/1.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(loss=loss_fn, optimizer=opt, metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be wrapped in print().
model.summary()

In [None]:
# Fit model
# The datasets are batched explicitly with .batch(), so `batch_size` must not also be
# passed to fit(): Keras does not accept/honour it for tf.data.Dataset inputs.
BATCH_SIZE = 32
model.fit(train_data.batch(BATCH_SIZE), epochs=2, validation_data=val_data.batch(BATCH_SIZE))

In [None]:
# Predict using model
# Load the unlabeled test tweets and clean them with the same helper used for training.
uncat_tweets = pd.read_json('./data/test.jsonl', lines = True)
uncat_data = list(uncat_tweets["response"])
un_tweets = [preprocess_text(d) for d in uncat_data]

tokenized_un_tweets = tokenizer(un_tweets, truncation=True, padding=True, return_tensors='tf')

# Run inference in mini-batches rather than pushing the whole test set through the model
# in a single forward pass, so memory use does not grow with the size of the test set.
PREDICT_BATCH_SIZE = 32
batched_inputs = tf.data.Dataset.from_tensor_slices(dict(tokenized_un_tweets)).batch(PREDICT_BATCH_SIZE)
# model(batch)[0] holds the per-class scores; softmax turns them into probabilities and
# column 1 is the probability of the class encoded as 1 (sarcasm).
batch_probs = [
    tf.nn.softmax(model(batch)[0], axis=1).numpy()[:, 1]
    for batch in batched_inputs
]
predictions = np.concatenate(batch_probs) if batch_probs else np.array([])

In [None]:
# Write predictions to file
# One line per prediction, in order: "twitter_<n>,<LABEL>" with n counting from 1.
# A probability of at least .5 is labelled SARCASM, anything lower NOT_SARCASM.
with open('answer.txt', 'w') as out_file:
    labels = ["SARCASM" if prob >= .5 else "NOT_SARCASM" for prob in predictions]
    for row_num, label in enumerate(labels, start=1):
        out_file.write("twitter_" + str(row_num) + "," + label + "\n")

# Report how many tweets landed in each class.
sarcasm_total = labels.count("SARCASM")
not_sarcasm_total = len(labels) - sarcasm_total
print("# sarcasm: " + str(sarcasm_total))
print("# not sarcasm: " + str(not_sarcasm_total))
