In [1]:
import random
import os
from urllib import request

from dontpatronizeme.semeval_2022 import dont_patronize_me as dpm

import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, SpatialDropout1D, Bidirectional, Dropout

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.test.is_gpu_available() else "NOT AVAILABLE")

Version:  2.7.0
Eager mode:  True
Hub version:  0.12.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU is available


2021-12-11 15:07:06.101249: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-11 15:07:06.101386: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: Apple M1 Max


In [2]:
# split data

In [3]:
# - <par_id> is a unique id for each one of the paragraphs in the corpus.
# - <art_id> is the document id in the original NOW corpus (News on Web: https://www.english-corpora.org/now/).
# - <keyword> is the search term used to retrieve texts about a target community.
# - <country_code> is a two-letter ISO Alpha-2 country code for the source media outlet.
# - <text> is the paragraph containing the keyword.
# - <label> is an integer between 0 and 4. Each paragraph has been annotated by two annotators as 0 (No PCL), 1 (borderline PCL) and 2 (contains PCL). The combined annotations have been used in the following graded scale:

data = pd.read_csv('dontpatronizeme_pcl.tsv', skiprows=4, sep='\t', header=None, index_col=0)
data.columns = ['art_id', 'keyword', 'country_code', 'text', 'label']
print(data.shape)
data.head(5)

(10469, 5)


Unnamed: 0_level_0,art_id,keyword,country_code,text,label
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0
2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0
3,@@16584954,immigrant,ie,White House press secretary Sean Spicer said t...,0
4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0
5,@@1494111,refugee,ca,""" Just like we received migrants fleeing El Sa...",0


In [4]:
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data)*test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return train_indices, test_indices, data.iloc[train_indices], data.iloc[test_indices]

def split_train_test2(data):
    train = pd.read_csv('train_semeval_parids-labels.csv', index_col=0)
    test =  pd.read_csv('dev_semeval_parids-labels.csv', index_col=0)
    train_idx = train.index.values
    print(train_idx)
    test_idx = test.index.values
    print(test_idx)
    return train_idx, test_idx, data.iloc[train_idx-1], data.iloc[test_idx-1]


# which indices to use?
semeval_idx = True

np.random.seed(42)
if semeval_idx:
    # using semeval's train/devs set:
    train_indices, test_indices, train_set, test_set = split_train_test2(data)
else:
    # using custom randomizer
    train_indices, test_indices, train_set, test_set = split_train_test(data, 0.2)

# print(train_set.text.head(5), train_set.shape)
# print(test_set.head(5), test_set.shape)

train_path = 'pcl_train.tsv'
test_path = 'pcl_test.tsv'

with open(train_path, 'w') as f:
    f.write('\n'*4)
    train_set.to_csv(f, header=False, sep ='\t')

with open(test_path, 'w') as f:
    f.write('\n'*4)
    test_set.to_csv(f, header=False, sep ='\t', index=0)


[ 4341  4136 10352 ...  8382  8383  8384]
[ 4046  1279  8330 ... 10464 10465 10466]


In [5]:
from dont_patronize_me import DontPatronizeMe
# Initialize a dpm (Don't Patronize Me) object.
# It takes two arguments as input: 
# (1) Path to the directory containing the training set files, which is the root directory of this notebook.
# (2) Path to the test set, which will be released when the evaluation phase begins. In this example, 
# we use the dataset for Subtask 1, which the code will load without labels.
dpm = DontPatronizeMe('.', test_path)

In [6]:
# This method loads the subtask 1 data
dpm.load_task1()
# which we can then access as a dataframe
dpm.train_task1_df.head(10)

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"we 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"in libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""white house press secretary sean spicer said ...",0,0
3,4,@@7811231,disabled,nz,council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" just like we received migrants fleeing el ...",0,0
5,6,@@9382277,in-need,in,"to bring down high blood sugar levels , insuli...",0,0
6,7,@@7562079,refugee,za,the european union is making an historic mista...,0,0
7,8,@@23663488,hopeless,nz,""""""" they 're either hopeless for being beaten ...",0,0
8,9,@@3449225,homeless,ph,"nueva era , ilocos norte - no family shall be ...",0,1
9,10,@@2485090,in-need,nz,his spokesman said the kremlin needed more inf...,0,0


In [7]:

train_set = dpm.train_task1_df.iloc[train_indices-1]
test_set = dpm.train_task1_df.iloc[test_indices-1]

# training set
X_train = train_set.text.to_numpy()
y_train = train_set.label.to_numpy()
# print(X_train)
# print(y_train)

# test set
X_test = test_set.text.to_numpy()
y_test = test_set.label.to_numpy()
# print(X_test)
# print(y_test)

In [8]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

In [9]:
# first, we need to create the res/ and ref/ folders, which the evaluator expects
!mkdir ref res

mkdir: ref: File exists
mkdir: res: File exists


In [10]:
dpm.load_test()

In [11]:
# dpm.test_set[:5]

In [12]:
# build model

In [13]:
model = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim-with-oov/1"
hub_layer = hub.KerasLayer(model, output_shape=[20], input_shape=[], 
                           dtype=tf.string, trainable=True)
hub_layer(X_train[:1])

2021-12-11 15:07:06.391183: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-11 15:07:06.391210: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2021-12-11 15:07:06.411623: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2021-12-11 15:07:06.421615: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-12-11 15:07:06.433190: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


<tf.Tensor: shape=(1, 20), dtype=float32, numpy=
array([[ 1.1180605 , -0.37061796, -0.12925693, -1.4085444 ,  1.6034535 ,
        -2.2361784 ,  0.89861727,  0.6513086 , -0.19338463,  0.79440534,
         0.972198  ,  0.2805785 , -2.545298  ,  0.04027791, -1.0220096 ,
         1.5522635 ,  0.244063  , -2.2539005 ,  0.14072226,  0.7685297 ]],
      dtype=float32)>

In [14]:
class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        if (logs.get('accuracy') >
                0.998):  # it actually never reaches this hight accuracy
            print("\nReached 99.9% accuracy so cancelling training!")
            self.model.stop_training = True

callbacks = myCallback()

model = tf.keras.Sequential()
model.add(hub_layer)

# drops 1D feature maps for independence
model.add(Dropout(0.1, input_shape=(20,)))

model.add(Dense(50, activation='relu'))

model.add(Dense(50, activation='relu'))

model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 20)                389380    
                                                                 
 dropout (Dropout)           (None, 20)                0         
                                                                 
 dense (Dense)               (None, 50)                1050      
                                                                 
 dense_1 (Dense)             (None, 50)                2550      
                                                                 
 dense_2 (Dense)             (None, 1)                 51        
                                                                 
Total params: 393,031
Trainable params: 393,031
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [16]:
%%time

history = model.fit(X_train,
                    y_train,
                    epochs=20,
                    batch_size=32,
                    validation_data=(X_test, y_test),
                    verbose=1,
                    callbacks=[callbacks])

Epoch 1/20
  1/262 [..............................] - ETA: 1:25 - loss: 1.4061 - accuracy: 0.0625

2021-12-11 15:07:06.906466: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2021-12-11 15:07:10.918308: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 1min 4s, sys: 30.6 s, total: 1min 34s
Wall time: 1min 20s


In [17]:
score = model.evaluate(X_test, y_test, verbose = 1) 

print('Test loss:', score[0]) 
print('Test accuracy:', score[1])

Test loss: 0.49092382192611694
Test accuracy: 0.9025787711143494


In [18]:
pred = model.predict(X_test) 
# print(pred)
# pred = np.argmax(pred, axis = 1)[:5] 
# label = np.argmax(y_test,axis = 1)[:5] 

# print(pred) 
# print(label)
pos = pred[pred >= 0.5]
neg = pred[pred < 0.5]
print(len(pos), len(neg))

2021-12-11 15:08:27.763281: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


143 1951


In [19]:
# relabel
pred[pred >= 0.5] = int(1)
pred[pred < 0.5] = int(0)
pred = pred.astype(int)

In [20]:
# save results to file
labels2file(pred, os.path.join('res/', 'task1.txt'))

In [21]:
# for subtask 1 (we convert our list of labels into a list of lists to make 
# it compatible with the labels2file function)
labels2file(test_set.label.apply(lambda x:[x]).tolist(), os.path.join('ref/', 'task1.txt'))

In [22]:
# Now, we can just call the official scorer, which takes an input_directory and an output_directory
# as arguments. In this example, both will be the root directory of this notebook.
!python3 evaluation.py . .

In [23]:
# The scorer generated a results file called "scores.txt". 
# We can now see the performance of a random baseline on the training set.
!cat scores.txt

task1_precision:0.4825174825174825
task1_recall:0.34673366834170855
task1_f1:0.40350877192982454


In [24]:
# the left pane should now show a file called submission.zip, which you can submit to Codalab
!cp 'res/task1.txt' 'task1.txt'
!zip submission.zip 'task1.txt'
! rm 'task1.txt'

updating: task1.txt (deflated 95%)
