<a href="https://colab.research.google.com/github/sdeutchman/watson/blob/Akira/Watson_Elementary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Import library 

In [1]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
! pip install nlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import os
import gc
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import plotly.express as px

# NN
from tensorflow.keras.layers import Dense, Input, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
import nlp
from datasets import load_dataset

# Download data

In [5]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')
train
test
submission

Unnamed: 0,id,prediction
0,c6d58c3f69,1
1,cefcc82292,1
2,e98005252c,1
3,58518c10ba,1
4,c32b0d16df,1
...,...,...
5190,5f90dd59b0,1
5191,f357a04e86,1
5192,1f0ea92118,1
5193,0407b48afb,1


# Exploratory data analysis

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12120 entries, 0 to 12119
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          12120 non-null  object
 1   premise     12120 non-null  object
 2   hypothesis  12120 non-null  object
 3   lang_abv    12120 non-null  object
 4   language    12120 non-null  object
 5   label       12120 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 568.2+ KB


# Preparing data for input

In [7]:
!pip install tokenizers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
def init_strategy():
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
        print("Init TPU strategy")
    except ValueError:
        strategy = tf.distribute.get_strategy() # for CPU and single GPU
        print("Init CPU/GPU strategy")
    return strategy

def build_model(model_name, maxlen, head="avg_pooling"):
    input_ids = Input(shape=(maxlen,), dtype=tf.int32, name="input_ids")
    encoder = TFAutoModel.from_pretrained(model_name)
    encoder_output = encoder(input_ids)[0]
    
    # convert transformer encoding to vector
    if head == "cls":
        features = encoder_output[:, 0, :] # using first token as encoder feature map
    elif head == "avg_pooling":
        features = GlobalAveragePooling1D()(encoder_output)
    elif head == "max_pooling":
        features = GlobalMaxPooling1D()(encoder_output)
    else:
        raise NotImplementedError
    
    # 3class softmax
    out = Dense(3, activation='softmax')(features)
    
    # define model
    model = Model(inputs=input_ids, outputs=out)
    model.compile(
        Adam(lr=1e-5), 
        loss='sparse_categorical_crossentropy', 
        metrics=['accuracy']
    )
    return model

def tokenize_dataframe(df, tokenizer, max_length):
    # tokenize
    text = df[['premise', 'hypothesis']].values.tolist()
    encoded = tokenizer.batch_encode_plus(text, padding=True, max_length=max_length, truncation=True)
    # features
    x = encoded['input_ids']
    # labels
    y = None
    if 'label' in df.columns:
        y = df.label.values
    return x, y

def build_dataset(x, y, mode, batch_size):
    if mode == "train":
        dataset = (
            tf.data.Dataset
            .from_tensor_slices((x, y))
            .repeat()
            .shuffle(2048)
            .batch(batch_size)
            .prefetch(auto)
        )
    elif mode == "valid":
        dataset = (
            tf.data.Dataset
            .from_tensor_slices((x, y))
            .batch(batch_size)
            .cache()
            .prefetch(auto)
        )
    elif mode == "test":
        dataset = (
            tf.data.Dataset
            .from_tensor_slices(x)
            .batch(batch_size)
        )
    else:
        raise NotImplementedError
    return dataset

def load_mnli(use_validation=True):
    result = []
    dataset = load_dataset('glue', 'mnli')
    keys = ['train', 'validation_matched','validation_mismatched'] if use_validation else ['train']
    for k in keys:
        for record in dataset[k]:
            c1, c2, c3 = record['premise'], record['hypothesis'], record['label']
            if c1 and c2 and c3 in {0,1,2}:
                result.append((c1,c2,c3,'en'))
    result = pd.DataFrame(result, columns=['premise','hypothesis','label','lang_abv'])
    return result

def load_xnli():
    result = []
    dataset = load_dataset('xnli','all_languages')
    for k in dataset.keys():
        for record in dataset[k]:
            hp, pr, lb = record['hypothesis'], record['premise'], record['label']
            if hp and pr and lb in {0,1,2}:
                for lang, translation in zip(hp['language'], hp['translation']):
                    pr_lang = pr.get(lang, None)
                    if pr_lang is None:
                        continue
                    result.append((pr_lang, translation, lb,lang))
    result = pd.DataFrame(result, columns=['premise','hypothesis','label','lang_abv'])
    return result

In [9]:
MODEL = 'jplu/tf-xlm-roberta-large'
MAXLEN = 120
strategy = init_strategy()
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
tokenizer = AutoTokenizer.from_pretrained(MODEL)
auto = tf.data.experimental.AUTOTUNE

def preprocess(df):
    return tokenize_dataframe(df, tokenizer, MAXLEN)

Init CPU/GPU strategy


In [25]:
BATCH_SIZE 

16

In [10]:
# load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')
# preprocess
x, y = preprocess(train)
x_test, _ = preprocess(test)
test_dataset = build_dataset(x_test, None, "test", BATCH_SIZE)

# load external datasets for interpretation purpose
mnli = load_mnli()
xnli = load_xnli()



  0%|          | 0/5 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
import re
import string
punct = '[' + ''.join([c for c in string.punctuation if c != "'"]) + ']'

def preprocess_query(q):
    q = q.lower()
    q = re.sub(punct, ' ', q)
    q = re.sub('[ ]{2,}', ' ', q)
    return q

def search_in_base(q, kb):
    q = preprocess_query(q)
    return int(q in kb)

premises = pd.concat([train[['premise', 'lang_abv']], test[['premise', 'lang_abv']]])

In [12]:
knowledge_base = set(mnli['premise'].apply(preprocess_query))
premises['mnli'] = premises['premise'].apply(lambda q: search_in_base(q, knowledge_base))
print(f"fraction of train set english premises occurence in MNLI = {premises.loc[premises.lang_abv=='en', 'mnli'].mean() * 100}%")

fraction of train set english premises occurence in MNLI = 100.0%


In [13]:
# knowledge_base = set(xnli['premise'].apply(preprocess_query))
# premises['xnli'] = premises['premise'].apply(lambda q: search_in_base(q, knowledge_base))
# print(f"fraction of train set non-english premises occurence in XNLI = {premises.loc[premises.lang_abv!='en', 'xnli'].mean() * 100}%")

In [13]:
from google.colab import drive
drive.mount('/content/drive')
# save results 
strategy = init_strategy()
with strategy.scope():
    model = build_model(MODEL, MAXLEN)
    #model.load_weights("/content/drive/MyDrive/XLMR_mnlixnli_ep6.h5")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Init CPU/GPU strategy


Some layers from the model checkpoint at jplu/tf-xlm-roberta-large were not used when initializing TFXLMRobertaModel: ['lm_head']
- This IS expected if you are initializing TFXLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLMRobertaModel were initialized from the model checkpoint at jplu/tf-xlm-roberta-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


In [28]:
x_mnli, y_mnli = preprocess(mnli)
mnli_dataset = build_dataset(x_mnli, y_mnli, "train", BATCH_SIZE)
dataset = build_dataset(x, y, "valid", BATCH_SIZE)

In [30]:
n_steps = len(x_mnli) // BATCH_SIZE
train_history = model.fit(
    mnli_dataset,
    steps_per_epoch=n_steps,
    validation_data=dataset,
    epochs=2
)

Epoch 1/2




 1101/25771 [>.............................] - ETA: 1:17:31 - loss: 1.1467 - accuracy: 0.3448

KeyboardInterrupt: ignored

In [None]:
x_xnli, y_xnli = preprocess(xnli)
xnli_dataset = build_dataset(x_xnli, y_xnli, "train", BATCH_SIZE)


In [None]:
n_steps = len(x_xnli) // BATCH_SIZE
train_history = model.fit(
    xnli_dataset,
    steps_per_epoch=n_steps,
    validation_data=dataset,
    epochs=EPOCHS
)

In [20]:
    
dataset = build_dataset(x, y, "valid", BATCH_SIZE)
pr = np.argmax(model.predict(dataset), axis=1)
print(f"accuracy {accuracy_score(y, pr):.4f}")

test_preds = model.predict(test_dataset, verbose=0)
submission['prediction'] = test_preds.argmax(axis=1)
submission.to_csv('submission.csv', index=False)

array([0, 2, 0, ..., 2, 2, 0])

In [17]:
mnli

Unnamed: 0,premise,hypothesis,label,lang_abv
0,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...,1,en
1,you know during the season and i guess at at y...,You lose the things to the following level if ...,0,en
2,One of our number will carry out your instruct...,A member of my team will execute your orders w...,0,en
3,How do you know? All this is their information...,This information belongs to them.,0,en
4,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.,1,en
...,...,...,...,...
412344,Do you watch that?,Can you see?,2,en
412345,"To a Western ear, the most predictable of lang...","To the Western ear, the least predictable of l...",2,en
412346,The recorder captured the sounds of loud thump...,The recorder didn't capture any of the sounds.,2,en
412347,That's a good attitude!,"You feel good about this, don't you?",1,en


In [None]:
# n_steps = len(x) // BATCH_SIZE
# train_history = model.fit(
#     dataset_train,
#     steps_per_epoch=n_steps,
#     validation_data=dataset,
#     epochs=24
# )