In [25]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of each file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# The following TensorFlow tutorials were used as references:
# https://www.tensorflow.org/tfmodels/nlp/fine_tune_bert
# https://www.tensorflow.org/text/tutorials/classify_text_with_bert
# https://www.tensorflow.org/tfmodels/nlp/fine_tune_bert#import_libraries

import numpy as np
import pandas as pd 
from sklearn.preprocessing import StandardScaler



import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow import keras
from tensorflow.keras import layers


from transformers import BertTokenizer, AutoTokenizer, AutoModelForSequenceClassification, TFBertModel, BertConfig

from tqdm import tqdm
# plots and images
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image

#sklearn processing
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import os

# import wandb


# #configs
# max_tokens = 20000
# embed_dim  = 300
# num_heads  = 2
# dense_dim  = 32
# development_mode = True


# Training / tokenization settings.
BATCH_SIZE = 12  # NOTE(review): not used by the visible model.fit call, which passes batch_size=6
BUFFER_SIZE = 3200  # NOTE(review): not referenced in the visible cells
SEQ_LEN = 512  # max_length given to the BERT tokenizer (pad/truncate length)
EPOCHS = 25  # number of epochs passed to model.fit
AUTO = tf.data.AUTOTUNE  # NOTE(review): not referenced in the visible cells

/kaggle/input/bertbasecased/config.json
/kaggle/input/bertbasecased/tokenizer.json
/kaggle/input/bertbasecased/tokenizer_config.json
/kaggle/input/bertbasecased/pytorch_model.bin
/kaggle/input/bertbasecased/vocab.txt
/kaggle/input/feedback-prize-english-language-learning/sample_submission.csv
/kaggle/input/feedback-prize-english-language-learning/train.csv
/kaggle/input/feedback-prize-english-language-learning/test.csv
/kaggle/input/glove/glove.6B.300d.txt


In [6]:
# The six score columns that form the regression target.
targets = [
    'cohesion',
    'syntax',
    'vocabulary',
    'phraseology',
    'grammar',
    'conventions',
]

# Competition train / test CSVs.
train = pd.read_csv('/kaggle/input/feedback-prize-english-language-learning/train.csv')
test = pd.read_csv('/kaggle/input/feedback-prize-english-language-learning/test.csv')

# Model inputs are the raw essay text; labels are the score columns.
train_x, test_x = train['full_text'], test['full_text']
train_y = train[targets]

In [3]:
# Parse the pre-trained GloVe vectors into a {word: vector} lookup table.
path_to_glove_file = '/kaggle/input/glove/glove.6B.300d.txt'

embeddings_index = {}
# Decode explicitly as UTF-8 instead of relying on the platform-dependent
# default encoding, so the file parses the same way on every machine.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ..."; split off only the leading word.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print(f'Found {len(embeddings_index)} word vectors')

Found 400000 word vectors


In [7]:
# Hyper-parameters for the GloVe embedding / Transformer-encoder branch.
max_tokens, embed_dim = 20000, 300
num_heads, dense_dim = 2, 32

# Word-level vectorizer producing integer token ids, with every text
# padded or truncated to 6000 positions.
text_vectorization = layers.TextVectorization(
    output_mode="int",
    max_tokens=max_tokens,
    output_sequence_length=6000,
)

# Learn the vocabulary from the training essays only.
text_vectorization.adapt(train_x)

# Vectorize both splits with that vocabulary.
train_full_x, test_full_x = text_vectorization(train_x), text_vectorization(test_x)

2022-11-29 02:03:02.289721: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [8]:
# Map every vocabulary word to its integer id (its position in the vocabulary).
vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))

# Row i of the matrix holds the GloVe vector of the word with id i; words that
# have no GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((max_tokens, embed_dim))
for word, i in word_index.items():
    if i >= max_tokens:
        # Out of range for the matrix: skip the lookup AND the assignment.
        # (Previously only the lookup was guarded, so a stale vector from the
        # preceding iteration could be written, or an IndexError raised.)
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [13]:
# Load the BERT tokenizer from the local Kaggle dataset directory.
tokenizer = BertTokenizer.from_pretrained("/kaggle/input/bertbasecased/")

def get_ids_mask(inputs):
    """Tokenize each text with the module-level BERT ``tokenizer``.

    Every text is padded/truncated to ``SEQ_LEN`` tokens.

    Args:
        inputs: iterable of strings (e.g. a pandas Series of essays).

    Returns:
        Tuple ``(input_ids, attention_mask)`` of 2-D NumPy arrays, each of
        shape ``(len(inputs), SEQ_LEN)``. The batch axis is always present,
        including for a single text or an empty input.
    """
    input_ids = []
    attention_mask = []
    for x in tqdm(inputs):
        tokens = tokenizer(x, padding="max_length", truncation=True, max_length=SEQ_LEN, return_tensors="np")
        # Each entry has shape (1, SEQ_LEN) for a single text.
        input_ids.append(tokens["input_ids"])
        attention_mask.append(tokens["attention_mask"])

    if not input_ids:
        # Nothing to tokenize: return correctly shaped empty arrays.
        empty = np.zeros((0, SEQ_LEN), dtype=np.int64)
        return empty, empty.copy()

    # Concatenate along the batch axis. A bare ``.squeeze()`` would also drop
    # the batch axis when only one text is given, yielding a 1-D array.
    input_ids = np.concatenate(input_ids, axis=0)
    attention_mask = np.concatenate(attention_mask, axis=0)
    return input_ids, attention_mask


# Tokenize both splits for the BERT branch (token ids + attention masks).
train_input_ids, train_attention_mask = get_ids_mask(train_x)
test_input_ids, test_attention_mask = get_ids_mask(test_x)


100%|██████████| 3911/3911 [00:36<00:00, 105.90it/s]
100%|██████████| 3/3 [00:00<00:00, 90.03it/s]


In [14]:

class MeanPool(keras.layers.Layer):
    """Average a sequence over axis 1, counting only positions kept by the mask."""

    def call(self, x, mask=None):
        """Return the masked mean of ``x`` along axis 1.

        Args:
            x: sequence tensor; assumed (batch, seq, dim) as in ``build_model``
                — TODO confirm for other callers.
            mask: optional tensor with one entry per position (assumed
                (batch, seq)); non-zero marks positions to include. When
                ``None``, all positions are averaged.

        Returns:
            Tensor with axis 1 reduced away.
        """
        if mask is None:
            # No mask given (the declared default): every position counts.
            return tf.math.reduce_mean(x, axis=1)

        # Add a trailing axis so the mask broadcasts over the feature axis.
        broad_mask = tf.cast(tf.expand_dims(mask, -1), "float32")
        summed = tf.math.reduce_sum(x * broad_mask, axis=1)
        # Clamp the count so a fully masked row does not divide by zero.
        counts = tf.math.maximum(tf.reduce_sum(broad_mask, axis=1), tf.constant([1e-9]))
        return summed / counts


In [15]:
# Load the BERT configuration from the local Kaggle dataset directory.
config = BertConfig.from_pretrained("/kaggle/input/bertbasecased/") # need to change for kaggle
# Set both dropout probabilities in the configuration to zero.
config.attention_probs_dropout_prob = 0.0
config.hidden_dropout_prob = 0.0
# Bare expression so the notebook displays the resulting configuration.
config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

In [17]:
class TransformerEncoder(layers.Layer):
    """One Transformer encoder block.

    Self-attention followed by a two-layer feed-forward projection; each
    sub-layer is wrapped in a residual connection and layer normalization.

    Args:
        embed_dim: size of the input/output feature axis (also used as the
            attention ``key_dim``).
        dense_dim: width of the hidden feed-forward layer.
        num_heads: number of attention heads.
        **kwargs: forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation='relu'),
            layers.Dense(embed_dim),]
        )

        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        # Pass the incoming Keras mask (e.g. from Embedding(mask_zero=True))
        # through to the next layer. Without this the mask is dropped here and
        # a following mask-aware layer would treat padding as real tokens.
        self.supports_masking = True

    def call(self, inputs, mask=None):
        """Apply self-attention and the feed-forward projection to ``inputs``.

        Args:
            inputs: sequence tensor whose last axis has size ``embed_dim``.
            mask: optional per-position mask (assumed (batch, seq)); a new
                axis is inserted so it broadcasts over the query positions.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
        attention_output = self.attention(
            inputs, inputs, attention_mask=mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        """Return the layer config including the constructor arguments."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim
        })
        return config


# Reset Keras global state before the model is (re)built in this cell.
tf.keras.backend.clear_session()
def build_model():
    """Assemble and compile the two-branch essay-scoring regressor.

    Branch 1 feeds BERT token ids and attention mask through a frozen
    ``TFBertModel`` and mean-pools the last hidden state with ``MeanPool``.
    Branch 2 feeds word-level token ids through a frozen embedding initialized
    from ``embedding_matrix``, a ``TransformerEncoder`` block, average pooling
    and dropout. Both are concatenated and passed through three ReLU dense
    layers to a 6-unit linear output.

    Reads the module-level ``config``, ``max_tokens``, ``embed_dim``,
    ``dense_dim``, ``num_heads`` and ``embedding_matrix``.

    Returns:
        A compiled ``keras.Model`` with inputs named ``tokens``,
        ``attention_mask`` and ``tokens_full`` (MSE loss, Adam at 1e-3,
        RMSE metric).
    """
    # ---- BERT branch -----------------------------------------------------
    tokens = keras.Input(shape=(None,), dtype="int32", name="tokens")
    attention_mask = keras.Input(shape=(None,), dtype="int32", name="attention_mask")

    bert = TFBertModel.from_pretrained(
        "/kaggle/input/bertbasecased/", from_pt=True, config=config
    )
    bert.trainable = False  # keep the pre-trained weights fixed

    bert_outputs = bert.bert(
        {"input_ids": tokens, "attention_mask": attention_mask}
    )
    # Element 0 is the last hidden state; pool it using the attention mask.
    x_bert = MeanPool()(bert_outputs[0], mask=attention_mask)

    # ---- GloVe + Transformer-encoder branch -------------------------------
    glove_embedding = layers.Embedding(
        max_tokens,
        embed_dim,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        trainable=False,
        mask_zero=True,
    )
    tokens_full = keras.Input(shape=(None,), dtype="int64", name='tokens_full')

    glove_vectors = glove_embedding(tokens_full)
    encoder_block = TransformerEncoder(embed_dim, dense_dim, num_heads)
    x_transformer = encoder_block(glove_vectors)
    x_transformer = layers.GlobalAveragePooling1D()(x_transformer)
    x_transformer = layers.Dropout(0.5)(x_transformer)

    # ---- Regression head ----------------------------------------------------
    head = tf.keras.layers.Concatenate(axis=1)([x_bert, x_transformer])
    for units in (768, 512, 256):
        head = layers.Dense(units, activation='relu')(head)
    outputs = layers.Dense(6)(head)

    model = keras.Model([tokens, attention_mask, tokens_full], outputs=outputs)
    model.compile(
        loss=keras.losses.MeanSquaredError(),
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        metrics=keras.metrics.RootMeanSquaredError(),
    )
    return model

# Build the compiled model and print its layer summary.
model = build_model()
model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
tokens_full (InputLayer)        [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 300)    6000000     tokens_full[0][0]                
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
tokens (InputLayer)             [(None, None)]       0                                            
______________________________________________________________________________________________

In [18]:
# Inputs keyed by the names of the model's three Input layers.
train_inputs = {
    'tokens': train_input_ids,
    'attention_mask': train_attention_mask,
    'tokens_full': train_full_x,
}

# Train on the full training set with no validation split.
# NOTE(review): batch_size is hard-coded to 6 here and does not use BATCH_SIZE.
history = model.fit(
    x=train_inputs,
    y=train_y,
    validation_split=None,
    epochs=EPOCHS,
    batch_size=6,
)


Epoch 1/5


2022-11-29 02:14:16.839326: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [23]:
# Predict the six scores for every test essay.
test_predictions = model.predict(x={
                'tokens': test_input_ids,
                'attention_mask': test_attention_mask,
                'tokens_full': test_full_x
            })

# The model was fit on train[targets], so output column k corresponds to
# targets[k]. Name the columns from that single list instead of repeating
# the six names with positional indices.
test_predictions = pd.DataFrame(test_predictions, columns=targets, index=test.index)
for target in targets:
    test[target] = test_predictions[target]

# Show a preview rather than the whole frame.
test.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,when a person has no experience on a job their...,2.88392,2.803073,2.953128,2.889484,2.650794,2.689518
1,000BAD50D026,Do you think students would benefit from being...,2.686886,2.557787,2.665262,2.489653,2.239902,2.637219
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde...",3.368145,3.288126,3.355875,3.32521,3.083086,3.307231


In [24]:
# Drop the essay text and write the remaining columns as the submission file.
res = test.drop(columns=["full_text"])
res.to_csv(path_or_buf="/kaggle/working/submission.csv", index=False)