In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import plot_model

from transformers import TFBertModel, BertConfig, BertTokenizerFast

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Load the competition's train and test tables from the read-only input directory.
# Later cells read the 'excerpt' and 'target' columns of the train frame and the
# 'excerpt' and 'id' columns of the test frame.
data_train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
data_test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

In [None]:
# Local directory from which the BERT config and weights are loaded in later cells.
bert_model = '../input/bertmodel2'
# NOTE(review): the tokenizer is loaded from a different directory than the model —
# confirm that both were built from the same vocabulary.
tokenizer = BertTokenizerFast.from_pretrained('../input/bert-uncased')

In [None]:
def encoder(data_train, tokenizer, label = 'excerpt', maxLen = 210):
    """Tokenise one text column of a DataFrame into fixed-length model inputs.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Frame holding the texts. Despite the name, any frame with a ``label``
        column can be passed (the notebook uses it for train and test alike).
    tokenizer : callable
        Called once with the list of texts and the keyword arguments
        ``max_length``, ``truncation``, ``padding`` and ``add_special_tokens``.
        It must return a mapping with the keys ``'input_ids'``,
        ``'token_type_ids'`` and ``'attention_mask'``, each holding one
        sequence per input text.
    label : str
        Name of the column that contains the texts.
    maxLen : int
        Length every sequence is truncated / padded to.

    Returns
    -------
    tuple of numpy.ndarray
        ``(input_ids, token_type_ids, attention_mask)``. With a tokenizer that
        honours ``padding='max_length'`` each array has shape
        ``(len(data_train), maxLen)``.
    """
    texts = data_train[label].tolist()

    # Nothing to tokenise: return correctly shaped empty arrays instead of
    # handing an empty batch to the tokenizer.
    if not texts:
        return tuple(np.empty((0, maxLen), dtype = np.int64) for _ in range(3))

    # A single batched call replaces the per-row Python loop; the tokenizer
    # returns one list of ids per text for every key.
    token = tokenizer(texts, max_length = maxLen, truncation = True, padding = 'max_length', add_special_tokens = True)
    return np.array(token['input_ids']), np.array(token['token_type_ids']), np.array(token['attention_mask'])

In [None]:
# Encode both frames; each result is an (input_ids, token_type_ids, attention_mask) tuple.
# The default maxLen of encoder (210) has to equal the max_len used for the model's Input layers.
train_d = encoder(data_train, tokenizer)
test_d = encoder(data_test, tokenizer)

In [None]:
# Eyeball check of the tokenisation: compare a window of raw words from the
# first excerpt with a window of tokens from the encoded ids.
# The two windows are not expected to line up exactly, because the encoded
# sequence includes special tokens.
encoded_ids = train_d[0]
flat_ids = encoded_ids.flatten()
tok = flat_ids[30:50]
first_excerpt_words = data_train['excerpt'][0].split()

print(encoded_ids.shape, '\n')
print('original sequence:')
print(first_excerpt_words[30:50], '\n')
print('tokenised sequence:')
print(tokenizer.convert_ids_to_tokens(tok), '\n')
print(tokenizer.convert_ids_to_tokens(flat_ids[0:10]))

In [None]:
# Load the BERT configuration from the directory stored in bert_model.
model_config = BertConfig.from_pretrained(bert_model)

In [None]:
# Switch on the config flag that requests the hidden states as additional model outputs.
# NOTE(review): only element [0] of the BERT output is used when the model is
# built below — confirm this flag is still needed.
model_config.output_hidden_states = True

In [None]:
# Display the configuration so its settings can be checked before the model is loaded.
model_config

In [None]:
# Load the pretrained TF BERT model from the bert_model directory, using the
# configuration object prepared in the previous cells.
bert = TFBertModel.from_pretrained(bert_model, config = model_config)

In [None]:
# Tunable parameters
# ------------------
# Sequence length of the model inputs; it has to match the length the texts
# were padded / truncated to by encoder.
max_len = 210

# Training schedule.
epochs = 2
learning_rate = 1e-4

# Objective: mean squared error, reported as root mean squared error.
loss = 'mse'
metrics = [RootMeanSquaredError()]
optimizer = Adam(learning_rate=learning_rate)

In [None]:
# Three integer inputs of length max_len, named after the fields the tokenizer returns.
input_ids_i = Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
token_type_ids_i = Input(shape=(max_len,), dtype=tf.int32, name='token_type_ids')
attention_mask_i = Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')
inputs = [input_ids_i, token_type_ids_i, attention_mask_i]

# Element 0 of the BERT output is a rank-3 tensor (it is indexed with three
# axes below); presumably (batch, max_len, hidden) — TODO confirm.
bert_output = bert(
    input_ids_i,
    token_type_ids=token_type_ids_i,
    attention_mask=attention_mask_i,
)[0]

# Regression head on the vector at sequence position 0.
first_position = bert_output[:, 0, :]
after_dropout = Dropout(0.1)(first_position)
hidden = Dense(10, activation='sigmoid')(after_dropout)
# The sigmoid keeps the prediction in (0, 1), the range of the MinMax-scaled targets.
output = Dense(1, activation='sigmoid')(hidden)

model = Model(inputs=inputs, outputs=output)
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
model.summary()

In [None]:
# Sanity check: shape of the BERT output tensor that feeds the regression head.
print(bert_output.shape)

In [None]:
# Scale the training targets to [0, 1]; the fitted scaler is kept in `norm`
# so predictions can be mapped back to the original scale later.
norm = preprocessing.MinMaxScaler()
target_column = data_train['target'].values.reshape(-1, 1)
data_train_norm = pd.DataFrame(norm.fit_transform(target_column))
data_train_norm

In [None]:
# Hold out the last `val_prob` fraction of the training rows for validation.
train_l = data_train_norm

val_prob = 0.1
n_samples = len(train_l)
split = int(n_samples * (1 - val_prob))
print(split)
# Size of the validation part, derived from the data instead of a hard-coded row count.
print(n_samples - split)

# Stack the three encoded arrays once, (3, n_samples, max_len), and slice
# both partitions from the same array.
encoded = np.array(train_d)
train_x = tuple(encoded[:, :split, :])
val_x = tuple(encoded[:, split:, :])

# Both label sets are plain ndarrays so train and validation targets have the same type.
train_y = np.array(train_l[:split])
val_y = np.array(train_l[split:])
print(val_x[0].shape)
val_y.shape

In [None]:
# Train on the first part of the data; loss and RMSE on the held-out tail are
# reported after each epoch.
model.fit(train_x, train_y, validation_data = (val_x, val_y), epochs = epochs)

In [None]:
# Predict for the encoded test excerpts. The model was trained on MinMax-scaled
# targets, so these values are on that scaled range, not the original one.
pred_norm = model.predict(test_d)
pred_norm

In [None]:
# Map the scaled predictions back to the original target scale with the scaler
# that was fitted on the training targets.
pred = norm.inverse_transform(pred_norm)
pred

In [None]:
# Drop the columns that are not needed for the submission.
data_test = data_test.drop(['url_legal', 'license', 'excerpt'], axis=1)

In [None]:
# Attach the predictions as the 'target' column, round them to one decimal
# and index the frame by 'id'.
# NOTE(review): rounding to one decimal discards precision in the submitted
# values — confirm this is intended.
data_test = (
    data_test
    .assign(target=pred)
    .assign(target=lambda frame: frame['target'].round(1))
    .set_index('id')
)
data_test.head()

In [None]:
# Write the id-indexed predictions to submission.csv in the working directory
# (comma-separated, which is the to_csv default).
data_to_submit = data_test
data_to_submit.to_csv('submission.csv')