In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Pick a distribution strategy: try to connect to a TPU first and fall back to
# the default strategy (CPU / single GPU) if the TPU setup raises ValueError.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count for whichever strategy was selected. This used to
# sit inside the except branch, so nothing was printed when a TPU was found.
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
from transformers import BertTokenizer, TFBertModel, AutoTokenizer,TFAutoModel
import tensorflow as tf

In [None]:
# Name of the pretrained checkpoint used for both the tokenizer and the encoder.
# The commented-out line is an alternative checkpoint that is currently disabled.
# model_roBerta ='joeddav/xlm-roberta-large-xnli'
model_Bert = 'bert-base-multilingual-cased'
# Load the tokenizer that is later passed to bert_encode().
tokenizer = BertTokenizer.from_pretrained(model_Bert)
# Load the bare encoder and print its layer summary.
# NOTE(review): `model` is rebound to the result of build_model() in a later
# cell, so this instance appears to be used only for the summary below.
model = TFBertModel.from_pretrained(model_Bert)
model.summary()

In [None]:
# Read the training split of the competition data into a DataFrame.
train = pd.read_csv('../input/nlp-getting-started/train.csv')

In [None]:
# Display every row from position 10 onwards.
# NOTE(review): if the goal was a quick preview of the first rows, this was
# probably meant to be train[:10] -- confirm.
train[10:]

In [None]:
# Features: keep only the id and text columns. The columns are selected with a
# list (not a set) so that their order is deterministic; recent pandas versions
# reject a set here outright. The copy lets later cells assign to x_train
# without touching `train`.
x_train = train[['id', 'text']].copy()
# Labels: a single-column DataFrame holding the target.
y_train = train[['target']]

In [None]:
import re
import string
def remove_noise(text):
    """Strip markup, links and other noise from a single string.

    The substitutions run in this order, each one deleting its matches:
      1. bracketed spans such as ``[...]``
      2. URLs starting with ``http://``, ``https://`` or ``www.``
      3. angle-bracket tags such as ``<...>``
      4. every character in ``string.punctuation``
      5. newline characters
      6. any word that contains a digit
      7. the literal two-character sequence ``ûò``

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. Matches are deleted rather than replaced by a
        space, so whitespace surrounding a removed span is left untouched.
    """
    # Raw strings keep regex escapes such as \[ \S \w \d out of Python's own
    # string-escape processing, which warns on unrecognised escape sequences.
    noise_patterns = (
        r'\[.*?\]',
        r'https?://\S+|www\.\S+',
        r'<.*?>+',
        '[%s]' % re.escape(string.punctuation),
        r'\n',
        r'\w*\d\w*',
        'ûò',
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, '', text)
    return text
# Run the noise filter over every training tweet, then preview the result.
x_train['text'] = x_train['text'].apply(remove_noise)
x_train.head()

In [None]:
# Token length passed to the tokenizer as max_length (with padding='max_length'
# and truncation=True).
SEQ_LEN = 100


def bert_encode(df, tokenizer):
    """Tokenize the ``text`` column of ``df`` into BERT model inputs.

    Args:
        df: Table with a ``text`` column; the column is read via ``.tolist()``.
        tokenizer: Callable tokenizer, invoked once on the full list of texts
            with ``max_length=SEQ_LEN`` and ``return_tensors='tf'``.

    Returns:
        Dict with the ``input_ids``, ``attention_mask`` and ``token_type_ids``
        entries taken from the tokenizer output, in that order.
    """
    encoded = tokenizer(
        df['text'].tolist(),
        max_length=SEQ_LEN,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_attention_mask=True,
        return_token_type_ids=True,  # only for BERT
        return_tensors='tf',
    )
    wanted_fields = ('input_ids', 'attention_mask', 'token_type_ids')
    return {field: encoded[field] for field in wanted_fields}

In [None]:
# Tokenize the cleaned training tweets; the bare name on the last line makes
# the resulting dict the cell's displayed output.
x_train_input = bert_encode(x_train, tokenizer)
x_train_input

In [None]:
from tensorflow.keras import regularizers

def build_model(learning_rate=1e-6):
    """Build and compile the BERT + 1-D CNN two-class classifier.

    A pretrained encoder (checkpoint ``model_Bert``) turns the three
    SEQ_LEN-long integer inputs into a per-token sequence. Two Conv1D layers
    with max pooling reduce that sequence to a vector, and a two-unit softmax
    layer produces the class probabilities.

    Args:
        learning_rate: Step size for the Adam optimizer. Defaults to 1e-6,
            the value that was previously hard-coded.

    Returns:
        A compiled ``tf.keras.Model`` taking
        ``[input_ids, attention_mask, token_type_ids]`` and returning one
        probability per class.
    """
    encoder = TFBertModel.from_pretrained(model_Bert)

    input_ids = tf.keras.Input(shape=(SEQ_LEN,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.Input(shape=(SEQ_LEN,), dtype=tf.int32, name="attention_mask")
    token_type_ids = tf.keras.Input(shape=(SEQ_LEN,), dtype=tf.int32,
                                    name="token_type_ids")  # only for BERT
    inputs = [input_ids, attention_mask, token_type_ids]  # only for BERT

    # The first element of the encoder output is the sequence that the
    # convolutional head below consumes.
    embedding = encoder(inputs)[0]

    x = tf.keras.layers.Conv1D(32, 7, activation=tf.nn.relu)(embedding)
    x = tf.keras.layers.MaxPool1D(5)(x)
    x = tf.keras.layers.Conv1D(32, 5, activation=tf.nn.relu)(x)
    x = tf.keras.layers.GlobalMaxPool1D()(x)

    # sparse_categorical_crossentropy expects the outputs to form a probability
    # distribution over the classes, so the two-unit head uses softmax; two
    # independent sigmoid units do not sum to one.
    output = tf.keras.layers.Dense(2, activation='softmax')(x)

    model = tf.keras.Model(inputs=inputs, outputs=output)
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Build the model inside strategy.scope() so it is created under the selected
# distribution strategy (TPU, GPU or CPU, depending on the accelerator that is
# set in the notebook settings), then print its layer summary.
with strategy.scope():
    model = build_model() # our model is being built
    model.summary()       # let's look at some of its properties

In [None]:
# Keep at most the first SEQ_LEN positions (axis 1) of every encoded input.
for field, tensor in list(x_train_input.items()):
    x_train_input[field] = tensor[:, :SEQ_LEN]

In [None]:
# Train for 20 epochs; Keras holds out 20% of the samples for validation.
history = model.fit(
    x_train_input,
    y_train,
    batch_size=128,
    epochs=20,
    validation_split=0.2,
)

In [None]:
import matplotlib.pyplot as plt
# Plot the per-epoch training and validation loss recorded by model.fit().
loss = history.history['loss']
val_loss = history.history['val_loss']
# 1-based epoch numbers for the x axis; `epochs` is reused by the accuracy plot.
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'bo', label='Training loss')        # blue dots
plt.plot(epochs, val_loss, 'b', label='Validation loss')   # solid blue line
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Plot the per-epoch training and validation accuracy recorded by model.fit().
plt.clf()  # Clear the current figure
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
# `epochs` is the x-axis range defined in the loss-plot cell.
plt.plot(epochs, acc, 'bo', label='Training acc')        # blue dots
plt.plot(epochs, val_acc, 'b', label='Validation acc')   # solid blue line
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Load the test split and put it through the same cleaning and tokenization
# steps as the training data.
test = pd.read_csv('../input/nlp-getting-started/test.csv')
# Select the columns with a list (not a set) so that their order is
# deterministic; recent pandas versions reject a set here outright. The copy
# keeps `test` itself unmodified by the cleaning step.
x_test = test[['id', 'text']].copy()
x_test['text'] = x_test['text'].apply(remove_noise)
x_test_input = bert_encode(x_test, tokenizer)
# The last expression is displayed as the cell output.
x_test_input

In [None]:
# Keep at most the first SEQ_LEN positions (axis 1) of every encoded input.
for field, tensor in list(x_test_input.items()):
    x_test_input[field] = tensor[:, :SEQ_LEN]

In [None]:
# The predicted class for each row is the index of its highest output score.
# A single vectorized argmax over the last axis replaces the per-row Python loop.
predictions = np.argmax(model.predict(x_test_input), axis=-1)

In [None]:
# Assemble the submission table: the test ids plus the predicted target.
submission = test[['id']].copy()
submission['target'] = predictions

In [None]:
# Write the submission to the working directory, without the index column.
submission.to_csv("./submission.csv", index = False)

In [None]:
# Display the rows whose predicted target is 1.
submission[submission.target == 1]