In [None]:
# Shell escape: run `nvidia-smi` and show its output in the cell
# (presumably to check which GPU, if any, the kernel has — confirm).
!nvidia-smi

# Installing dependencies

In [None]:
!pip install transformers
!pip install text_hammer 

# Importing libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import text_hammer as th


import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy,BinaryCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy,BinaryAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import plot_model

# Importing data

In [None]:
# Read train.csv and test.csv from the '../input/nlp-getting-started' directory
# into pandas DataFrames.
train=pd.read_csv('../input/nlp-getting-started/train.csv')
test=pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
# Display the first rows of the training frame.
train.head()

In [None]:
# (rows, columns) of the training frame as loaded.
train.shape

In [None]:
# (rows, columns) of the test frame as loaded.
test.shape

In [None]:
# Count missing values per column before any filling is applied.
train.isnull().sum()

In [None]:
# Replace every missing value in the training frame with a single-space string.
train = train.fillna(value=" ")

In [None]:
# Re-count missing values per column after the fill.
train.isnull().sum()

In [None]:
# Number of rows for each distinct value of the `target` column.
train['target'].value_counts()

In [None]:
# Bar chart of how many rows fall under each `target` value.
# The Series is passed explicitly as `x=`: recent seaborn releases treat the
# first positional argument as `data`, so the positional form no longer plots
# the per-class counts reliably.
sns.countplot(x=train.target)

In [None]:
# Remove rows whose `text` repeats, retaining only the final occurrence of each.
# The original index labels are kept as-is (the index is not reset here).
train = train.drop_duplicates(subset='text', keep='last')

In [None]:
train.shape

In [None]:
# Very tall canvas so that each keyword gets its own readable horizontal bar.
keyword_fig_size = (15, 100)
plt.figure(figsize=keyword_fig_size)
# One bar per keyword, split by target value.
sns.countplot(y='keyword', hue='target', data=train)

# Text Preprocessing

In [None]:
def preprocess(text):
  """Normalise one piece of text for tokenization.

  The input is coerced to a lower-cased string and passed, in order, through
  text_hammer's ``remove_urls``, ``cont_exp``, ``remove_emails``,
  ``remove_html_tags``, ``remove_special_chars`` and ``remove_accented_chars``.
  Finally the result is lower-cased again, backslashes are dropped and
  underscores are turned into spaces.

  Args:
    text: value to clean; anything accepted by ``str()``.

  Returns:
    The cleaned string.
  """
  cleaned = str(text).lower()
  # Order matters: each step receives the previous step's output.
  pipeline = (
      th.remove_urls,
      th.cont_exp,
      th.remove_emails,
      th.remove_html_tags,
      th.remove_special_chars,
      th.remove_accented_chars,
  )
  for step in pipeline:
    cleaned = step(cleaned)
  cleaned = str(cleaned).lower()
  cleaned = cleaned.replace('\\', '')
  cleaned = cleaned.replace('_', ' ')
  return cleaned

In [None]:
# Train preprocessing: clean both free-text columns of the training frame.
for column in ('text', 'keyword'):
    train[column] = train[column].apply(preprocess)

In [None]:
# Test preprocessing: apply the same cleaning to the test tweets' text column.
test['text'] = test['text'].apply(preprocess)

In [None]:
# train.text=train.text+" "+train.keyword

In [None]:
# Labels for training, taken from the `target` column as a NumPy array.
y_train = train['target'].values

In [None]:
# Show the first cleaned tweet. Positional access is required here: rows were
# removed by drop_duplicates without resetting the index, so the label 0 is not
# guaranteed to exist and `train.text[0]` could raise KeyError.
train.text.iloc[0]

In [None]:
from transformers import AutoTokenizer,TFBertModel

# Single checkpoint name shared by the tokenizer and the encoder so the two
# cannot drift apart.
MODEL_NAME = 'bert-large-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert = TFBertModel.from_pretrained(MODEL_NAME)

In [None]:
# Tokenize one sample tweet to inspect the encoding. Positional access avoids a
# KeyError when index label 5 was removed by drop_duplicates (the index is not
# reset afterwards).
tokenizer(train.text.iloc[5])

In [None]:
# Show the second cleaned tweet by position; label-based `train.text[1]` can
# raise KeyError because the index is not reset after drop_duplicates.
train.text.iloc[1]

In [None]:
# Longest tweet measured in whitespace-separated words (not in BERT tokens).
longest_tweet_words = max(len(tweet.split()) for tweet in train.text)
print("max len of tweets", longest_tweet_words)

# Sequence length chosen for tokenization.
max_length = 40

In [None]:
# Tokenize the cleaned training tweets into fixed-width tensors.
# padding='max_length' (instead of padding=True) pads every sequence to exactly
# `max_length` tokens. padding=True only pads to the longest sequence in the
# batch, so the resulting width was not guaranteed to match the fixed-size
# model inputs (or the width produced for the test set).
x_train = tokenizer(
    text=train.text.tolist(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True)

In [None]:
# Shape of the tokenized training ids tensor.
x_train['input_ids'].shape

# MODEL

In [None]:
# Fixed sequence width expected by the model inputs; the tokenized tensors fed
# to fit/predict must have exactly this many columns.
max_len = 40

# Two integer inputs whose names match the dictionary keys passed to fit/predict.
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

# Element [1] of the BERT output is used as the sentence representation
# (for Hugging Face's TFBertModel this is the pooled output — confirm if the
# encoder is swapped for another architecture).
embeddings = bert(input_ids, attention_mask=input_mask)[1]

# Classification head. The first Dense layer now consumes the dropout output;
# previously it was wired straight to `embeddings`, which silently discarded
# the Dropout(0.07) layer.
out = tf.keras.layers.Dropout(0.07)(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)

# Single sigmoid unit: the model outputs a probability, not a logit.
y = Dense(1, activation='sigmoid')(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)

# Fine-tune the encoder together with the head. The layer is referenced
# directly instead of via model.layers[2], which depended on layer ordering.
bert.trainable = True

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Optimizer for fine-tuning: small learning rate with gradient-norm clipping.
# NOTE(review): the `decay` argument is only accepted by older/legacy Keras
# optimizers — confirm against the installed TensorFlow version.
optimizer = Adam(
    learning_rate=3e-5,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Set loss and metrics.
# The model's last layer already applies a sigmoid, so the loss must treat its
# input as probabilities. With from_logits=True the sigmoid was applied twice,
# distorting the loss and its gradients.
loss = BinaryCrossentropy(from_logits=False)
# A proper list; previously a trailing comma turned this into a 1-tuple.
metric = [BinaryAccuracy('accuracy')]

# Compile the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metric)

In [None]:
# Render the model graph, including tensor shapes on each edge.
plot_model(model, show_shapes = True)

In [None]:
import tensorflow as tf

# Physical GPU devices visible to TensorFlow; the list is shown as the cell output.
visible_gpus = tf.config.experimental.list_physical_devices('GPU')
visible_gpus

# Model training

In [None]:
# Feed only the two tensors the model declares as inputs, keyed by input name.
fit_inputs = {
    key: x_train[key] for key in ('input_ids', 'attention_mask')
}

# Train for two epochs on the full training set, in mini-batches of 32.
train_history = model.fit(
    x=fit_inputs,
    y=y_train,
    epochs=2,
    batch_size=32,
)

In [None]:
# Tokenize the cleaned test tweets with the same settings used for training.
# padding='max_length' guarantees the tensors are exactly `max_length` wide;
# padding=True would pad only to the longest test sequence, which may differ
# from the width the model was built and trained with.
# NOTE: this rebinds `test` from the DataFrame to the tokenizer output, so the
# cell cannot be re-run without reloading and re-cleaning the test frame.
test = tokenizer(
    text=test.text.tolist(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True)

# Prediction

In [None]:
# Select the two model inputs from the tokenized test set, keyed by input name.
predict_inputs = {
    key: test[key] for key in ('input_ids', 'attention_mask')
}
# Model outputs for every test row.
pred = model.predict(predict_inputs)

In [None]:
# Turn model outputs into hard 0/1 labels: strictly above the cut-off -> 1.
decision_threshold = 0.5
y_predicted = np.where(pred > decision_threshold, 1, 0)

In [None]:
# Flatten the predictions to one dimension. reshape(-1) infers the length from
# the data instead of hard-coding the number of test rows (3263), so the cell
# keeps working if the test set size changes.
y_predicted = y_predicted.reshape(-1)
y_predicted

# Submission

In [None]:
# Start from the provided sample file so the output keeps its columns and row order.
sample_submission_path = '../input/nlp-getting-started/sample_submission.csv'
submission = pd.read_csv(sample_submission_path)

# Overwrite the target column with the model's predictions.
submission["target"] = y_predicted

# Write the result with a header row and without the DataFrame index.
submission.to_csv('submission.csv', header=True, index=False)