In [None]:
# https://www.kaggle.com/code/tanulsingh077/deep-learning-for-nlp-zero-to-transformers-bert/notebook

In [None]:
# %%bash
## Install miniconda
# mkdir -p $HOME/miniconda3
# wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O $HOME/miniconda3/miniconda.sh
# bash $HOME/miniconda3/miniconda.sh -b -u -p $HOME/miniconda3

# $HOME/miniconda3/bin/conda init bash

In [None]:
# %%bash
# # Set up conda environment and install dependencies from conda-requirements.txt
# conda create -y \
#   --name dl-for-nlp \
#   --channel conda-forge \
#   --file conda-requirements.txt

# conda activate dl-for-nlp

In [None]:
# %%bash
## Set up GPU
## https://www.tensorflow.org/install/pip#linux

# conda install -y cudatoolkit=11.2 cudnn=8.1.0 
# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/

In [None]:
# %%bash
# # Install Python dependencies
# conda install -y jupyter \
#   kaggle \
#   numpy \
#   pandas \
#   plotly \
#   python-dotenv \
#   scikit-learn \
#   tensorflow \
#   tqdm

# conda list -e > conda-requirements.txt

In [None]:
# import tensorflow as tf
# print(tf.config.list_physical_devices('GPU'))

In [None]:
import numpy
import os
import pandas
import plotly
import tensorflow as tf
from dotenv import load_dotenv
from sklearn import model_selection
from tqdm import tqdm

# Load variables from the .env file into the process environment.
envfile = '.env'
load_dotenv(dotenv_path = envfile)

# Input/output locations; each falls back to the current directory when unset.
input_dir = os.environ.get('INPUT_DIR', '.')
output_dir = os.environ.get('OUTPUT_DIR', '.')

# Number of training rows to sample; falls back to 100 when unset.
data_sample_size = int(os.environ.get('DATA_SAMPLE_SIZE', 100))

In [None]:
# Choose a distribution strategy: use a TPU when one can be resolved,
# otherwise fall back to TensorFlow's current default strategy.
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
  print('Running on TPU ', tpu.master())
except ValueError:
  # A ValueError from the resolver is treated as "no TPU available".
  tpu = None

if tpu:
  # Connect to the TPU cluster and initialise it before building the strategy.
  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)
  # tf.distribute.experimental.TPUStrategy is deprecated; tf.distribute.TPUStrategy
  # is the stable replacement.
  strategy = tf.distribute.TPUStrategy(tpu)
else:
  strategy = tf.distribute.get_strategy()

print('Replicas: ', strategy.num_replicas_in_sync)

In [None]:
# Read the train, validation and test CSV files from input_dir, in that order.
train, valid, test = (
  pandas.read_csv(f'{input_dir}/{csv_name}')
  for csv_name in ('jigsaw-toxic-comment-train.csv', 'validation.csv', 'test.csv')
)

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# Label columns removed from the training frame; only `toxic` is used as the
# target further down.
drop_columns = ['identity_hate', 'insult', 'obscene', 'severe_toxic', 'threat']

train = train.drop(columns = drop_columns)

In [None]:
# Sanity check: none of the columns listed in drop_columns may remain in train.
contains_dropped_columns = any(column in drop_columns for column in train.columns)
assert not contains_dropped_columns, 'Dataframe contains dropped columns'

In [None]:
# Down-sample the training frame to data_sample_size rows.
# - n is capped at the number of available rows, because sampling without
#   replacement raises ValueError when n exceeds the population.
# - random_state is fixed (same seed as the train/validation split) so the
#   subset, and every result derived from it, is reproducible across runs.
train = train.sample(n = min(data_sample_size, len(train)), random_state = 2023)

In [None]:
# Hold out 20% of the sampled comments for validation. The split is stratified
# on the `toxic` label and seeded so it is repeatable.
comment_texts = train.comment_text.values
toxic_labels = train.toxic.values

xtrain, xvalid, ytrain, yvalid = model_selection.train_test_split(
  comment_texts,
  toxic_labels,
  test_size = 0.2,
  shuffle = True,
  stratify = toxic_labels,
  random_state = 2023
)

In [None]:
# Longest comment, measured in whitespace-separated tokens.
# The vectorizer below splits on any whitespace, so the count must do the same:
# splitting on a single space (' ') treats "a\nb" as one token and would let
# long multi-line comments be truncated. Punctuation stripping can only remove
# tokens, so this count is an upper bound on the vectorized length.
# Stored as a plain int so it can be passed straight to Keras layers.
comment_max_length = int(train.comment_text.str.split().str.len().max())

# Maps each raw comment to a fixed-length sequence of integer token ids,
# padded/truncated to comment_max_length. The vocabulary is learned later via adapt().
text_vectorizer = tf.keras.layers.TextVectorization(
  max_tokens = None,
  standardize = 'lower_and_strip_punctuation',
  split = 'whitespace',
  ngrams = None,
  output_mode = 'int',  # todo: compare with tf-idf
  output_sequence_length = comment_max_length,
  pad_to_max_tokens = False,
  vocabulary = None,
  idf_weights = None,
  sparse = False,
  ragged = False
)

In [None]:
# Learn the vocabulary from the training and validation comments together.
adapt_corpus = numpy.concatenate((xtrain, xvalid))
text_vectorizer.adapt(adapt_corpus)

# Vocabulary size as reported by the vectorizer; used to size the embedding table.
learned_vocabulary = text_vectorizer.get_vocabulary()
vocab_size = len(learned_vocabulary)

In [None]:
# Build and compile a simple RNN binary classifier inside the strategy scope
# so its variables are created under the selected distribution strategy.
with strategy.scope():
  model = tf.keras.Sequential()
  # Token ids -> 300-dimensional embeddings.
  model.add(tf.keras.layers.Embedding(vocab_size, 300, input_length = int(comment_max_length)))
  model.add(tf.keras.layers.SimpleRNN(100))
  # Single sigmoid unit: probability that the comment is toxic.
  model.add(tf.keras.layers.Dense(1, activation = 'sigmoid'))
  # tf.keras.metrics.Accuracy() checks exact equality between predictions and
  # labels, which is meaningless for sigmoid probabilities. BinaryAccuracy
  # thresholds the probability at 0.5 first. The metric keeps the name
  # 'accuracy' so logs and history keys are unchanged.
  model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = [tf.keras.metrics.BinaryAccuracy(name = 'accuracy')]
  )
  model.summary()