# **Model 1: BERT (google-research/bert)**

In [None]:
# importing general libraries
import pandas as pd
import numpy as np
import sklearn
import warnings, gc
warnings.filterwarnings("ignore")

# Tensorflow
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Importing data.
# Both JSON files are read from one folder; change DRIVE_DIR if the data lives elsewhere.
# NOTE(review): the path looks like a Colab Google Drive mount - confirm the drive is mounted first.
DRIVE_DIR = "/content/drive/My Drive"

train_sm_df = pd.read_json(f"{DRIVE_DIR}/train_extra.json")
test_df = pd.read_json(f"{DRIVE_DIR}/embold_test.json")

In [None]:
# Reshape the training data into the 4-column layout consumed by
# bert/run_classifier.py with --task_name=cola:
#   column 0: id | column 1: label | column 2: throw-away letter | column 3: text
# Carriage returns, newlines and tabs inside the text are collapsed to a single
# space: each example must occupy exactly one line with exactly four
# tab-separated fields, otherwise the TSV rows get split or shifted.
SEED = 42  # fixed seed so the train/dev split is reproducible across runs

df_bert = pd.DataFrame({
    'id': range(len(train_sm_df)),
    'label': train_sm_df['label'],
    'alpha': ['a'] * train_sm_df.shape[0],
    'text': train_sm_df['text'].replace(r'[\r\n\t]+', ' ', regex=True),
})

# Use positional integer column names (0..3); the TSV is written without a header.
df_bert.columns = range(df_bert.shape[1])

# Splitting training data file into *train* (70%) and *dev* (30%).
df_bert_train, df_bert_dev = train_test_split(
    df_bert, test_size=0.3, random_state=SEED
)

df_bert_train.head()

Unnamed: 0,0,1,2,3
62292,62292,0,a,abort on file changed warning - nvim --versio...
54956,54956,1,a,add test directory for vscraper idea\r \r add ...
68768,68768,0,a,torch.cuda.current_device is always 0 at bac...
9172,9172,0,a,escape orm for sql injections need to rewrite ...
35190,35190,0,a,routeutils getdistancetostep not measuring cor...


In [None]:
# Reshape the test data into a 2-column layout (column 0: id, column 1: text).
# Carriage returns, newlines and tabs inside the text are collapsed to a single
# space so that every example stays on one line with exactly two
# tab-separated fields when exported as TSV.

df_bert_test = pd.DataFrame({
    'id': range(len(test_df)),
    'text': test_df['text'].replace(r'[\r\n\t]+', ' ', regex=True),
})
# Use positional integer column names (0, 1).
df_bert_test.columns = range(df_bert_test.shape[1])
df_bert_test.head()

Unnamed: 0,0,1
0,0,config question path-specific environment var...
1,1,crash indien vol de simulator crasht als hij v...
2,2,"unable to mine rocks sarkasmo starting today, ..."
3,3,not all whitelists are processed create follow...
4,4,add ctx menu for idafree 70 and idafree 5 asso...


In [None]:
# Create the directories that hold BERT's input (data/) and output (bert_output/).
# exist_ok=True keeps the cell idempotent: re-running it does not fail when the
# directories are already there.
import os

for directory in ("data", "bert_output"):
    os.makedirs(directory, exist_ok=True)

In [None]:
# Save the dataframes as tab-separated files, as required by bert/run_classifier.py.
# train.tsv and dev.tsv are written without a header row.
# test.tsv IS written with a header row: the CoLA processor in run_classifier.py
# always skips the first line of the test file, so without a header the first
# test example would be silently dropped and the predictions would be one row short.
# NOTE(review): if any later cell compensates for a missing first prediction,
# remove that workaround.
df_bert_train.to_csv('data/train.tsv', sep='\t', index=False, header=False)
df_bert_dev.to_csv('data/dev.tsv', sep='\t', index=False, header=False)
df_bert_test.to_csv('data/test.tsv', sep='\t', index=False, header=True)

In [None]:
# Downloading the model

!git clone https://github.com/google-research/bert.git

Cloning into 'bert'...
remote: Enumerating objects: 340, done.[K
remote: Total 340 (delta 0), reused 0 (delta 0), pack-reused 340[K
Receiving objects: 100% (340/340), 317.85 KiB | 7.39 MiB/s, done.
Resolving deltas: 100% (185/185), done.


In [None]:
# unpacking the pre-trained model 
!wget https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip
!unzip the file
!unzip cased_L-12_H-768_A-12.zip

--2020-10-17 01:27:32--  https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.213.128, 173.194.214.128, 173.194.216.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.213.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 404261442 (386M) [application/zip]
Saving to: ‘cased_L-12_H-768_A-12.zip’


2020-10-17 01:27:34 (160 MB/s) - ‘cased_L-12_H-768_A-12.zip’ saved [404261442/404261442]

unzip:  cannot find or open the, the.zip or the.ZIP.
Archive:  cased_L-12_H-768_A-12.zip
   creating: cased_L-12_H-768_A-12/
  inflating: cased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: cased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: cased_L-12_H-768_A-12/vocab.txt  
  inflating: cased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: cased_L-12_H-768_A-12/bert_config.json  


In [None]:
# please install this version to make model work fine 
# with latest version it give error for some people

pip install tensorflow==1.15.2

In [None]:
# running the command line commands
# parameters that can be tuned --> 
# max_seq_length , learning_rate , train_batch_size , num_train_epochs

!python bert/run_classifier.py \
--task_name=cola \
--do_train=true \
--do_eval=true \
--do_predict=true \
--data_dir=./data/ \
--vocab_file=./cased_L-12_H-768_A-12/vocab.txt \
--bert_config_file=./cased_L-12_H-768_A-12/bert_config.json \
--init_checkpoint=./cased_L-12_H-768_A-12/bert_model.ckpt \
--max_seq_length=100 \
--train_batch_size=8 \
--learning_rate=1e-3 \
--num_train_epochs=1.0 \
--do_lower_case=False \
--output_dir=./bert_output/ \
--save_checkpoints_steps=9999999 

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
I1017 09:27:11.383001 139782014523264 tpu_estimator.py:2307] global_step/sec: 0.0875963
INFO:tensorflow:examples/sec: 0.700771
I1017 09:27:11.383543 139782014523264 tpu_estimator.py:2308] examples/sec: 0.700771
INFO:tensorflow:global_step/sec: 0.0877112
I1017 09:27:22.784008 139782014523264 tpu_estimator.py:2307] global_step/sec: 0.0877112
INFO:tensorflow:examples/sec: 0.70169
I1017 09:27:22.784525 139782014523264 tpu_estimator.py:2308] examples/sec: 0.70169
INFO:tensorflow:global_step/sec: 0.0877326
I1017 09:27:34.182281 139782014523264 tpu_estimator.py:2307] global_step/sec: 0.0877326
INFO:tensorflow:examples/sec: 0.701861
I1017 09:27:34.182969 139782014523264 tpu_estimator.py:2308] examples/sec: 0.701861
INFO:tensorflow:global_step/sec: 0.0878135
I1017 09:27:45.570097 139782014523264 tpu_estimator.py:2307] global_step/sec: 0.0878135
INFO:tensorflow:examples/sec: 0.702508
I1017 09:27:45.570622 139782014523264 tpu_estima