In [1]:
import pandas as pd
import tensorflow as tf
import os

try:
    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

except:
    !pip install transformers
    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification


# Clone GH for access to data
!git clone https://github.com/pvankatwyk/fake-reviews-classification

# Move model.py to the root directory so it can be imported with
# "from model import RobertaModel".
_model_src = "/content/fake-reviews-classification/our code/model.py"
_model_dst = "/content/model.py"
if os.path.isfile(_model_src):
    os.rename(_model_src, _model_dst)
elif not os.path.isfile(_model_dst):
    # The file is at neither location (e.g. the clone failed). Stay best-effort
    # as before, but report it instead of silently ignoring the problem.
    print(f"Warning: model.py not found at {_model_src!r} or {_model_dst!r}")

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.6 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 40.6 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 55.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 5.9 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for 

In [2]:
# Lookup table from the dataset's label codes to integer class ids.
encoded_label_dict = dict(CG=0, OR=1)


def encode_label(x):
    """Return the integer class id for label ``x``, or -1 if ``x`` is not a known label."""
    if x in encoded_label_dict:
        return encoded_label_dict[x]
    return -1

# Load the reviews and derive an integer ``target`` column from the label codes.
df = pd.read_csv("/content/fake-reviews-classification/data/fake_reviews_dataset.csv")
df["target"] = df["label"].apply(encode_label)

# encode_label returns -1 for any label it does not recognise. -1 is not a valid
# class id for sparse categorical cross-entropy, so fail here with a clear
# message rather than letting training break later.
_unknown_labels = df.loc[df["target"] == -1, "label"]
if len(_unknown_labels) > 0:
    raise ValueError(
        f"Unrecognised labels in dataset: {sorted(set(map(str, _unknown_labels)))}"
    )

In [3]:
# --- Run configuration -------------------------------------------------------
# Hugging Face checkpoint identifier.
model_name = 'roberta-base'
# Token-length cap (presumably intended for tokenization -- TODO confirm usage).
MAX_LEN = 256
# Examples per batch for the training and validation pipelines.
TRAIN_BATCH_SIZE, VALID_BATCH_SIZE = 8, 8
# Number of passes over the training data.
EPOCHS = 1
# Optimizer step size.
LEARNING_RATE = 1e-5

In [4]:
# Materialise the review texts and their integer targets as plain Python lists.
texts = [*df["text_"]]
labels = [*df["target"]]

In [5]:
# Tokenizer for the checkpoint selected in the configuration cell.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize every review up front. Passing max_length=MAX_LEN makes truncation
# honour the configured cap; without it, truncation falls back to the model's
# own maximum length and the MAX_LEN setting has no effect.
inputs = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=MAX_LEN,
    return_tensors="tf",
)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [6]:
# Build a tf.data pipeline of (tokenizer outputs, label) pairs.
dataset = tf.data.Dataset.from_tensor_slices((dict(inputs), labels))

# Shuffle once, with a fixed seed, before splitting: the row order of the CSV is
# not guaranteed to be random, so taking the first 10% as-is could give a
# validation set that is not representative. reshuffle_each_iteration=False
# keeps the order stable across iterations, so the take/skip split below stays
# disjoint.
dataset = dataset.shuffle(len(dataset), seed=42, reshuffle_each_iteration=False)

# Train/validation split: 10% of the data is held out for validation.
val_data_size = int(0.1 * len(dataset))
# Validation uses its own batch size and keeps the final partial batch so every
# held-out example is evaluated.
val_ds = dataset.take(val_data_size).batch(VALID_BATCH_SIZE)
train_ds = dataset.skip(val_data_size).batch(TRAIN_BATCH_SIZE, drop_remainder=True)

In [7]:
# Number of output classes, derived from the label mapping so the classifier
# head size cannot drift from the encoded targets.
n_categories = len(encoded_label_dict)

In [None]:
# Load the pretrained checkpoint with a classification head sized for our labels.
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=n_categories)

# Checkpoint name and learning rate come from the configuration cell rather than
# being repeated as literals here. The former top-3 accuracy metric was dropped:
# with only two classes, k=3 exceeds the number of classes and the metric
# carries no information.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, clipnorm=1.0),
    # The loss is configured to receive raw logits.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

# Fine-tune and keep the History object for inspection in a later cell.
h = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS, verbose=1)

# Persist the fine-tuned weights to the working directory.
model.save_weights('./saved_weights.h5')

Downloading:   0%|          | 0.00/627M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




In [None]:
# Show the per-epoch metrics recorded during training; a bare History object
# would only display its repr.
pd.DataFrame(h.history)