In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
import matplotlib.pyplot as plt

from transformers import BertTokenizer, TFAlbertModel, TFAutoModel
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from imblearn.under_sampling import RandomUnderSampler
from tensorflow.keras.layers import Input, Dense

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = 'indobenchmark/indobert-lite-base-p1'
# model_name = 'bert-base-cased'

# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 2)

tokenizer = BertTokenizer.from_pretrained(model_name)
indobert_lite = TFAutoModel.from_pretrained(model_name)
model = tf.keras.models.load_model('models/model_2023-11-29', custom_objects={"TFAlbertModel": TFAlbertModel})


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
Some layers from the model checkpoint at indobenchmark/indobert-lite-base-p1 were not used when initializing TFAlbertModel: ['sop_classifier', 'predictions']
- This IS expected if you are initializing TFAlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFAlbertModel were initialized from the model checkpoint at indobenchmark/

In [3]:
cwd = os.getcwd()
datapath = os.path.join(cwd, 'datasets')

df = pd.read_excel(os.path.join(datapath, 'all_cleaned.xlsx'))
df = df[['berita', 'label']]

df.drop_duplicates(inplace=True)
df.dropna(inplace = True)

df.label.value_counts()

label
0    20944
1     6474
Name: count, dtype: int64

In [4]:
X = df[['berita']]
y = df['label']
rus = RandomUnderSampler(random_state=1, replacement=True)# fit predictor and target variable
X_new, y_new = rus.fit_resample(X,y)
y_new.value_counts()

label
0    6474
1    6474
Name: count, dtype: int64

In [5]:
df_new = X_new
df_new['label'] = y_new
df_new

Unnamed: 0,berita,label
235,psi sebut banding pecat viani limardi tolak ad...,0
12172,tawar koalisi gerindra pks bilang cari teman l...,0
5192,dpr perintah kpu sepakat honor tugas tps naik ...,0
17290,megawati sebut bicara koalisi capres lihat din...,0
10955,dana milu capai rp triliun jokowi minta detail...,0
...,...,...
27427,raja salman arab saudi bawa orang orang sudah ...,1
27428,hehe selalu senyum lihat tingkah laku pak joko...,1
27429,pak jokowi jadi walikota periode pertama solo ...,1
27430,hari rabu nilai tukar rupiah puruk hingga semp...,1


In [6]:
df_train, df_test = train_test_split(df_new, test_size=0.2, random_state=42,
                                     stratify=df_new['label'])

In [9]:
indobert_lite.summary()

Model: "tf_albert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 albert (TFAlbertMainLayer)  multiple                  11683584  
                                                                 
Total params: 11683584 (44.57 MB)
Trainable params: 11683584 (44.57 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
indobert_lite.layers[0].embedding

AttributeError: 'TFAlbertMainLayer' object has no attribute 'embedding'

In [None]:
import tensorflow_model_optimization as tfmot
import re

from tensorflow_model_optimization.sparsity import keras as sparsity

pruning_schedule = sparsity.PolynomialDecay(
                        initial_sparsity=0.0, final_sparsity=0.5,
                        begin_step=0, end_step=12, frequency=100)

pruned_model = tf.keras.Sequential()

for layer in indobert_lite.layers:
    if(re.match(r"conv_pw_\d+$", layer.name)):
         pruned_model.add(sparsity.prune_low_magnitude(
            layer,
            pruning_schedule,
            block_size=(1,1)
         ))
    else:
        pruned_model.add(layer)

In [7]:
max_len = 70

X_train = tokenizer(
    text=df_train['berita'].tolist(),
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)

X_test = tokenizer(
    text=df_test['berita'].tolist(),
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)

KeyboardInterrupt: 