In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

In [None]:
# Mount Google Drive at /content/drive; later cells save and load the model
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# --- Tokenizer settings ---
# Passed to Tokenizer(num_words=...) and used as the Embedding input_dim.
vocab_size = 1000
# Placeholder token handed to Tokenizer(oov_token=...).
oov_tok = "<OOV>"

# --- Sequence shaping (all forwarded to pad_sequences) ---
# Fixed sequence length; also the Embedding input_length.
max_length = 120
padding_type = "post"
trunc_type = "post"

# --- Model ---
# Output dimension of the Embedding layer.
embedding_dim = 16

In [None]:
# Load the question/university dataset.
data = pd.read_csv('/content/university_classification.csv')

# Remove incomplete rows from the frame itself so questions and labels stay
# aligned. Dropping NaN from the label series alone leaves `sentences` and
# `university` with different lengths, which makes train_test_split raise
# "inconsistent numbers of samples".
data_clean = data.dropna(subset=['pertanyaan', 'universitas'])

sentences = data_clean['pertanyaan']
university = data_clean['universitas']

# Hold out 10% of the rows, stratified by university so every class appears in
# both splits. A fixed random_state makes the split reproducible across
# Restart & Run All.
training_sentences, testing_sentences, train_university, test_university = train_test_split(
    sentences,
    university,
    test_size=0.1,
    stratify=university,
    random_state=42,
)

In [None]:
# Build the vocabulary from the training questions only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)


def _encode_and_pad(texts):
    """Convert texts to integer sequences and pad/truncate them to max_length.

    Returns a (sequences, padded) pair: the raw output of
    tokenizer.texts_to_sequences and the result of pad_sequences on it.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    return sequences, padded


train_sequences, train_padded = _encode_and_pad(training_sentences)
validation_sequences, valid_padded = _encode_and_pad(testing_sentences)

In [None]:
# Fit the encoder on every label so both splits share one class-to-index mapping.
label_encoder = LabelEncoder().fit(university)
num_classes = len(label_encoder.classes_)

# Integer-encode each split's labels.
train_label_encoded, test_label_encoded = (
    label_encoder.transform(labels)
    for labels in (train_university, test_university)
)

# One-hot encode the integer labels for use with categorical cross-entropy.
train_label_final, test_label_final = (
    to_categorical(encoded, num_classes=num_classes)
    for encoded in (train_label_encoded, test_label_encoded)
)

In [None]:
# Disabled experiment: load the BERT multi-cased encoder and preprocessing
# models referenced by the Kaggle URLs below. Nothing in this cell executes.
# NOTE(review): `hub` is not imported in the visible notebook (presumably
# tensorflow_hub); an import is needed before re-enabling.
# NOTE(review): the encoder line passes the literal string 'encoder_url'
# rather than the `encoder_url` variable; fix before re-enabling.
# encoder_url = 'https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/multi-cased-l-12-h-768-a-12/versions/4'
# preprocess_url = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/multi-cased-preprocess/versions/3"
# embedding_model1 = hub.KerasLayer('encoder_url', trainable=True)
# preprocess_model1 = hub.KerasLayer(preprocess_url)

In [None]:
class MyCallback(tf.keras.callbacks.Callback):
  """Stop training once validation loss and accuracy both reach their targets.

  Args:
    max_val_loss: training stops only when `val_loss` is strictly below this.
    min_val_accuracy: training stops only when `val_accuracy` is at least this.

  The defaults (0.60 and 0.85) reproduce the previously hard-coded thresholds,
  so `MyCallback()` behaves as before whenever validation metrics are present.
  """

  def __init__(self, max_val_loss=0.60, min_val_accuracy=0.85):
    super().__init__()
    self.max_val_loss = max_val_loss
    self.min_val_accuracy = min_val_accuracy

  def on_epoch_end(self, epoch, logs=None):
    """Set `self.model.stop_training` when both validation targets are met."""
    # `logs` defaults to None, and the validation keys are missing when fit()
    # runs without validation data; comparing None to a float raises TypeError.
    logs = logs or {}
    val_loss = logs.get('val_loss')
    val_accuracy = logs.get('val_accuracy')
    if val_loss is None or val_accuracy is None:
      return
    if val_loss < self.max_val_loss and val_accuracy >= self.min_val_accuracy:
      self.model.stop_training = True

In [None]:
# Text classifier: embedding -> two stacked bidirectional LSTMs -> dense head.
# The first LSTM returns the full sequence so the second LSTM can consume it.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(256, activation='relu'),
    # One output unit per encoded label. The size comes from num_classes (the
    # width of the one-hot targets) instead of a hard-coded 20, so the model
    # keeps matching the labels if the dataset's set of universities changes.
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

In [None]:
# Categorical cross-entropy pairs with the one-hot targets; Adam runs at 1e-3.
adam = tf.keras.optimizers.Adam(learning_rate=0.001)
categorical_loss = tf.keras.losses.CategoricalCrossentropy()

model.compile(optimizer=adam, loss=categorical_loss, metrics=['accuracy'])

In [None]:
# Train for up to 100 epochs; MyCallback may stop earlier once its validation
# targets are met. The returned History is kept in `history` so per-epoch
# metrics (history.history) stay available for inspection or plotting instead
# of being discarded as an object repr.
# NOTE(review): the same held-out split is used here for early stopping and
# later for the classification report, so the reported metrics are likely
# optimistic; consider a separate validation split.
history = model.fit(
    train_padded,
    train_label_final,
    validation_data=(valid_padded, test_label_final),
    epochs=100,
    batch_size=16,
    callbacks=[MyCallback()]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7ea3a2c78d00>

In [None]:
# Evaluate the model on the held-out split with a per-class classification report.
from sklearn.metrics import classification_report

predictions = model.predict(valid_padded)
# Collapse softmax outputs and one-hot targets back to integer class ids.
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(test_label_final, axis=1)
class_labels = label_encoder.classes_

# Passing `labels` explicitly keeps the report aligned with `target_names` even
# if some class is absent from both the true and predicted ids (otherwise
# classification_report raises on the length mismatch). zero_division=0 reports
# 0.0 for classes that were never predicted without emitting a warning.
print(classification_report(
    true_classes,
    predicted_classes,
    labels=np.arange(len(class_labels)),
    target_names=class_labels,
    zero_division=0,
))

                                     precision    recall  f1-score   support

           Institut Pertanian Bogor       1.00      1.00      1.00         2
         Institut Teknologi Bandung       1.00      1.00      1.00         3
Institut Teknologi Sepuluh Nopember       1.00      0.33      0.50         3
              Universitas Airlangga       0.50      0.67      0.57         3
               Universitas Atmajaya       1.00      0.67      0.80         3
         Universitas Bina Nusantara       1.00      1.00      1.00         3
              Universitas Brawijaya       0.50      1.00      0.67         2
             Universitas Diponegoro       1.00      1.00      1.00         2
            Universitas Gadjah Mada       1.00      1.00      1.00         2
             Universitas Hasanuddin       0.00      0.00      0.00         2
              Universitas Indonesia       1.00      1.00      1.00         3
        Universitas Islam Indonesia       1.00      1.00      1.00         

In [None]:
# Persist the full trained model under the mounted Drive folder.
# NOTE(review): the path has no file extension, so the on-disk format depends
# on the installed Keras version; confirm it is the intended one.
model.save('/content/drive/MyDrive/Colab Notebooks/model_university_classification')

In [None]:
# Reload the saved model from Drive, rebinding `model`.
# NOTE(review): the tokenizer and label encoder are not saved anywhere in the
# visible notebook; the inference cell below relies on the in-memory objects.
model = tf.keras.models.load_model('/content/drive/MyDrive/Colab Notebooks/model_university_classification')

In [None]:
# Text(s) to classify; add more strings to the list to classify several at once.
input_text = ['Kapan Universitas Udayana didirikan?']

# Tokenize and pad the input exactly as the training data was prepared.
input_sequences = tokenizer.texts_to_sequences(input_text)
padded_input = pad_sequences(input_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Run the model; pad_sequences already returns a NumPy array, so no conversion
# is needed.
predictions = model.predict(padded_input)

# Pick the most probable class per input row. argmax without an axis flattens
# the (n_inputs, n_classes) array and yields a wrong index as soon as more than
# one text is supplied.
predicted_class_indices = np.argmax(predictions, axis=1)

# Class names in encoder order, used to map indices back to labels.
classes = label_encoder.classes_
predicted_class_labels = classes[predicted_class_indices]

# Keep the single-input names bound to the first text's result.
predicted_class_index = predicted_class_indices[0]
predicted_class_label = predicted_class_labels[0]

# Show one prediction line per input text.
for label in predicted_class_labels:
    print(f"Predicted class: {label}")

Predicted class: Universitas Udayana


In [None]:
# Additionally store only the model weights in an HDF5 file on Drive.
# NOTE(review): newer Keras releases reportedly require weight files to end in
# `.weights.h5`; verify against the installed version.
model.save_weights('/content/drive/MyDrive/Colab Notebooks/university_classification.h5')