In [None]:
# Colab-only: mount Google Drive so the dataset directories under /content/drive are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import re

# Sub-folders of dataset_magang to scan; the loop below reads the csv/ directory inside each one.
folders = ['Dissertations', 'Master', 'Undergraduate']

# Running total, across every file, of rows whose title equals 'IPB (Bogor Agricultural University)'.
ipb_count_total = 0

# Prefix pattern: matches every column whose name starts with 'dc.title'.
title_pattern = r'^dc\.title'

# Function to extract columns based on regex pattern and select the first non-null value
def extract_columns(df, pattern):
    """Coalesce every column whose name matches ``pattern`` into a single Series.

    Args:
        df: DataFrame to read from.
        pattern: Regular expression applied with ``re.match`` (i.e. anchored at the
            start) to each column label.

    Returns:
        A Series indexed like ``df``. For each row it holds the first non-null value
        found left-to-right across the matching columns, or null when every matching
        column is null. If no column matches, an all-``None`` object Series is returned.
    """
    # str() lets non-string column labels (e.g. integers) pass through re.match
    # instead of raising TypeError.
    matching_columns = [col for col in df.columns if re.match(pattern, str(col))]
    if not matching_columns:
        # Reuse df.index so the placeholder aligns with df even when the frame
        # does not carry a default RangeIndex.
        return pd.Series([None] * len(df), index=df.index, dtype=object)

    # Back-filling along the row pulls the first non-null value into column 0.
    return df[matching_columns].bfill(axis=1).iloc[:, 0]

# Loop through each folder
for folder in folders:
    # Build the directory path once per folder instead of once per listing and once per file.
    csv_dir = f"/content/drive/MyDrive/Colab Notebooks/dataset_magang/{folder}/csv/"

    # os.listdir returns entries in arbitrary order; sorting keeps the printed
    # report identical from run to run.
    for filename in sorted(os.listdir(csv_dir)):
        if not filename.endswith('.csv'):
            continue

        # Construct the file path and read the CSV file into a DataFrame
        file_path = os.path.join(csv_dir, filename)
        df = pd.read_csv(file_path)

        # Extract the 'title' column using the regex pattern
        title_column = extract_columns(df, title_pattern)

        # Keep rows whose title is exactly 'IPB (Bogor Agricultural University)'
        filtered_df = df[title_column == 'IPB (Bogor Agricultural University)']

        # Print the filtered DataFrame
        if not filtered_df.empty:
            print(f"Filtered rows from file: {filename}")
            print(filtered_df)

        # Count the exact-match rows within this file and add them to the running total
        ipb_count = filtered_df.shape[0]
        ipb_count_total += ipb_count

# Print the total count of 'IPB (Bogor Agricultural University)' rows across all files
print(f"Total number of rows with 'IPB (Bogor Agricultural University)' in the 'title' column: {ipb_count_total}")


Filtered rows from file: DT - Economic and Management.csv
                                       id    collection dc.contributor.author  \
196  68ba90bc-a8c3-4129-8b27-a4989ffd1942  123456789/85                   NaN   
345  bfba38ff-b80b-44f9-bb36-53331e5084c6  123456789/85                   NaN   
382  d577713f-2f63-41b8-9e97-73c087db2041  123456789/85                   NaN   
414  e66ec644-5eb8-4cdf-8a54-7cbe01541007  123456789/85                   NaN   

       dc.contributor.author[] dc.date.updated[] dc.description.abstract  \
196      Nasution, H. Muslimin               NaN                     NaN   
345  Simanjuntak, Sahat Barita               NaN                     NaN   
382                 Tatuh, Jen               NaN                     NaN   
414               Saleh, Deddy               NaN                     NaN   

                             dc.description.abstract[]  \
196  Tujuan penelitian ialah untuk mengidentifikasi...   
345  The objectives of the study is to 

In [None]:
import os
import pandas as pd
import re

# Sub-folders of dataset_magang to read; the loop below uses the csv/ directory inside each one.
# The folder name also serves as the fallback value for a missing 'type'.
folders = ['Dissertations', 'Master', 'Undergraduate']

# Accumulator for the simplified rows of every CSV file (starts empty).
combined_df = pd.DataFrame()

# Output column name -> prefix regex. Every source column whose name starts with the
# prefix is coalesced into that single output column by extract_columns.
patterns = {
    'title': r'^dc\.title',
    'abstract': r'^dc\.description\.abstract',
    'keyword': r'^dc\.subject\.keyword',
    'author': r'^dc\.contributor\.author',
    'type': r'^dc\.type',
    'url': r'^dc\.identifier\.uri',
}

# Function to extract columns based on regex pattern and select the first non-null value
def extract_columns(df, pattern):
    """Coalesce every column whose name matches ``pattern`` into a single Series.

    Args:
        df: DataFrame to read from.
        pattern: Regular expression applied with ``re.match`` (i.e. anchored at the
            start) to each column label.

    Returns:
        A Series indexed like ``df``. For each row it holds the first non-null value
        found left-to-right across the matching columns, or null when every matching
        column is null. If no column matches, an all-``None`` object Series is returned.
    """
    # str() lets non-string column labels (e.g. integers) pass through re.match
    # instead of raising TypeError.
    matching_columns = [col for col in df.columns if re.match(pattern, str(col))]
    if not matching_columns:
        # Reuse df.index so the placeholder aligns with df even when the frame
        # does not carry a default RangeIndex.
        return pd.Series([None] * len(df), index=df.index, dtype=object)

    # Back-filling along the row pulls the first non-null value into column 0.
    return df[matching_columns].bfill(axis=1).iloc[:, 0]

# Function to truncate text to a specified number of words
def truncate_text(text, max_words=1000):
    """Cap ``text`` at ``max_words`` whitespace-separated words.

    Null values are passed through unchanged. Text within the limit is returned
    exactly as given; longer text is rebuilt from its first ``max_words`` words
    joined by single spaces, with '...' appended.
    """
    if not pd.isna(text):
        tokens = text.split()
        if len(tokens) > max_words:
            text = ' '.join(tokens[:max_words]) + '...'
    return text

# Function to check the maximum word length in a column
def check_max_word_length(df, column_name):
    """Return the largest whitespace-separated word count found in ``df[column_name]``.

    Each value is converted with ``str`` before splitting; null values count as 0 words.
    """
    def _word_count(value):
        if pd.notna(value):
            return len(str(value).split())
        return 0

    word_counts = df[column_name].apply(_word_count)
    return word_counts.max()

# Loop through each folder
simplified_frames = []  # one simplified DataFrame per CSV file, concatenated once after the loop
for folder in folders:
    # Build the directory path once per folder instead of once per listing and once per file.
    csv_dir = f"/content/drive/MyDrive/Colab Notebooks/dataset_magang/{folder}/csv/"

    # os.listdir returns entries in arbitrary order; sorting fixes the row order of
    # the combined frame, so positional look-ups on the saved CSV are reproducible.
    for filename in sorted(os.listdir(csv_dir)):
        if not filename.endswith('.csv'):
            continue

        # Construct the file path and read the CSV file into a DataFrame
        file_path = os.path.join(csv_dir, filename)
        df = pd.read_csv(file_path)

        # Extract department name from the filename (assuming the filename contains the department)
        department_name = os.path.splitext(filename)[0]

        # One coalesced output column per entry in patterns
        simplified_df = pd.DataFrame(
            {key: extract_columns(df, pattern) for key, pattern in patterns.items()}
        )

        # Add a 'department' column to the DataFrame
        simplified_df['department'] = department_name

        # Fill 'type' column with folder name if it is NaN
        simplified_df['type'] = simplified_df['type'].fillna(folder)

        simplified_frames.append(simplified_df)

# Concatenate once: calling pd.concat inside the loop re-copies every previously
# accumulated row on each iteration. With no CSV files the result is an empty frame.
combined_df = (
    pd.concat(simplified_frames, ignore_index=True)
    if simplified_frames
    else pd.DataFrame()
)

# Drop rows with all NaN values, then rows where the 'abstract' column is null.
# .copy() makes the result an independent frame, so the column assignments below
# write to it unambiguously and cannot raise pandas' SettingWithCopyWarning.
combined_df_clean = (
    combined_df
    .dropna(how='all')
    .dropna(subset=['abstract'])
    .copy()
)

# Replace '||' with ', ' in specified columns
columns_to_replace = ['abstract', 'author', 'keyword', 'title', 'type']
for column in columns_to_replace:
    combined_df_clean[column] = combined_df_clean[column].str.replace('||', ', ', regex=False)

# Check max word count in 'abstract' column before truncation
max_length_before = check_max_word_length(combined_df_clean, 'abstract')
print(f"Maximum word length in 'abstract' before truncation: {max_length_before}")

# Truncate abstracts to 1000 words (truncate_text's default max_words)
combined_df_clean['abstract'] = combined_df_clean['abstract'].apply(truncate_text)

# Check max word count in 'abstract' column after truncation
max_length_after = check_max_word_length(combined_df_clean, 'abstract')
print(f"Maximum word length in 'abstract' after truncation: {max_length_after}")

# Save the combined and simplified DataFrame to a new CSV file (relative to the working directory)
combined_df_clean.to_csv('df.csv', index=False)

print("Combined and simplified CSV created successfully with truncated abstracts.")

Maximum word length in 'abstract' before truncation: 4029
Maximum word length in 'abstract' after truncation: 1000
Combined and simplified CSV created successfully with truncated abstracts.


In [None]:
# Summarize the cleaned frame: per-column non-null counts and dtypes.
combined_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 115145 entries, 0 to 122976
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   title       115145 non-null  object
 1   abstract    115145 non-null  object
 2   keyword     65311 non-null   object
 3   author      115071 non-null  object
 4   type        115145 non-null  object
 5   url         115144 non-null  object
 6   department  115145 non-null  object
dtypes: object(7)
memory usage: 7.0+ MB


In [None]:
# Preview the first rows of the cleaned frame.
combined_df_clean.head()

Unnamed: 0,title,abstract,keyword,author,type,url,department
0,"Comparing Visual Attention, AI-Reface, and Hum...",Understanding the differential impact of human...,"Intelligence, Brand, Endorser, Neuromarketing,...","Akbari, Deni Adha",Disertasi,http://repository.ipb.ac.id/handle/123456789/1...,DT - Magister Business
1,Rancang Bangun Model Pelayanan Perizinan Inves...,Investasi swasta merupakan bagian sumber pembi...,"analytical hierarchy process (AHP), soft syste...","Subowo, Eko",Dissertation,http://repository.ipb.ac.id/handle/123456789/1...,DT - Magister Business
2,Talent Management Strategy Development for Fut...,Kinerja berkelanjutan merupakan suatu kondisi ...,"Kinerja pegawai, Manajemen Talenta, employee p...","Hartono, Imam",Disertasi,http://repository.ipb.ac.id/handle/123456789/1...,DT - Magister Business
3,Development of Artificial Neural Network Model...,Berdasarkan Undang-Undang Nomor 10 tahun 1998 ...,"faktor internal, Jaringan Saraf Tiruan, intern...","Solihati, Garin Pratiwi",Disertasi,http://repository.ipb.ac.id/handle/123456789/1...,DT - Magister Business
4,"Kinerja Daya Saing, Efisiensi, Profitabilitas,...","RIRIS SHANTI. Kinerja Daya Saing, Efisiensi, P...","Bank Digital, daya saing, Efisiensi, profitabi...","Shanti, Riris",Disertasi,http://repository.ipb.ac.id/handle/123456789/1...,DT - Magister Business


In [None]:
# Reload the file written by the cleaning cell. It was saved under the relative
# path 'df.csv', so read it back the same way; the absolute '/content/df.csv'
# only resolves to the same file when the working directory is /content.
df = pd.read_csv('df.csv')

# Title followed by abstract. Empty strings come back from CSV as NaN and
# NaN + str is NaN, so blank them first to keep every combined_text a string.
df['combined_text'] = df['title'].fillna('') + ' ' + df['abstract'].fillna('')

# Spot check: word count of the row labelled 74 (default RangeIndex from read_csv).
print(len(df['combined_text'].loc[74].split()))

382


In [None]:
pip install tensorflow transformers datasets scikit-learn

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from datasets import load_dataset
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# 1. Load the pretrained encoder to fine-tune (TensorFlow weights via TFAutoModel).
model_name = "bert-base-uncased"  # You can replace this with other models like roberta, distilbert, etc.
model = TFAutoModel.from_pretrained(model_name)

# 2. Load the tokenizer from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 3. Load the triplet dataset and pick its three splits.
#    Note that the validation split of this dataset is named "dev".
dataset = load_dataset("arulpm/triplet_large", "triplet")
train_dataset = dataset["train"]
eval_dataset = dataset["dev"]
test_dataset = dataset["test"]

# 4. Tokenize the text data (we'll tokenize anchor, positive, and negative examples)
def tokenize_function(examples):
    """Tokenize the anchor, positive and negative texts of ``examples``.

    Each text column is run through the module-level ``tokenizer`` with padding and
    truncation to a fixed length of 128 tokens. The anchor's ids/mask are stored under
    the un-prefixed keys; the positive and negative ones under prefixed keys.

    Returns:
        dict with keys 'input_ids', 'attention_mask', 'positive_input_ids',
        'positive_attention_mask', 'negative_input_ids', 'negative_attention_mask'.
    """
    # (source text column, output ids column, output mask column)
    column_plan = (
        ('anchor', 'input_ids', 'attention_mask'),
        ('positive', 'positive_input_ids', 'positive_attention_mask'),
        ('negative', 'negative_input_ids', 'negative_attention_mask'),
    )

    tokenized = {}
    for text_column, ids_column, mask_column in column_plan:
        encoding = tokenizer(examples[text_column], padding="max_length", truncation=True, max_length=128)
        tokenized[ids_column] = encoding['input_ids']
        tokenized[mask_column] = encoding['attention_mask']
    return tokenized

# Add the tokenized columns to every split. batched=True passes tokenize_function
# a batch of rows per call rather than a single row.
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# 5. Create TensorFlow Dataset from Hugging Face Dataset
def create_tf_dataset(dataset, batch_size=8):
    """Expose the tokenized triplet columns of ``dataset`` as a batched tf.data.Dataset.

    Batches are streamed from the Hugging Face dataset through its ``to_tf_dataset``
    method. Indexing whole columns (``dataset['input_ids']`` ...) would first
    materialise every token id of every row as Python lists, which does not fit in
    memory for a split with hundreds of thousands of rows.

    Args:
        dataset: Hugging Face dataset that already contains the six tokenized columns.
        batch_size: Number of triplets per batch.

    Returns:
        A tf.data.Dataset that yields dicts keyed by the six column names. Rows keep
        their original order and the final partial batch is kept.
    """
    triplet_columns = [
        'input_ids',
        'attention_mask',
        'positive_input_ids',
        'positive_attention_mask',
        'negative_input_ids',
        'negative_attention_mask',
    ]

    # shuffle/drop_remainder are spelled out so ordering and batch count match a
    # plain ``.batch(batch_size)`` regardless of library defaults.
    return dataset.to_tf_dataset(
        columns=triplet_columns,
        batch_size=batch_size,
        shuffle=False,
        drop_remainder=False,
    )

# Wrap each tokenized split as a batched tf.data.Dataset (create_tf_dataset's default batch_size).
train_tf_dataset = create_tf_dataset(train_dataset)
eval_tf_dataset = create_tf_dataset(eval_dataset)
test_tf_dataset = create_tf_dataset(test_dataset)

# 6. Define a custom loss function (Multiple Negatives Ranking Loss)
def multiple_negatives_ranking_loss(anchor_embeddings, positive_embeddings, negative_embeddings):
    """Max-margin triplet loss on cosine similarity.

    For every triplet the loss is ``max(0, 1 - cos(anchor, positive) + cos(anchor, negative))``
    and the result is averaged over the batch. Despite the function name, only the
    explicit negative of each triplet is used; other in-batch examples are not
    treated as negatives.

    Args:
        anchor_embeddings, positive_embeddings, negative_embeddings: tensors whose
            last axis is the embedding dimension and whose shapes match.

    Returns:
        Scalar tensor with the mean loss.
    """
    # L2-normalise so the dot products below are true cosine similarities in [-1, 1].
    # On raw embeddings the dot product is unbounded, which makes the fixed margin
    # of 1.0 meaningless and lets vector norms dominate the loss.
    anchor = tf.math.l2_normalize(anchor_embeddings, axis=-1)
    positive = tf.math.l2_normalize(positive_embeddings, axis=-1)
    negative = tf.math.l2_normalize(negative_embeddings, axis=-1)

    positive_similarity = tf.reduce_sum(tf.multiply(anchor, positive), axis=-1)
    negative_similarity = tf.reduce_sum(tf.multiply(anchor, negative), axis=-1)

    # Ranking loss (max margin): the positive must beat the negative by at least 1.0.
    loss = tf.reduce_mean(tf.maximum(0.0, 1.0 - positive_similarity + negative_similarity))
    return loss

# 7. Define a model class to compute sentence embeddings
class SentenceTransformerModel(tf.keras.Model):
    """Keras model that returns the wrapped encoder's pooled output as the sentence embedding."""

    def __init__(self, model):
        """Keep a reference to the encoder.

        Args:
            model: encoder that is called with ``input_ids`` and ``attention_mask``
                and whose result exposes ``pooler_output``.
        """
        super().__init__()
        self.bert = model

    def call(self, input_ids, attention_mask):
        """Run the encoder and return its ``pooler_output``."""
        encoder_outputs = self.bert(input_ids, attention_mask=attention_mask)
        # The pooled output serves as the sentence embedding.
        return encoder_outputs.pooler_output

# Wrap the pretrained encoder so that calling the model returns sentence embeddings.
sentence_model = SentenceTransformerModel(model)

# 8. Define optimizer (Adam with learning rate 1e-5), used by train_step.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

# 9. Define a custom training loop
def train_step(batch):
    """Run one optimisation step on a batch of tokenized triplets and return its loss.

    ``batch`` is indexed by the six tokenized column names. The anchor, positive and
    negative texts are embedded with ``sentence_model``, the triplet loss is computed
    under a gradient tape, and ``optimizer`` applies the resulting gradients to the
    model's trainable variables.
    """
    # (ids key, mask key) for anchor, positive, negative — in that order.
    triplet_keys = (
        ('input_ids', 'attention_mask'),
        ('positive_input_ids', 'positive_attention_mask'),
        ('negative_input_ids', 'negative_attention_mask'),
    )

    with tf.GradientTape() as grad_tape:
        triplet_embeddings = [
            sentence_model(batch[ids_key], batch[mask_key])
            for ids_key, mask_key in triplet_keys
        ]

        # Compute the multiple negatives ranking loss
        step_loss = multiple_negatives_ranking_loss(*triplet_embeddings)

    # Read the variable list after the forward pass, as the tape has now seen them.
    weights = sentence_model.trainable_variables
    grads = grad_tape.gradient(step_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))
    return step_loss

# 10. Train the model
epochs = 3  # Set the number of epochs for training
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    # Accumulate as a Python float so the printed average is a plain number
    # instead of a "tf.Tensor(..., shape=(), dtype=float32)" repr.
    total_loss = 0.0

    for batch in train_tf_dataset:
        loss = train_step(batch)
        total_loss += float(loss)

    print(f"Training loss: {total_loss / len(train_tf_dataset)}")

    # Validation phase: same forward pass and loss as training, with no gradient update
    val_loss = 0.0
    for batch in eval_tf_dataset:
        anchor_embeddings = sentence_model(batch['input_ids'], batch['attention_mask'])
        positive_embeddings = sentence_model(batch['positive_input_ids'], batch['positive_attention_mask'])
        negative_embeddings = sentence_model(batch['negative_input_ids'], batch['negative_attention_mask'])

        loss = multiple_negatives_ranking_loss(anchor_embeddings, positive_embeddings, negative_embeddings)
        val_loss += float(loss)

    print(f"Validation loss: {val_loss / len(eval_tf_dataset)}")

# 11. Save the trained model
# NOTE(review): saving a subclassed Keras model to an extension-less path relies on
# the SavedModel format — confirm this works with the installed Keras version.
sentence_model.save("tensorflow_sentence_model")

# 12. Perform inference (obtain sentence embeddings and calculate similarity)
def get_embeddings(texts, batch_size=32):
    """Embed ``texts`` with the fine-tuned sentence model.

    Texts are tokenized and encoded ``batch_size`` at a time, so large inputs (for
    example a whole dataset column) are not pushed through the encoder in a single
    forward pass.

    Args:
        texts: A single string or a non-empty sequence of strings.
        batch_size: Maximum number of texts per forward pass.

    Returns:
        Tensor with one embedding row per input text, in input order.
    """
    # A bare string is one text, not a sequence of characters.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    embedding_batches = []
    for start in range(0, len(texts), batch_size):
        # padding=True pads only to the longest text of this chunk; the attention
        # mask tells the encoder which positions are padding.
        encoded = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            return_tensors="tf",
            max_length=128,
        )
        embedding_batches.append(sentence_model(encoded['input_ids'], encoded['attention_mask']))

    return tf.concat(embedding_batches, axis=0)

# Example inference: embed one anchor, one similar and one unrelated sentence,
# then compare the anchor against the other two.
anchor_texts = ["This is an anchor sentence."]
positive_texts = ["This is a similar sentence."]
negative_texts = ["This is a completely different sentence."]

anchor_embeddings = get_embeddings(anchor_texts)
positive_embeddings = get_embeddings(positive_texts)
negative_embeddings = get_embeddings(negative_texts)

# Calculate cosine similarity between the anchor and positive/negative embeddings
# (scikit-learn returns a 2-D array: one row per anchor, one column per compared text)
positive_similarity = cosine_similarity(anchor_embeddings, positive_embeddings)
negative_similarity = cosine_similarity(anchor_embeddings, negative_embeddings)

print("Positive similarity:", positive_similarity)
print("Negative similarity:", negative_similarity)

# 13. Test the model: embed every anchor text of the test split and print the result.
test_embeddings = get_embeddings(test_dataset["anchor"])
print(test_embeddings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/215 [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/2.21G [00:00<?, ?B/s]

dev.csv:   0%|          | 0.00/196M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/49.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/414518 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/36846 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9212 [00:00<?, ? examples/s]

Map:   0%|          | 0/414518 [00:00<?, ? examples/s]

Map:   0%|          | 0/36846 [00:00<?, ? examples/s]

Map:   0%|          | 0/9212 [00:00<?, ? examples/s]

Epoch 1/3


