In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd
import re

# Define the folders
folders = ['Dissertations', 'Master', 'Undergraduate']

# Initialize a counter for rows with 'IPB (Bogor Agricultural University)' in the 'title' column
ipb_count_total = 0

# Define the regex pattern for the 'title' column
title_pattern = r'^dc\.title'

# Function to extract columns based on regex pattern and select the first non-null value
def extract_columns(df, pattern):
    matching_columns = [col for col in df.columns if re.match(pattern, col)]
    if matching_columns:
        # Select the first non-null value across the matching columns
        return df[matching_columns].bfill(axis=1).iloc[:, 0]
    else:
        return pd.Series([None] * len(df))

# Loop through each folder
for folder in folders:
    # Loop through each CSV file in the folder
    for filename in os.listdir(f"/content/drive/MyDrive/Colab Notebooks/dataset_magang/{folder}/csv/"):
        if filename.endswith('.csv'):
            # Construct the file path
            file_path = os.path.join(f"/content/drive/MyDrive/Colab Notebooks/dataset_magang/{folder}/csv/", filename)

            # Read the CSV file into a DataFrame
            df = pd.read_csv(file_path)

            # Extract the 'title' column using the regex pattern
            title_column = extract_columns(df, title_pattern)

            # Filter the DataFrame for rows where 'title' is 'IPB (Bogor Agricultural University)'
            filtered_df = df[title_column == 'IPB (Bogor Agricultural University)']

            # Print the filtered DataFrame
            if not filtered_df.empty:
                print(f"Filtered rows from file: {filename}")
                print(filtered_df)

            # Count rows where 'title' contains 'IPB (Bogor Agricultural University)' within this file
            ipb_count = filtered_df.shape[0]
            ipb_count_total += ipb_count

# Print the total count of 'IPB (Bogor Agricultural University)' rows across all files
print(f"Total number of rows with 'IPB (Bogor Agricultural University)' in the 'title' column: {ipb_count_total}")


Filtered rows from file: DT - Economic and Management.csv
                                       id    collection dc.contributor.author  \
196  68ba90bc-a8c3-4129-8b27-a4989ffd1942  123456789/85                   NaN   
345  bfba38ff-b80b-44f9-bb36-53331e5084c6  123456789/85                   NaN   
382  d577713f-2f63-41b8-9e97-73c087db2041  123456789/85                   NaN   
414  e66ec644-5eb8-4cdf-8a54-7cbe01541007  123456789/85                   NaN   

       dc.contributor.author[] dc.date.updated[] dc.description.abstract  \
196      Nasution, H. Muslimin               NaN                     NaN   
345  Simanjuntak, Sahat Barita               NaN                     NaN   
382                 Tatuh, Jen               NaN                     NaN   
414               Saleh, Deddy               NaN                     NaN   

                             dc.description.abstract[]  \
196  Tujuan penelitian ialah untuk mengidentifikasi...   
345  The objectives of the study is to 

In [3]:
import os
import pandas as pd
import re

# Define the folders
folders = ['Dissertations', 'Master', 'Undergraduate']

# Initialize an empty DataFrame to hold the combined data
combined_df = pd.DataFrame()

# Define the regex patterns for column prefixes
patterns = {
    'title': r'^dc\.title',
    'abstract': r'^dc\.description\.abstract',
    'keyword': r'^dc\.subject\.keyword',
    'author': r'^dc\.contributor\.author',
    'type': r'^dc\.type',
    'url': r'^dc\.identifier\.uri',
}

# Function to extract columns based on regex pattern and select the first non-null value
def extract_columns(df, pattern):
    matching_columns = [col for col in df.columns if re.match(pattern, col)]
    if matching_columns:
        # Select the first non-null value across the matching columns
        return df[matching_columns].bfill(axis=1).iloc[:, 0]
    else:
        return pd.Series([None] * len(df))

# Function to truncate text to a specified number of words
def truncate_text(text, max_words=1000):
    if pd.isna(text):
        return text
    words = text.split()
    if len(words) > max_words:
        return ' '.join(words[:max_words]) + '...'
    return text

# Function to check the maximum word length in a column
def check_max_word_length(df, column_name):
    max_length = df[column_name].apply(lambda x: len(str(x).split()) if pd.notna(x) else 0).max()
    return max_length

# Loop through each folder
for folder in folders:
    # Loop through each CSV file in the folder
    for filename in os.listdir(f"/content/drive/MyDrive/Colab Notebooks/dataset_magang/{folder}/csv/"):
        if filename.endswith('.csv'):
            # Construct the file path
            file_path = os.path.join(f"/content/drive/MyDrive/Colab Notebooks/dataset_magang/{folder}/csv/", filename)

            # Read the CSV file into a DataFrame
            df = pd.read_csv(file_path)

            # Extract department name from the filename (assuming the filename contains the department)
            department_name = os.path.splitext(filename)[0]  # This will give you the filename without the extension

            # Create a simplified DataFrame using regex for column matching
            simplified_df = pd.DataFrame()

            # Apply regex-based column extraction for each key in patterns
            for key, pattern in patterns.items():
                simplified_df[key] = extract_columns(df, pattern)

            # Add a 'department' column to the DataFrame
            simplified_df['department'] = department_name

            # Fill 'type' column with folder name if it is NaN
            simplified_df['type'] = simplified_df['type'].fillna(folder)

            # Append the simplified DataFrame to the combined DataFrame
            combined_df = pd.concat([combined_df, simplified_df], ignore_index=True)

# Drop rows with all NaN values in the combined DataFrame
combined_df_clean = combined_df.dropna(how='all')

# Drop rows where the 'abstract' column is null
combined_df_clean = combined_df_clean.dropna(subset=['abstract'])

# Replace '||' with ', ' in specified columns
columns_to_replace = ['abstract', 'author', 'keyword', 'title', 'type']
for column in columns_to_replace:
    combined_df_clean[column] = combined_df_clean[column].str.replace('||', ', ', regex=False)

# Check max word length in 'abstract' column before truncation
max_length_before = check_max_word_length(combined_df_clean, 'abstract')
print(f"Maximum word length in 'abstract' before truncation: {max_length_before}")

# Truncate abstracts to 1000 words
combined_df_clean['abstract'] = combined_df_clean['abstract'].apply(truncate_text)

# Check max word length in 'abstract' column after truncation
max_length_after = check_max_word_length(combined_df_clean, 'abstract')
print(f"Maximum word length in 'abstract' after truncation: {max_length_after}")

# Save the combined and simplified DataFrame to a new CSV file
combined_df_clean.to_csv('df.csv', index=False)

print("Combined and simplified CSV created successfully with truncated abstracts.")

Maximum word length in 'abstract' before truncation: 4029
Maximum word length in 'abstract' after truncation: 1000
Combined and simplified CSV created successfully with truncated abstracts.


In [4]:
combined_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 115145 entries, 0 to 122976
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   title       115145 non-null  object
 1   abstract    115145 non-null  object
 2   keyword     65311 non-null   object
 3   author      115071 non-null  object
 4   type        115145 non-null  object
 5   url         115144 non-null  object
 6   department  115145 non-null  object
dtypes: object(7)
memory usage: 7.0+ MB


In [5]:
combined_df_clean.head()

Unnamed: 0,title,abstract,keyword,author,type,url,department
0,"Comparing Visual Attention, AI-Reface, and Hum...",Understanding the differential impact of human...,"Intelligence, Brand, Endorser, Neuromarketing,...","Akbari, Deni Adha",Disertasi,http://repository.ipb.ac.id/handle/123456789/1...,DT - Magister Business
1,Rancang Bangun Model Pelayanan Perizinan Inves...,Investasi swasta merupakan bagian sumber pembi...,"analytical hierarchy process (AHP), soft syste...","Subowo, Eko",Dissertation,http://repository.ipb.ac.id/handle/123456789/1...,DT - Magister Business
2,Talent Management Strategy Development for Fut...,Kinerja berkelanjutan merupakan suatu kondisi ...,"Kinerja pegawai, Manajemen Talenta, employee p...","Hartono, Imam",Disertasi,http://repository.ipb.ac.id/handle/123456789/1...,DT - Magister Business
3,Development of Artificial Neural Network Model...,Berdasarkan Undang-Undang Nomor 10 tahun 1998 ...,"faktor internal, Jaringan Saraf Tiruan, intern...","Solihati, Garin Pratiwi",Disertasi,http://repository.ipb.ac.id/handle/123456789/1...,DT - Magister Business
4,"Kinerja Daya Saing, Efisiensi, Profitabilitas,...","RIRIS SHANTI. Kinerja Daya Saing, Efisiensi, P...","Bank Digital, daya saing, Efisiensi, profitabi...","Shanti, Riris",Disertasi,http://repository.ipb.ac.id/handle/123456789/1...,DT - Magister Business


In [6]:
df = pd.read_csv('/content/df.csv')

df['combined_text'] = df['title'] + ' ' + df['abstract']



print(len(df['combined_text'][74].split()))

382


In [7]:
!pip install -U sympy

Collecting sympy
  Downloading sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Downloading sympy-1.13.3-py3-none-any.whl (6.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sympy
  Attempting uninstall: sympy
    Found existing installation: sympy 1.13.1
    Uninstalling sympy-1.13.1:
      Successfully uninstalled sympy-1.13.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.5.1+cu121 requires sympy==1.13.1; python_version >= "3.9", but you have sympy 1.13.3 which is incompatible.[0m[31m
[0mSuccessfully installed sympy-1.13.3


In [8]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

# Load pre-trained model for embeddings
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Example model
model = SentenceTransformer(model_name)

# Load your dataframe
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/df.csv')

# Create 'combined_text' column here, before accessing it
df['combined_text'] = df['title'] + ' ' + df['abstract']

# Assuming the text you want to embed is in the 'combined_text' column
# You previously created 'combined_text' from 'title' and 'abstract'
texts = df['combined_text'].tolist()  # Changed 'text' to 'combined_text'

# Generate embeddings for each text entry
embeddings = model.encode(texts, show_progress_bar=True)

# Add embeddings to your dataframe (as strings, for later storage)
df['embedding'] = [str(embedding) for embedding in embeddings]

# Save the dataframe with embeddings
df.to_csv('df_with_embeddings.csv', index=False)

print(f"Generated embeddings and saved to 'df_with_embeddings.csv'")

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3599 [00:00<?, ?it/s]

Generated embeddings and saved to 'df_with_embeddings.csv'


In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Load the dataframe with embeddings
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/df_with_embeddings.csv')

# Convert embeddings back from string to numpy arrays
df['embedding'] = df['embedding'].apply(lambda x: np.fromstring(x.strip('[]'), sep=','))

# Parameters
top_n_positives = 2  # Number of positive samples per anchor
top_n_negatives = 2  # Number of negative samples per anchor

# Prepare the embeddings and texts
all_embeddings = np.vstack(df['embedding'].values)  # Stack all embeddings vertically
# Instead of 'text', use 'combined_text' which contains the combined title and abstract
all_texts = df['combined_text'].tolist()  # Using the 'combined_text' column

# Prepare to store triplets
triplets = []

# Loop through each anchor and create triplets
for i in range(len(all_embeddings)):
    # Calculate cosine similarity between the i-th embedding and all other embeddings
    sim_scores = cosine_similarity([all_embeddings[i]], all_embeddings)[0]

    # Get indices of the top N most similar (positive) and least similar (negative) rows
    positive_indices = np.argsort(sim_scores)[::-1][1:top_n_positives + 1]  # Top similar, excluding self
    negative_indices = np.argsort(sim_scores)[:top_n_negatives]  # Least similar

    anchor = all_texts[i]
    positives = [all_texts[idx] for idx in positive_indices]
    negatives = [all_texts[idx] for idx in negative_indices]

    # Create triplets (anchor, positive, negative)
    for pos in positives:
        for neg in negatives:
            triplets.append((anchor, pos, neg))

# Save triplets to a new CSV file
triplets_df = pd.DataFrame(triplets, columns=['anchor', 'positive', 'negative'])
triplets_df.to_csv('triplets.csv', index=False)

print(f"Generated {len(triplets)} triplets and saved to 'triplets.csv'")

  df['embedding'] = df['embedding'].apply(lambda x: np.fromstring(x.strip('[]'), sep=','))


Generated 460580 triplets and saved to 'triplets.csv'


In [10]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.losses import MultipleNegativesRankingLoss
from datasets import Dataset

# Load the pre-trained model
model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"  # Smaller model to reduce memory usage
model = SentenceTransformer(model_name)

import pandas as pd  # Import pandas

# Load the triplet dataset
triplets_df = pd.read_csv('triplets.csv')


# Convert triplet data to the format needed for training
train_data = []
for _, row in triplets_df.iterrows():
    anchor = row['anchor']
    positive = row['positive']
    negative = row['negative']
    train_data.append({'anchor': anchor, 'positive': positive, 'negative': negative})

# Convert to Dataset object
train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))

# Loss function
loss = MultipleNegativesRankingLoss(model)

# Fine-tuning arguments with memory optimizations
training_args = SentenceTransformerTrainingArguments(
    output_dir="./finetuned_model",
    num_train_epochs=1,
    per_device_train_batch_size=4,  # Reduce this number
    save_steps=2000,
    logging_steps=500,
    gradient_accumulation_steps=4,  # Optional: accumulate gradients to compensate for smaller batch size
)


# Trainer
trainer = SentenceTransformerTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    loss=loss,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('./finetuned_model')

print("Fine-tuning complete and model saved.")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# /content/triplets.csv

In [3]:
from google.colab import files

files.download('triplets.csv')

FileNotFoundError: Cannot find file: triplets.csv

In [1]:
!ls -l triplets.csv

-rw-r--r-- 1 root root 3596931421 Dec 12 07:28 triplets.csv
