In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

In [28]:
# Build per-reciter train/validation/test splits of the transliteration
# dataset and write each split to its own pipe-separated file.

# Load the data
path = 'data/transliterasi_id_juz_30_filter.csv'
# Rows are filename|text|translit with no header line.
# quoting=3 is csv.QUOTE_NONE: quote characters are kept as literal text.
data = pd.read_csv(path, sep="|", header=None, quoting=3)
data.columns = ['filename', 'text', 'translit']

# Shuffle the rows reproducibly
data = data.sample(frac=1, random_state=10000).reset_index(drop=True)

# Splitting ratios
train_ratio = 0.79
validation_ratio = 0.10
test_ratio = 0.11

# The train share is implied by the other two below, so make sure the three
# declared ratios actually agree with each other.
if abs(train_ratio + validation_ratio + test_ratio - 1.0) > 1e-9:
    raise ValueError('train_ratio, validation_ratio and test_ratio must sum to 1')

# Loop-invariant fractions handed to train_test_split:
#   1st split: everything that is not training data is held out,
#   2nd split: the held-out part is divided into validation and test.
holdout_ratio = validation_ratio + test_ratio
test_share_of_holdout = test_ratio / holdout_ratio

# Get the reciter: the second '_'-separated token of the file name.
# (Identifiers keep the original "riciter" spelling so that code relying on
# these names/columns keeps working.)
data['riciter'] = data['filename'].str.split('_').str[1]

# Lookup table assigning each distinct reciter an integer id
riciters = pd.DataFrame(data['riciter'].unique(), columns=['riciter'])
riciters['riciter_id'] = riciters.index

# Map reciter ids onto the data
data = data.merge(riciters, on='riciter')

# Group data by riciter_id and split each reciter separately, so every
# reciter contributes to all three splits in the same proportions.
grouped_data = data.groupby('riciter_id')

train_parts = []
validation_parts = []
test_parts = []

for riciter_id, group in grouped_data:
    try:
        train, temp = train_test_split(
            group, test_size=holdout_ratio, random_state=1000
        )
        validation, test = train_test_split(
            temp, test_size=test_share_of_holdout, random_state=1000
        )
    except ValueError as err:
        # train_test_split rejects groups too small to yield non-empty
        # parts; say which reciter caused it instead of a bare sklearn error.
        raise ValueError(
            f'cannot split riciter_id={riciter_id} with {len(group)} row(s) '
            'into train/validation/test'
        ) from err

    train_parts.append(train)
    validation_parts.append(validation)
    test_parts.append(test)

# Concatenate once per split (instead of growing a frame inside the loop) and
# drop the helper columns that were only needed for grouping.
helper_columns = ['riciter_id', 'riciter']
train_data = pd.concat(train_parts).drop(columns=helper_columns)
validation_data = pd.concat(validation_parts).drop(columns=helper_columns)
test_data = pd.concat(test_parts).drop(columns=helper_columns)

# Save the splits into separate CSV files.
# quoting=3 (csv.QUOTE_NONE) mirrors how the source file was read, so fields
# containing '"' are written back verbatim rather than wrapped and doubled.
train_data.to_csv('data/train.csv', sep='|', index=False, header=False, quoting=3)
validation_data.to_csv('data/validation.csv', sep='|', index=False, header=False, quoting=3)
test_data.to_csv('data/test.csv', sep='|', index=False, header=False, quoting=3)
