# LG01 - OBJECTIVE QUEST

# DEPENDENCIES

In [None]:
import pandas as pd
import numpy as np
import re
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from scipy.sparse import hstack, csr_matrix, save_npz, load_npz
import gc
import warnings
warnings.filterwarnings('ignore')

# PREPROCESS DATA & FEATURE ENGINEERING

In [None]:
# Load the tabular train/test splits. Both are joined below, on their `id`
# column, against the raw document texts read from disk.
df_train = pd.read_csv("/kaggle/input/dataset/train.csv")
df_test = pd.read_csv("/kaggle/input/dataset/test.csv")

# Locate the directory that holds the raw .txt documents. Two known layouts
# are tried first; if neither exists, fall back to the first directory under
# the dataset root that contains at least one .txt file.
base_dir = Path('/kaggle/input/dataset/file_putusan/file_putusan')
if not base_dir.exists():
    base_dir = Path('/kaggle/input/dataset/file_putusan')
    if not base_dir.exists():
        import os
        for root, _dirs, files in os.walk('/kaggle/input/dataset/'):
            # endswith (not `in`) so names such as "x.txt.bak" do not match.
            if any(f.endswith('.txt') for f in files):
                base_dir = Path(root)
                break

txt_files = list(base_dir.rglob("*.txt"))

# Map document id (file name without the .txt suffix) -> file content.
# If the same stem occurs in several sub-directories, the last one read wins.
text_dict = {}
for path in txt_files:
    try:
        # errors='ignore' drops undecodable bytes instead of raising.
        text_dict[path.stem] = path.read_text(encoding='utf-8', errors='ignore')
    except OSError as exc:
        # Best effort: report and skip unreadable files instead of hiding them.
        print(f"Skipping unreadable file {path}: {exc}")

text_df = pd.DataFrame(list(text_dict.items()), columns=['id', 'content'])
del text_dict
gc.collect()

# Merge keys must share a dtype; file-name ids are strings, so cast all to str.
for df in [df_train, df_test, text_df]:
    df['id'] = df['id'].astype(str)
# Drop the loop variable: it would otherwise keep the last frame (text_df)
# alive after the `del text_df` below.
del df

# Left joins keep every train/test row; rows without a document get ''.
df_train = pd.merge(df_train, text_df, on='id', how='left')
df_test = pd.merge(df_test, text_df, on='id', how='left')
df_train['content'] = df_train['content'].fillna('')
df_test['content'] = df_test['content'].fillna('')
del text_df
gc.collect()

def preprocess_text(text):
    """Normalize a raw document into lowercase, space-separated tokens.

    Missing values (anything ``pd.isna`` reports as NA) and the empty string
    yield ``''``. Otherwise the input is converted to ``str``, lowercased,
    every character outside ``a-z``, ``0-9`` and whitespace is replaced by a
    space, whitespace runs are collapsed to one space, and the ends are
    stripped.
    """
    is_blank = pd.isna(text) or text == ''
    if is_blank:
        return ''

    normalized = str(text).lower()
    # Applied in order: collapse whitespace, blank out non-alphanumerics,
    # then collapse the whitespace runs the previous step produced.
    for pattern in (r'\s+', r'[^a-z0-9\s]', r'\s+'):
        normalized = re.sub(pattern, ' ', normalized)
    return normalized.strip()

# Replace the raw `content` column with its normalized `clean_text` form.
# pop() removes the raw column and hands it to apply() in one step.
df_train['clean_text'] = df_train.pop('content').apply(preprocess_text)
df_test['clean_text'] = df_test.pop('content').apply(preprocess_text)
gc.collect()

# Train text followed by test text, re-indexed 0..n-1; the vectorization
# cells fit their vocabularies on this combined corpus.
all_text = pd.concat(
    (df_train['clean_text'], df_test['clean_text']),
    ignore_index=True,
)

# Persist everything so the frames can be released here and reloaded later.
print("Saving preprocessed data...")
df_train.to_csv('train_preprocessed.csv', index=False)
df_test.to_csv('test_preprocessed.csv', index=False)
all_text.to_csv('all_text.csv', index=False)

del df_train, df_test, all_text
gc.collect()

Saving preprocessed data...


0

# TEXT VECTORIZATION

In [None]:
# Reload the artifacts written by the preprocessing cell.
# A CSV round trip does not preserve text exactly: empty strings come back as
# NaN and all-digit texts come back as numbers, both of which make the
# vectorizers fail. dtype=str keeps the text columns textual and fillna('')
# restores empty documents.
all_text_df = pd.read_csv('all_text.csv', dtype=str)
all_text = all_text_df.iloc[:, 0].fillna('').tolist()
df_train = pd.read_csv('train_preprocessed.csv', dtype={'clean_text': str})
df_test = pd.read_csv('test_preprocessed.csv', dtype={'clean_text': str})
df_train['clean_text'] = df_train['clean_text'].fillna('')
df_test['clean_text'] = df_test['clean_text'].fillna('')
del all_text_df

# Batch 1: word-level TF-IDF over unigrams and bigrams, capped at 1500 terms.
# The vocabulary is fitted on the combined corpus in `all_text`.
vec1 = TfidfVectorizer(ngram_range=(1, 2), max_features=1500, min_df=2, max_df=0.95, sublinear_tf=True)
vec1.fit(all_text)
train_feat1 = vec1.transform(df_train['clean_text'])
test_feat1 = vec1.transform(df_test['clean_text'])

# Spill the sparse matrices to disk so memory can be released between batches.
save_npz('train_feat1.npz', train_feat1)
save_npz('test_feat1.npz', test_feat1)
print(f"Batch 1: {train_feat1.shape[1]} features saved")

del vec1, train_feat1, test_feat1
gc.collect()

Batch 1: 1500 features saved


0

In [None]:
# Batch 2: word-level TF-IDF over unigrams through trigrams, capped at 1200
# terms. The vocabulary is fitted on the combined corpus in `all_text`.
vec2 = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=1200,
    min_df=2,
    max_df=0.90,
    sublinear_tf=True,
)
vec2.fit(all_text)
train_feat2, test_feat2 = (
    vec2.transform(split['clean_text']) for split in (df_train, df_test)
)

# Spill to disk so the matrices can be released before the next batch.
save_npz('train_feat2.npz', train_feat2)
save_npz('test_feat2.npz', test_feat2)
print(f"Batch 2: {train_feat2.shape[1]} features saved")

del vec2, train_feat2, test_feat2
gc.collect()

Batch 2: 1200 features saved


0

In [None]:
# Batch 3: word-level TF-IDF over bigrams and trigrams only (no unigrams),
# capped at 1000 terms. Fitted on the combined corpus in `all_text`.
vec3 = TfidfVectorizer(
    ngram_range=(2, 3),
    max_features=1000,
    min_df=2,
    max_df=0.85,
    sublinear_tf=True,
)
vec3.fit(all_text)
train_feat3, test_feat3 = (
    vec3.transform(split['clean_text']) for split in (df_train, df_test)
)

# Spill to disk so the matrices can be released before the next batch.
save_npz('train_feat3.npz', train_feat3)
save_npz('test_feat3.npz', test_feat3)
print(f"Batch 3: {train_feat3.shape[1]} features saved")

del vec3, train_feat3, test_feat3
gc.collect()

Batch 3: 1000 features saved


0

In [None]:
# Batch 4: character-level TF-IDF over n-grams of length 2 to 4, capped at
# 800 features. Fitted on the combined corpus in `all_text`.
vec4 = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 4),
    max_features=800,
    min_df=2,
    sublinear_tf=True,
)
vec4.fit(all_text)
train_feat4, test_feat4 = (
    vec4.transform(split['clean_text']) for split in (df_train, df_test)
)

# Spill to disk so the matrices can be released before the next batch.
save_npz('train_feat4.npz', train_feat4)
save_npz('test_feat4.npz', test_feat4)
print(f"Batch 4: {train_feat4.shape[1]} features saved")

del vec4, train_feat4, test_feat4
gc.collect()

Batch 4: 800 features saved


0

In [None]:
# Batch 5: character-level TF-IDF over n-grams of length 3 to 5, capped at
# 600 features. Fitted on the combined corpus in `all_text`.
vec5 = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    max_features=600,
    min_df=2,
    sublinear_tf=True,
)
vec5.fit(all_text)
train_feat5, test_feat5 = (
    vec5.transform(split['clean_text']) for split in (df_train, df_test)
)

# Spill to disk so the matrices can be released before the next batch.
save_npz('train_feat5.npz', train_feat5)
save_npz('test_feat5.npz', test_feat5)
print(f"Batch 5: {train_feat5.shape[1]} features saved")

del vec5, train_feat5, test_feat5
gc.collect()

Batch 5: 600 features saved


0

In [None]:
# Batch 6: raw term counts (no TF-IDF weighting) over word unigrams and
# bigrams, capped at 400 terms. Fitted on the combined corpus in `all_text`.
vec6 = CountVectorizer(
    ngram_range=(1, 2),
    max_features=400,
    min_df=2,
)
vec6.fit(all_text)
train_feat6, test_feat6 = (
    vec6.transform(split['clean_text']) for split in (df_train, df_test)
)

save_npz('train_feat6.npz', train_feat6)
save_npz('test_feat6.npz', test_feat6)
print(f"Batch 6: {train_feat6.shape[1]} features saved")

# Last vectorizer: the combined corpus is released here as well.
del vec6, train_feat6, test_feat6, all_text
gc.collect()

Batch 6: 400 features saved


0

In [None]:
# Reload the six feature batches saved by the vectorization cells and stack
# them column-wise into one matrix per split. Batches are loaded in numeric
# order so train and test columns line up.
batch_ids = range(1, 7)
train_parts = [load_npz(f'train_feat{k}.npz') for k in batch_ids]
test_parts = [load_npz(f'test_feat{k}.npz') for k in batch_ids]

# Convert the stacked result to CSR, which supports the boolean row
# selection applied to the training matrix in the modelling cell.
X_train_combined = hstack(train_parts).tocsr()
X_test_combined = hstack(test_parts).tocsr()

# The per-batch matrices are no longer needed once stacked.
del train_parts, test_parts
gc.collect()

print(f"TOTAL FEATURES: {X_train_combined.shape[1]}")
print(f"Train shape: {X_train_combined.shape}")
print(f"Test shape: {X_test_combined.shape}")

TOTAL FEATURES: 5500
Train shape: (16572, 5500)
Test shape: (6666, 5500)


# MODELLING

In [None]:
# Regression target: the 'lama hukuman (bulan)' column of the training frame.
y_train = df_train['lama hukuman (bulan)'].values

# Keep only rows whose target is at or below the 95th percentile.
# NOTE(review): the validation split below is drawn from this filtered set,
# so the reported RMSE does not cover the removed high-target rows.
outlier_threshold = np.percentile(y_train, 95)
outlier_mask = y_train <= outlier_threshold

X_train_clean = X_train_combined[outlier_mask]
y_train_clean = y_train[outlier_mask]

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_val, y_train_split, y_val = train_test_split(
    X_train_clean, y_train_clean, test_size=0.20, random_state=42
)

# eval_metric is set on the estimator, next to early_stopping_rounds:
# passing it to fit() is deprecated since XGBoost 1.6 and rejected by 2.0+.
xgb_model = XGBRegressor(
    n_estimators=600,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
    eval_metric='rmse',
    early_stopping_rounds=80
)

# The validation set is listed last in eval_set; per the XGBoost docs, early
# stopping monitors the last entry. Progress is printed every 50 rounds.
xgb_model.fit(
    X_train, y_train_split,
    eval_set=[(X_train, y_train_split), (X_val, y_val)],
    verbose=50
)

y_val_pred = xgb_model.predict(X_val)
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

print(f"VALIDATION RMSE: {val_rmse:.4f}")

[0]	validation_0-rmse:26.54027	validation_1-rmse:26.16418
[50]	validation_0-rmse:11.15336	validation_1-rmse:14.19617
[100]	validation_0-rmse:9.08741	validation_1-rmse:13.86788
[150]	validation_0-rmse:7.71623	validation_1-rmse:13.76963
[200]	validation_0-rmse:6.61529	validation_1-rmse:13.69654
[250]	validation_0-rmse:5.75196	validation_1-rmse:13.65877
[300]	validation_0-rmse:5.04600	validation_1-rmse:13.61157
[350]	validation_0-rmse:4.46132	validation_1-rmse:13.58084
[400]	validation_0-rmse:3.91379	validation_1-rmse:13.56921
[450]	validation_0-rmse:3.45484	validation_1-rmse:13.55642
[500]	validation_0-rmse:3.04643	validation_1-rmse:13.54896
[550]	validation_0-rmse:2.70720	validation_1-rmse:13.53450
[599]	validation_0-rmse:2.42685	validation_1-rmse:13.53582
VALIDATION RMSE: 13.5308


## TEST PREDICTION AND MAKING SUBMISSION FILE

In [None]:
# Score the full test matrix with the fitted model.
test_predictions = xgb_model.predict(X_test_combined)

# Two-column submission: the test ids plus the predicted target, in the
# same row order as df_test.
submission_df = df_test[['id']].assign(
    **{'lama hukuman (bulan)': test_predictions}
)

submission_df.to_csv('finalhopesubmit.csv', index=False)
print("SUBMISSION CREATED!")
# The figure printed here is the hold-out validation RMSE computed earlier.
print(f"Final RMSE: {val_rmse:.4f}")

SUBMISSION CREATED!
Final RMSE: 13.5308


# Save Model

In [None]:
import pickle

# Serialize the fitted estimator with pickle and write it to the working
# directory in one step.
Path('xgb_model.pkl').write_bytes(pickle.dumps(xgb_model))