In [17]:
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

file_path = "SDG_frequencies_absolute.txt"

# coba autodetect delimiter
df = pd.read_csv(file_path, sep=None, engine="python")

print("âœ… Dataset dimuat:", df.shape)
print(df.head())             # tampilkan 5 baris pertama
print("Kolom:", df.columns.tolist())


âœ… Dataset dimuat: (545, 27)
   Unnamed: 0                      doc_id  sdg01  sdg02  sdg03  sdg04  sdg05  \
0           1  BE0003739530_2016_SR_0.pdf      1      0      7      0      0   
1           2  BE0003739530_2020_IR_0.pdf      2      0      7      0      0   
2           3  BE0974293251_2016_SR_0.pdf      2      3      0      0      0   
3           4  BE0974293251_2020_IR_0.pdf      1      0     16      0      2   
4           5  CH0008742519_2016_SR_0.pdf      0      0      2      1      0   

   sdg06  sdg07  sdg08  ...  sdg16  sgd17  sdg  gc  gri  int       company  \
0      0      3      2  ...      3      4   12   1   13    0  BE0003739530   
1      0      1      0  ...      6      5   22   0   40    0  BE0003739530   
2      5      1      3  ...      3      5    8   0    3    0  BE0974293251   
3     11     10      4  ...      0      8   10   0    2    0  BE0974293251   
4      0     53      0  ...      3      5    3   0   39    0  CH0008742519   

   country  year  ct

In [18]:
# ambil kolom yang mengandung 'sdg'
sdg_cols = [c for c in df.columns if "sdg" in c.lower()]
print("Kolom SDG terdeteksi:", sdg_cols)

X = df[sdg_cols]
y = df[sdg_cols].idxmax(axis=1)

print("Contoh fitur:", X.iloc[0].to_dict())
print("Label:", y.iloc[0])

Kolom SDG terdeteksi: ['sdg01', 'sdg02', 'sdg03', 'sdg04', 'sdg05', 'sdg06', 'sdg07', 'sdg08', 'sdg09', 'sdg10', 'sdg11', 'sdg12', 'sdg13', 'sdg14', 'sdg15', 'sdg16', 'sdg']
Contoh fitur: {'sdg01': 1, 'sdg02': 0, 'sdg03': 7, 'sdg04': 0, 'sdg05': 0, 'sdg06': 0, 'sdg07': 3, 'sdg08': 2, 'sdg09': 0, 'sdg10': 1, 'sdg11': 0, 'sdg12': 0, 'sdg13': 3, 'sdg14': 0, 'sdg15': 0, 'sdg16': 3, 'sdg': 12}
Label: sdg


In [19]:
# 1) Load dataset
data_path = "SDG_frequencies_absolute.txt"
df = pd.read_csv(data_path, sep=None, engine="python")

print("âœ… Dataset dimuat:", df.shape)
print("Kolom:", df.columns.tolist()[:20])

# 2) Ambil hanya kolom sdg01..sdg17 sebagai fitur
sdg_cols = [c for c in df.columns if c.lower().startswith("sdg")]
print("Kolom SDG:", sdg_cols)

X = df[sdg_cols]            # fitur = frekuensi SDG
y = df[sdg_cols].idxmax(axis=1)  # label = SDG dominan

print("\nContoh fitur baris 1:\n", X.iloc[0].to_dict())
print("Label:", y.iloc[0])

# 3) Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4) Train Logistic Regression
model = LogisticRegression(max_iter=500, multi_class="multinomial", solver="lbfgs")
model.fit(X_train, y_train)

# 5) Evaluasi
y_pred = model.predict(X_test)
print("\nâœ… Akurasi:", accuracy_score(y_test, y_pred))
print("\n", classification_report(y_test, y_pred))

# 6) Simpan model
with open("csr_sdg_logreg_model.pkl", "wb") as f:
    pickle.dump(model, f)

print("\nðŸ’¾ Model berhasil disimpan sebagai csr_sdg_logreg_model.pkl")


âœ… Dataset dimuat: (545, 27)
Kolom: ['Unnamed: 0', 'doc_id', 'sdg01', 'sdg02', 'sdg03', 'sdg04', 'sdg05', 'sdg06', 'sdg07', 'sdg08', 'sdg09', 'sdg10', 'sdg11', 'sdg12', 'sdg13', 'sdg14', 'sdg15', 'sdg16', 'sgd17', 'sdg']
Kolom SDG: ['sdg01', 'sdg02', 'sdg03', 'sdg04', 'sdg05', 'sdg06', 'sdg07', 'sdg08', 'sdg09', 'sdg10', 'sdg11', 'sdg12', 'sdg13', 'sdg14', 'sdg15', 'sdg16', 'sdg']

Contoh fitur baris 1:
 {'sdg01': 1, 'sdg02': 0, 'sdg03': 7, 'sdg04': 0, 'sdg05': 0, 'sdg06': 0, 'sdg07': 3, 'sdg08': 2, 'sdg09': 0, 'sdg10': 1, 'sdg11': 0, 'sdg12': 0, 'sdg13': 3, 'sdg14': 0, 'sdg15': 0, 'sdg16': 3, 'sdg': 12}
Label: sdg

âœ… Akurasi: 0.908256880733945

               precision    recall  f1-score   support

         sdg       1.00      0.83      0.91        12
       sdg01       0.25      1.00      0.40         2
       sdg02       0.00      0.00      0.00         2
       sdg03       1.00      0.60      0.75         5
       sdg04       1.00      1.00      1.00         1
       sdg05     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [20]:
# Load model
import pickle
model = pickle.load(open("csr_sdg_logreg_model.pkl","rb"))

# Ambil contoh baris dari dataset
sample = X_test.iloc[0:1]  # satu baris fitur
print("Fitur input:", sample.to_dict())

# Prediksi
pred = model.predict(sample)
print("Prediksi:", pred)


Fitur input: {'sdg01': {76: 3}, 'sdg02': {76: 2}, 'sdg03': {76: 2}, 'sdg04': {76: 0}, 'sdg05': {76: 0}, 'sdg06': {76: 6}, 'sdg07': {76: 26}, 'sdg08': {76: 1}, 'sdg09': {76: 1}, 'sdg10': {76: 6}, 'sdg11': {76: 0}, 'sdg12': {76: 0}, 'sdg13': {76: 20}, 'sdg14': {76: 0}, 'sdg15': {76: 2}, 'sdg16': {76: 13}, 'sdg': {76: 14}}
Prediksi: ['sdg07']
