In [None]:
%pip install gensim



In [None]:
# Mount Google Drive so files stored there can be read from /content/drive

from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
# Load the preprocessed proposal and expert DataFrames from their CSV files
proposal_df = pd.read_csv('/content/drive/MyDrive/Skripsi3/Dataset/processed_proposalC.csv')
expert_df = pd.read_csv('/content/drive/MyDrive/Skripsi3/Dataset/processed_expertC.csv')

In [None]:
import numpy as np
import gensim
from gensim import corpora
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
import ast
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Load Data

In [None]:
def convert_to_list(text):
    """Turn a CSV cell holding a stringified token list back into a sequence.

    Args:
        text: Either a string such as "['kata', 'lain']", an already-parsed
            list/tuple, or a missing value (NaN/None) from an empty CSV cell.

    Returns:
        The list/tuple of tokens. An empty list is returned when the cell is
        missing, cannot be parsed, or parses to something that is not a
        list/tuple, so downstream code always receives an iterable of tokens.
    """
    # Already a token sequence: hand it back untouched.
    if isinstance(text, (list, tuple)):
        return text
    # NaN / None / numbers coming from empty CSV cells are not documents.
    if not isinstance(text, str):
        return []
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the errors literal_eval raises on malformed input are swallowed;
        # KeyboardInterrupt and friends now propagate.
        return []
    return parsed if isinstance(parsed, (list, tuple)) else []


# Parse the stringified token lists in the "stemmed" column into Python lists
proposal_df["processed"] = proposal_df["stemmed"].apply(convert_to_list)
expert_df["processed"] = expert_df["stemmed"].apply(convert_to_list)

# Combine all documents so proposals and experts share a single dictionary
documents_all = proposal_df["processed"].tolist() + expert_df["processed"].tolist()
dictionary_all = Dictionary(documents_all)

# Prune the vocabulary: keep tokens found in at least 5 documents and in at most 50% of them
dictionary_all.filter_extremes(no_below=5, no_above=0.5)

# Bag-of-words representation of every document, built from the shared dictionary
proposal_corpus = [dictionary_all.doc2bow(doc) for doc in proposal_df["processed"]]
expert_corpus = [dictionary_all.doc2bow(doc) for doc in expert_df["processed"]]

# Same ordering as documents_all: proposals first, then expert publications
corpus_all= proposal_corpus + expert_corpus

In [None]:
# Sanity check: number of documents, number of bag-of-words entries, and vocabulary size after pruning
print(len(documents_all))
print(len(corpus_all))
print(len(dictionary_all))

902
902
3406


In [None]:
# # Misalnya kita ambil satu dokumen dari corpus_all
# doc_bow = corpus_all[0]

# # # Tampilkan representasi kata dari dokumen tersebut
# for word_id, freq in doc_bow:
#     word = dictionary_all[word_id]  # ambil kata dari kamus berdasarkan indeks
#     print(f"('{word}', {freq})")

# Model LDA

In [None]:
from itertools import combinations

num_topics = 14  # or tune via the coherence score

# Train a single LDA model on the combined proposal + expert corpus
lda_model = LdaModel(
    corpus=corpus_all,
    id2word=dictionary_all,
    num_topics=num_topics,
    passes=50,
    random_state=42,
    iterations= 400,
    alpha=0.5,
    eta=0.01

)

# c_v coherence, computed against the same tokenised texts (proposals followed by expert publications)
coherence_model_lda = CoherenceModel(model=lda_model, texts=proposal_df['processed'].tolist() + expert_df['processed'].tolist(), dictionary=dictionary_all, coherence='c_v')
coherence_score = coherence_model_lda.get_coherence()
print(f"Coherence Score: {coherence_score}")

# Hitung topic diversity
def compute_topic_diversity(lda_model, topn=10):
    """Return the mean pairwise Jaccard similarity of the topics' top words.

    Despite the name, the returned value is the *similarity* between topics
    (0 = no shared top words, 1 = identical top words); a lower value means
    more diverse topics. Use ``1 - result`` for a "higher is more diverse"
    score.

    Args:
        lda_model: Object exposing ``num_topics`` and
            ``show_topic(topicid, topn=...)`` returning ``(word, prob)`` pairs.
        topn: Number of top words per topic to compare.

    Returns:
        The average Jaccard similarity over all topic pairs, rounded to four
        decimals. ``0.0`` is returned when there are fewer than two topics,
        because there are no pairs to compare.
    """
    topic_words = [
        {word for word, _ in lda_model.show_topic(topic_id, topn=topn)}
        for topic_id in range(lda_model.num_topics)
    ]

    similarities = []
    for first, second in combinations(topic_words, 2):
        union = first | second
        # Two empty word sets (e.g. topn=0) share nothing; avoid dividing by zero.
        similarities.append(len(first & second) / len(union) if union else 0.0)

    if not similarities:
        return 0.0

    avg_jaccard_similarity = sum(similarities) / len(similarities)
    return round(avg_jaccard_similarity, 4)

# NOTE: compute_topic_diversity returns the mean pairwise Jaccard similarity of the
# topics' top words, so the value printed here is a similarity (lower = more distinct topics).
diversity_score = compute_topic_diversity(lda_model, topn=10)
print(f"Diversity Score: {diversity_score}")

Coherence Score: 0.43131366590542786
Diversity Score: 0.0628


In [None]:
import pandas as pd

# Collect, for every topic, its 25 highest-probability words and their probabilities
data = []

for topic_id in range(num_topics):
    top_words = lda_model.show_topic(topic_id, topn=25)
    words = [word for word, prob in top_words]
    probs = [round(prob, 4) for word, prob in top_words]

    # Topics are numbered from 1 in the table; words and probabilities are comma-joined strings
    data.append({
        'Topik': topic_id + 1,
        'Kata Kunci': ', '.join(words),
        'Probabilitas': ', '.join(map(str, probs))
    })

df = pd.DataFrame(data)

# Show the result
# print(df)

# Save to CSV if desired
# df.to_csv('/content/drive/MyDrive/Skripsi4/dictionary/topik_kata_dan_probabilitas_14_baru_top25word.csv', index=False, encoding='utf-8-sig')



In [None]:
from gensim.matutils import sparse2full

# Helper that produces the dense topic vector of a single document
def get_topic_vector(lda_model, dictionary, document, num_topics):
    """Infer the topic distribution of one tokenised document as a dense vector.

    The document is converted to bag-of-words with ``dictionary``, its topic
    distribution is requested from ``lda_model`` with a minimum probability of
    0.00001, and the sparse ``(topic_id, probability)`` result is expanded to
    a vector of length ``num_topics``.
    """
    sparse_topics = lda_model.get_document_topics(
        dictionary.doc2bow(document),
        minimum_probability=0.00001,
    )
    dense_vector = sparse2full(sparse_topics, num_topics)
    return dense_vector

# Take the number of topics from the trained LDA model
num_topics = lda_model.num_topics

# Dense topic vector for every expert publication row
expert_df["topic_vector"] = [
    get_topic_vector(lda_model, dictionary_all, doc, num_topics)
    for doc in expert_df["processed"]
]

# Dense topic vector for every proposal row
proposal_df["topic_vector"] = [
    get_topic_vector(lda_model, dictionary_all, doc, num_topics)
    for doc in proposal_df["processed"]
]


In [None]:
import numpy as np
import pandas as pd

# Stack the per-row expert topic vectors into a matrix with one column per topic
topic_matrix_expert = np.vstack(expert_df["topic_vector"])
topic_df = pd.DataFrame(
    topic_matrix_expert,
    columns=[f"topic_{i+1}" for i in range(lda_model.num_topics)]
)

# NOTE(review): "name" and "expert_id" are added to expert_df by the author-explode step
# in the similarity section further down this notebook; unless the CSV already provides
# them, this cell only works after that step has run — confirm the intended cell order.
topic_df["research_id"] = expert_df["research_id"].values
topic_df["name"] = expert_df["name"].values
topic_df["expert_id"] = expert_df["expert_id"].values

# Use research_id as the index
topic_df.set_index("research_id", inplace=True)
# topic_df = topic_df[~topic_df.index.duplicated(keep="first")]

# (Optional) show the first 5 rows for verification
print(topic_df.head())

# print(topic_df)
# topic_df.to_csv('/content/drive/MyDrive/Skripsi4/dictionary/vektorExpert_14_baru.csv', index=True)


              topic_1   topic_2   topic_3   topic_4   topic_5   topic_6  \
research_id                                                               
R1           0.005264  0.044919  0.005749  0.005661  0.159443  0.048705   
R1           0.005264  0.044919  0.005749  0.005661  0.159443  0.048705   
R2           0.006275  0.006679  0.344286  0.005781  0.006000  0.274839   
R3           0.004788  0.004560  0.382559  0.004312  0.010355  0.004713   
R3           0.004788  0.004560  0.382559  0.004312  0.010355  0.004713   

              topic_7   topic_8   topic_9  topic_10  topic_11  topic_12  \
research_id                                                               
R1           0.376329  0.005356  0.005144  0.297490  0.005534  0.005274   
R1           0.376329  0.005356  0.005144  0.297490  0.005534  0.005274   
R2           0.041040  0.134105  0.006468  0.006391  0.007006  0.006133   
R3           0.546399  0.004910  0.005227  0.005206  0.005002  0.011567   
R3           0.546399  0

In [None]:
# Build a DataFrame from the expert topic vectors (this replaces the topic_df built earlier)
topic_matrix_expert = np.vstack(expert_df["topic_vector"])
topic_df = pd.DataFrame(topic_matrix_expert, columns=[f"topic_{i+1}" for i in range(topic_matrix_expert.shape[1])])

# Attach the expert / publication metadata
# NOTE(review): name, expert_id, author_position, author_weight and pub_year are created
# in the similarity section further down; confirm that section is meant to run before this cell.
topic_df["name"] = expert_df["name"].values
topic_df["research_id"] = expert_df["research_id"].values
topic_df["expert_id"] = expert_df["expert_id"].values
topic_df["author_position"] = expert_df["author_position"].values
topic_df["author_weight"] = expert_df["author_weight"].values
topic_df["pub_year"] = expert_df["pub_year"].values

# Add the dominant topic (column name with the highest probability) and its probability
topic_df["topik_dominan"] = topic_df[[f"topic_{i+1}" for i in range(topic_matrix_expert.shape[1])]].idxmax(axis=1)
topic_df["nilai_topik_dominan"] = topic_df[[f"topic_{i+1}" for i in range(topic_matrix_expert.shape[1])]].max(axis=1)

# Show a sample
print(topic_df[["research_id","pub_year", "expert_id","name","author_position", "author_weight", "topik_dominan", "nilai_topik_dominan"]].head(10))


  research_id  pub_year expert_id             name  author_position  \
0          R1      2015       D12          Wiharto                1   
1          R1      2015        D5       Abdul Aziz                2   
2          R2      2015        D5       Abdul Aziz                1   
3          R3      2016        D5       Abdul Aziz                1   
4          R3      2016        D1  Bambang Harjito                2   
5          R4      2017        D5       Abdul Aziz                1   
6          R4      2017        D8     Esti Suryani                2   
7          R5      2018        D5       Abdul Aziz                1   
8          R6      2018        D5       Abdul Aziz                1   
9          R7      2018        D2          Wiranto                1   

   author_weight topik_dominan  nilai_topik_dominan  
0            0.6       topic_7             0.376329  
1            0.4       topic_7             0.376329  
2            1.0       topic_3             0.344286  
3 

In [None]:
# Human-readable label for each topic column name (topic_1 .. topic_14)
topik_label = {
    "topic_1":  "Pengolahan Citra dan Transformasi Digital",
    "topic_2":  "Clustering Citra dan Ekstraksi Fitur",
    "topic_3":  "Data Mining untuk E-Commerce dan Rekomendasi",
    "topic_4":  "Deteksi dan Klasifikasi Penyakit dari Citra Medis",
    "topic_5":  "Deep Learning CNN untuk Citra dan Segmentasi",
    "topic_6":  "IoT dan Pemantauan Lingkungan Berbasis Sensor",
    "topic_7":  "Diagnosis Penyakit Menggunakan Machine Learning",
    "topic_8":  "Sistem Pendukung Keputusan dalam Dunia Pendidikan",
    "topic_9":  "Analisis Sentimen dan Teks di Media Sosial",
    "topic_10": "Klasifikasi Data Akademik dengan Metode Machine Learning",
    "topic_11": "Pemrosesan Bahasa Alami dan Representasi Teks",
    "topic_12": "Restorasi Citra dan Denoising dengan Deep Learning",
    "topic_13": "Sistem Rekomendasi dan Klasifikasi Naive Bayes",
    "topic_14": "Kriptografi, Keamanan Data, dan Enkripsi"
}

# Translate each row's dominant topic column name into its label
topic_df["label_topik_dominan"] = topic_df["topik_dominan"].map(topik_label)
# The preview below does not include the new label_topik_dominan column
print(topic_df[["research_id","pub_year", "expert_id","name","author_position", "author_weight", "topik_dominan", "nilai_topik_dominan"]].head(10))

# Persist the labelled expert topic table (the positional index is written as well)
topic_df.to_csv('/content/drive/MyDrive/Skripsi4/dictionary/labeltopik14.csv', index=True)


  research_id  pub_year expert_id             name  author_position  \
0          R1      2015       D12          Wiharto                1   
1          R1      2015        D5       Abdul Aziz                2   
2          R2      2015        D5       Abdul Aziz                1   
3          R3      2016        D5       Abdul Aziz                1   
4          R3      2016        D1  Bambang Harjito                2   
5          R4      2017        D5       Abdul Aziz                1   
6          R4      2017        D8     Esti Suryani                2   
7          R5      2018        D5       Abdul Aziz                1   
8          R6      2018        D5       Abdul Aziz                1   
9          R7      2018        D2          Wiranto                1   

   author_weight topik_dominan  nilai_topik_dominan  
0            0.6       topic_7             0.376329  
1            0.4       topic_7             0.376329  
2            1.0       topic_3             0.344286  
3 

In [None]:
# List of the topic column names
topic_cols = [f"topic_{i+1}" for i in range(topic_matrix_expert.shape[1])]

# Top-3 topics per row: column names and their probabilities, highest first
top3_topics = topic_df[topic_cols].apply(lambda row: row.sort_values(ascending=False).index[:3].tolist(), axis=1)
top3_values = topic_df[topic_cols].apply(lambda row: row.sort_values(ascending=False).values[:3].tolist(), axis=1)

# Spread the per-row lists into separate columns (.str[i] picks the i-th list element)
topic_df["topik_1"] = top3_topics.str[0]
topic_df["topik_2"] = top3_topics.str[1]
topic_df["topik_3"] = top3_topics.str[2]

topic_df["nilai_1"] = top3_values.str[0]
topic_df["nilai_2"] = top3_values.str[1]
topic_df["nilai_3"] = top3_values.str[2]

# Add the human-readable labels from topik_label
topic_df["label_1"] = topic_df["topik_1"].map(topik_label)
topic_df["label_2"] = topic_df["topik_2"].map(topik_label)
topic_df["label_3"] = topic_df["topik_3"].map(topik_label)

print(topic_df[[
    "name", "expert_id",
    "topik_1", "nilai_1", "label_1",
    "topik_2", "nilai_2", "label_2",
    "topik_3", "nilai_3", "label_3"
]].head(10))



              name expert_id   topik_1   nilai_1  \
0          Wiharto       D12   topic_7  0.376329   
1       Abdul Aziz        D5   topic_7  0.376329   
2       Abdul Aziz        D5   topic_3  0.344286   
3       Abdul Aziz        D5   topic_7  0.546399   
4  Bambang Harjito        D1   topic_7  0.546399   
5       Abdul Aziz        D5   topic_8  0.664380   
6     Esti Suryani        D8   topic_8  0.664380   
7       Abdul Aziz        D5   topic_3  0.844152   
8       Abdul Aziz        D5   topic_8  0.302608   
9          Wiranto        D2  topic_11  0.409660   

                                             label_1   topik_2   nilai_2  \
0    Diagnosis Penyakit Menggunakan Machine Learning  topic_10  0.297490   
1    Diagnosis Penyakit Menggunakan Machine Learning  topic_10  0.297490   
2       Data Mining untuk E-Commerce dan Rekomendasi   topic_6  0.274839   
3    Diagnosis Penyakit Menggunakan Machine Learning   topic_3  0.382559   
4    Diagnosis Penyakit Menggunakan Machine Lea

In [None]:
# Reshape to long format: the topik_N / nilai_N / label_N columns (N = 1..3) become
# three rows per (name, expert_id, research_id), with N stored in the "rank" column.
top3_long = pd.wide_to_long(
    topic_df,
    stubnames=["topik", "nilai", "label"],
    i=["name", "expert_id", "research_id"],  # identifier columns, including research_id
    j="rank",
    sep="_",
    # Raw string: "\d" in a plain literal is an invalid escape sequence and triggers a
    # DeprecationWarning/SyntaxWarning on recent Python versions. The regex is unchanged.
    suffix=r"\d"
).reset_index()

# Sort by expert name, then by rank
top3_long = top3_long.sort_values(["name", "rank"])
print(top3_long.head(10))
top3_long.to_csv('/content/drive/MyDrive/Skripsi4/dictionary/labeltopik14baru_rank.csv', index=True)


          name expert_id research_id  rank  \
3   Abdul Aziz        D5          R1     1   
6   Abdul Aziz        D5          R2     1   
9   Abdul Aziz        D5          R3     1   
15  Abdul Aziz        D5          R4     1   
21  Abdul Aziz        D5          R5     1   
24  Abdul Aziz        D5          R6     1   
30  Abdul Aziz        D5          R7     1   
33  Abdul Aziz        D5          R8     1   
45  Abdul Aziz        D5          R9     1   
48  Abdul Aziz        D5         R10     1   

                                  label_topik_dominan  nilai_topik_dominan  \
3     Diagnosis Penyakit Menggunakan Machine Learning             0.376329   
6        Data Mining untuk E-Commerce dan Rekomendasi             0.344286   
9     Diagnosis Penyakit Menggunakan Machine Learning             0.546399   
15  Sistem Pendukung Keputusan dalam Dunia Pendidikan             0.664380   
21       Data Mining untuk E-Commerce dan Rekomendasi             0.844152   
24  Sistem Pendukung Kepu

In [None]:
# Stack the per-proposal topic vectors into a matrix with one column per topic
topic_matrix_proposal = np.vstack(proposal_df["topic_vector"])
topic_proposal_df = pd.DataFrame(topic_matrix_proposal, columns=[f"topic_{i+1}" for i in range(lda_model.num_topics)])
topic_proposal_df["proposal_id"] = proposal_df["proposal_id"].values

# Use proposal_id as the index
topic_proposal_df.set_index("proposal_id", inplace=True)
print(topic_proposal_df.head())

# print(topic_proposal_df)
# topic_proposal_df.to_csv('/content/drive/MyDrive/Skripsi4/dictionary/vektorProposal_14_baru.csv', index=True)

              topic_1   topic_2   topic_3   topic_4   topic_5   topic_6  \
proposal_id                                                               
P1           0.000580  0.000521  0.000403  0.001298  0.000521  0.000451   
P10          0.000347  0.703296  0.089274  0.000321  0.000346  0.001877   
P11          0.751460  0.000321  0.005268  0.000265  0.160344  0.000319   
P12          0.037477  0.000466  0.000384  0.000409  0.069484  0.002810   
P13          0.001201  0.000586  0.000490  0.000477  0.000512  0.000465   

              topic_7   topic_8   topic_9  topic_10  topic_11  topic_12  \
proposal_id                                                               
P1           0.000380  0.000501  0.000876  0.000609  0.010455  0.000578   
P10          0.000502  0.130369  0.000378  0.000491  0.000416  0.000290   
P11          0.000353  0.003129  0.000395  0.004616  0.000436  0.072385   
P12          0.000346  0.000331  0.000339  0.000364  0.000427  0.886446   
P13          0.000497  0

In [None]:
# Add the dominant topic (column with the highest probability), its probability, and its label
topic_proposal_df["topik_dominan"] = topic_proposal_df[[f"topic_{i+1}" for i in range(lda_model.num_topics)]].idxmax(axis=1)
topic_proposal_df["nilai_topik_dominan"] = topic_proposal_df[[f"topic_{i+1}" for i in range(lda_model.num_topics)]].max(axis=1)
topic_proposal_df["label_topik_dominan"] = topic_proposal_df["topik_dominan"].map(topik_label)
print(topic_proposal_df.head())
# Persist the proposal topic table, indexed by proposal_id
topic_proposal_df.to_csv('/content/drive/MyDrive/Skripsi4/dictionary/vektorProposal_topikdominan_14_baru.csv', index=True)

              topic_1   topic_2   topic_3   topic_4   topic_5   topic_6  \
proposal_id                                                               
P1           0.000580  0.000521  0.000403  0.001298  0.000521  0.000451   
P10          0.000347  0.703296  0.089274  0.000321  0.000346  0.001877   
P11          0.751460  0.000321  0.005268  0.000265  0.160344  0.000319   
P12          0.037477  0.000466  0.000384  0.000409  0.069484  0.002810   
P13          0.001201  0.000586  0.000490  0.000477  0.000512  0.000465   

              topic_7   topic_8   topic_9  topic_10  topic_11  topic_12  \
proposal_id                                                               
P1           0.000380  0.000501  0.000876  0.000609  0.010455  0.000578   
P10          0.000502  0.130369  0.000378  0.000491  0.000416  0.000290   
P11          0.000353  0.003129  0.000395  0.004616  0.000436  0.072385   
P12          0.000346  0.000331  0.000339  0.000364  0.000427  0.886446   
P13          0.000497  0

# Kemiripan

In [None]:
# Integer year columns used by the time filter and the time-decay factor
proposal_df["tahun"] = proposal_df["proposal_year"].astype(int)
expert_df["pub_year"] = expert_df["research_pub_year"].astype(int)


def similarity_m_to_d(proposal_vector, expert_vector):
    """Directed proposal→expert similarity.

    Dot product of the two vectors divided by the Euclidean norm of the
    proposal vector only; 1e-10 is added to the norm so an all-zero proposal
    vector does not divide by zero.
    """
    proposal_norm = np.linalg.norm(proposal_vector) + 1e-10
    overlap = np.dot(proposal_vector, expert_vector)
    return overlap / proposal_norm


def similarity_d_to_m(expert_vector, proposal_vector):
    """Directed expert→proposal similarity.

    Dot product of the two vectors divided by the Euclidean norm of the
    expert vector only; 1e-10 is added to the norm so an all-zero expert
    vector does not divide by zero.
    """
    expert_norm = np.linalg.norm(expert_vector) + 1e-10
    overlap = np.dot(expert_vector, proposal_vector)
    return overlap / expert_norm

 # denominator = np.linalg.norm(expert_vector)

def time_decay(year_prop, year_ex, t=1, gamma=0.1):
    """Linear time-decay factor for a publication relative to a proposal.

    The factor is ``1 - ((year_prop - year_ex) / t) * gamma`` clamped to the
    range [0, 1]: it drops by ``gamma`` for every ``t`` years between the
    publication and the proposal.

    Args:
        year_prop: Year of the proposal.
        year_ex: Publication year of the expert's research.
        t: Length of one decay step in years; must be positive.
        gamma: Amount subtracted per step.

    Returns:
        A factor between 0.0 and 1.0.

    Raises:
        ValueError: If ``t`` is not positive.
    """
    if t <= 0:
        raise ValueError("t must be a positive number of years")
    decay = 1 - ((year_prop - year_ex) / t) * gamma
    # Lower bound: the factor must never be negative.
    # Upper bound: a publication dated after the proposal must not be boosted above 1.
    return min(max(decay, 0.0), 1.0)

# Mapping between expert (lecturer) names and expert IDs

dosen_id_map = pd.read_csv("/content/drive/MyDrive/Skripsi3/Dataset/mapping.csv")  # expected columns: expert_id, expert_name
# Normalise names (trimmed, lower-case) so they can be matched against author names
dosen_id_map["expert_name"] = dosen_id_map["expert_name"].str.strip().str.lower()


def explode_authors_with_weights(df, dosen_id_map):
    """Expand each publication row into one row per author, with an author weight.

    Args:
        df: DataFrame with an ``authors`` column holding a list of author
            names per row. Missing/NaN cells and blank or non-string entries
            are ignored; a bare string is treated as a single author name.
        dosen_id_map: DataFrame with ``expert_name`` and ``expert_id`` columns.
            Names are matched case-insensitively after trimming whitespace.
            The caller's frame is not modified.

    Returns:
        DataFrame containing every original column plus ``name``,
        ``author_position`` (1-based), ``num_authors``, ``author_weight`` and
        ``expert_id`` (``None`` when the author is not in the mapping).
        A single author gets weight 1.0; otherwise the first author gets 0.6
        and the remaining 0.4 is split evenly among the co-authors.
    """
    # Build the name -> id lookup once instead of filtering the mapping frame
    # for every author. setdefault keeps the first occurrence of a duplicated name.
    normalised_names = dosen_id_map["expert_name"].str.strip().str.lower()
    expert_id_by_name = {}
    for expert_name, expert_id in zip(normalised_names, dosen_id_map["expert_id"].values):
        if isinstance(expert_name, str):
            expert_id_by_name.setdefault(expert_name, expert_id)

    rows = []
    for _, row in df.iterrows():
        raw_authors = row.get("authors", [])
        if isinstance(raw_authors, str):
            raw_authors = [raw_authors]
        elif not hasattr(raw_authors, "__iter__"):
            # NaN / None from a missing cell: this publication has no usable authors.
            raw_authors = []

        # Drop empty or NaN names
        authors = [a for a in raw_authors if isinstance(a, str) and a.strip() != ""]
        num_authors = len(authors)
        base_row = row.to_dict()

        for idx, name in enumerate(authors):
            if num_authors == 1:
                weight = 1.0
            else:
                weight = 0.6 if idx == 0 else 0.4 / (num_authors - 1)

            new_row = dict(base_row)
            new_row["name"] = name
            new_row["author_position"] = idx + 1
            new_row["num_authors"] = num_authors
            new_row["author_weight"] = round(weight, 4)
            new_row["expert_id"] = expert_id_by_name.get(name.strip().lower())
            rows.append(new_row)

    return pd.DataFrame(rows)


# Combine author_1 .. author_6 into a single list per row
author_cols = ["author_1", "author_2", "author_3", "author_4", "author_5", "author_6"]
expert_df["authors"] = expert_df[author_cols].values.tolist()

# Keep one row per research_id before exploding by author
expert_df = expert_df.drop_duplicates(subset=["research_id"]).copy()

# Explode: one row per (publication, author) with author weights and expert IDs
expert_df = explode_authors_with_weights(expert_df, dosen_id_map)

# Keep only rows whose author was matched to an expert_id
expert_df = expert_df[expert_df["expert_id"].notna()]




## Directed M->D

In [None]:
def compute_od_m_to_d(proposals_df, experts_df):
    """Score every proposal against every expert publication row (direction M→D).

    For each (proposal, expert row) pair the score is
    ``similarity_m_to_d(proposal_vector, expert_vector) * author_weight``,
    rounded to four decimals and stored in the ``OD(M→D)`` column.

    Args:
        proposals_df: DataFrame with ``topic_vector``, ``student_id``,
            ``proposal_id`` and ``tahun`` columns.
        experts_df: DataFrame with ``topic_vector``, ``name``, ``expert_id``,
            ``research_id``, ``pub_year``, ``author_weight`` and
            ``author_position`` columns.

    Returns:
        DataFrame with one row per pair, sorted by ``id_proposal`` ascending
        and ``OD(M→D)`` descending. When either input is empty the result is
        an empty frame that still carries the expected columns.
    """
    columns = [
        "id_proposal", "id_penelitian", "mahasiswa", "dosen", "id_dosen",
        "posisi_author", "author_weight", "tahun_proposal", "tahun_penelitian",
        "OD(M→D)",
    ]
    all_results = []

    for _, proposal in proposals_df.iterrows():
        proposal_vector = proposal["topic_vector"]
        mahasiswa = proposal["student_id"]
        id_proposal = proposal["proposal_id"]
        tahun_proposal = proposal["tahun"]

        # Iterate the frame passed in by the caller, not the notebook-level
        # `expert_df` global the previous version silently read.
        for _, expert in experts_df.iterrows():
            # Indexing (rather than .get) gives a clear KeyError if the weight column is missing.
            weight = expert["author_weight"]
            sim_mahasiswa = similarity_m_to_d(proposal_vector, expert["topic_vector"])
            score = sim_mahasiswa * weight

            all_results.append({
                "id_proposal": id_proposal,
                "id_penelitian": expert["research_id"],
                "mahasiswa": mahasiswa,
                "dosen": expert["name"],
                "id_dosen": expert["expert_id"],
                "posisi_author": expert["author_position"],
                "author_weight": weight,
                "tahun_proposal": tahun_proposal,
                "tahun_penelitian": expert["pub_year"],
                "OD(M→D)": round(score, 4)
            })

    # Passing `columns` keeps sort_values from failing with a KeyError on empty results.
    df_scores = pd.DataFrame(all_results, columns=columns)
    df_scores = df_scores.sort_values(by=["id_proposal", "OD(M→D)"], ascending=[True, False])
    return df_scores


In [None]:
# Directed M→D overlap score of every proposal against every expert publication row
od_m2d_df = compute_od_m_to_d(proposal_df, expert_df)
# print(od_m2d_df[["id_proposal", "mahasiswa","id_penelitian","dosen","id_dosen","posisi_author","author_weight", "OD(M→D)"]].head(30))

# od_m2d_df.to_csv("/content/drive/MyDrive/Skripsi4/topik/directed/14_baru/od_m2d_df_topik.csv", index=False)

## TOD M->D

In [None]:
# Compute TOD(M→D)
def compute_tod_m_to_d(od_df, t=1, gamma=0.1, max_year_diff=5):
    """Apply the year filter and time decay to the M→D overlap scores.

    Args:
        od_df: DataFrame with ``tahun_proposal``, ``tahun_penelitian`` and
            ``OD(M→D)`` columns. The input is not modified.
        t: Decay step in years, forwarded to ``time_decay``.
        gamma: Decay amount per step, forwarded to ``time_decay``.
        max_year_diff: Largest allowed gap (in years) between proposal and
            publication.

    Returns:
        A copy of the rows whose year gap lies in ``[0, max_year_diff]``,
        with ``selisih_tahun``, ``time_decay_factor`` and ``TOD(M→D)``
        (``OD(M→D) * time_decay_factor``, rounded to four decimals) added.
    """
    od_df = od_df.copy()

    # Year gap between the proposal and the publication
    od_df["selisih_tahun"] = od_df["tahun_proposal"] - od_df["tahun_penelitian"]

    # Keep publications that are not newer than the proposal and at most max_year_diff years older.
    # NOTE(review): same-year publications (gap 0) are kept here, whereas compute_od_d_to_m
    # requires a gap strictly greater than 0 — confirm which rule is intended.
    od_df = od_df[od_df["selisih_tahun"].between(0, max_year_diff)].copy()

    # Built from a list (not DataFrame.apply(axis=1)) so the assignment also works
    # when no row survives the filter; dtype=float keeps the column numeric even then.
    od_df["time_decay_factor"] = pd.Series(
        [
            time_decay(year_prop, year_ex, t=t, gamma=gamma)
            for year_prop, year_ex in zip(od_df["tahun_proposal"], od_df["tahun_penelitian"])
        ],
        index=od_df.index,
        dtype=float,
    )

    # Time-decayed overlap score
    od_df["TOD(M→D)"] = (od_df["OD(M→D)"] * od_df["time_decay_factor"]).round(4)

    return od_df



# # Hitung TOD(M→D) sebelum di sortir
# tod_m2d_before_sorted= compute_tod_m_to_d(od_m2d_df, t=1, gamma=0.1)

# # Lihat contoh hasil
# print(tod_m2d_before_sorted[["id_proposal", "id_penelitian","dosen","id_dosen", "OD(M→D)", "tahun_proposal","tahun_penelitian","selisih_tahun", "time_decay_factor", "TOD(M→D)"]].head(5))

# tod_m2d_before_sorted.to_csv("/kaggle/working/tod_m2d_before_sorted.csv", index=False)



In [None]:
# Compute TOD(M→D), then keep the best-scoring publication per (proposal, expert) pair
tod_m2d_df= compute_tod_m_to_d(od_m2d_df, t=1, gamma=0.1, max_year_diff=5)

# idxmax picks, for each (id_proposal, id_dosen) group, the row label with the highest TOD(M→D);
# the result is then ordered by proposal and descending score.
tod_m2d_df = (
    tod_m2d_df
    .loc[tod_m2d_df.groupby(["id_proposal", "id_dosen"])["TOD(M→D)"].idxmax()]
    .reset_index(drop=True)
    .sort_values(by=["id_proposal", "TOD(M→D)"], ascending=[True, False])
)


# Lihat contoh hasil

# print(tod_m2d_df[["id_proposal","mahasiswa", "id_penelitian","dosen","id_dosen", "OD(M→D)", "tahun_proposal","tahun_penelitian","selisih_tahun", "time_decay_factor", "TOD(M→D)"]].head(5))

# tod_m2d_df.to_csv("/kaggle/working/tod_m2d_df.csv", index=False)
# tod_m2d_df.to_csv("/content/drive/MyDrive/Skripsi4/topik/directed/14_baru/tod_m2d_df_topik.csv", index=False)

In [None]:
# dupes_todm_d = tod_m2d_df[tod_m2d_df.duplicated(subset=["id_proposal","dosen", "TOD(M→D)"], keep=False)]
# print(dupes_todm_d)

# OD D->M

In [None]:
def compute_od_d_to_m(proposals_df, experts_df, max_year_diff=5):
    """Score every expert publication row against every proposal (direction D→M).

    A pair is scored only when the publication is strictly older than the
    proposal and at most ``max_year_diff`` years older. The score is
    ``similarity_d_to_m(expert_vector, proposal_vector) * author_weight``,
    rounded to four decimals and stored in ``OD(D→M)``. Only the
    best-scoring publication is kept per (proposal, expert) pair.

    Args:
        proposals_df: DataFrame with ``topic_vector``, ``student_id``,
            ``proposal_id`` and ``tahun`` columns.
        experts_df: DataFrame with ``topic_vector``, ``name``, ``expert_id``,
            ``research_id``, ``author_weight``, ``author_position`` and
            ``pub_year`` columns.
        max_year_diff: Largest allowed year gap (default 5, the value that was
            previously hard-coded).

    Returns:
        DataFrame sorted by ``id_dosen`` ascending and ``OD(D→M)`` descending;
        an empty frame with the expected columns when no pair qualifies.
    """
    columns = [
        "id_proposal", "id_penelitian", "mahasiswa", "dosen", "id_dosen",
        "posisi_author", "author_weight", "tahun_proposal", "tahun_penelitian",
        "OD(D→M)",
    ]
    results = []

    for _, expert in experts_df.iterrows():
        expert_vector = expert["topic_vector"]
        weight = expert["author_weight"]
        tahun_penelitian = expert["pub_year"]

        for _, proposal in proposals_df.iterrows():
            tahun_proposal = proposal["tahun"]

            # Year filter. NOTE(review): same-year publications (gap 0) are excluded here,
            # while compute_tod_m_to_d keeps them — confirm which rule is intended.
            selisih_tahun = tahun_proposal - tahun_penelitian
            if not (0 < selisih_tahun <= max_year_diff):
                continue

            sim_dosen = similarity_d_to_m(expert_vector, proposal["topic_vector"])
            score = sim_dosen * weight
            results.append({
                "id_proposal": proposal["proposal_id"],
                "id_penelitian": expert["research_id"],
                "mahasiswa": proposal["student_id"],
                "dosen": expert["name"],
                "id_dosen": expert["expert_id"],
                "posisi_author": expert["author_position"],
                "author_weight": weight,
                "tahun_proposal": tahun_proposal,
                "tahun_penelitian": tahun_penelitian,
                "OD(D→M)": round(score, 4),
            })

    df_scores = pd.DataFrame(results, columns=columns)
    if df_scores.empty:
        # Nothing passed the year filter; skip the groupby, which needs at least one row.
        return df_scores

    # Keep the highest-scoring publication for each (proposal, expert) pair
    best_rows = df_scores.groupby(["id_proposal", "id_dosen"])["OD(D→M)"].idxmax()
    df_scores = df_scores.loc[best_rows].reset_index(drop=True)
    df_scores = df_scores.sort_values(by=["id_dosen", "OD(D→M)"], ascending=[True, False])

    return df_scores


In [None]:
#Setelah di Sorted
# od_d2m_df = (
#     od_d2m_before_sorted
#     .sort_values(by=["id_proposal", "OD(D→M)"], ascending=[True, False])
#     # .drop_duplicates(subset=["id_proposal"], keep="first")  #nnti comment ini
#     .reset_index(drop=True)
# )

# Filter nilai di atas 0.3
# od_d2m_df = od_d2m_df.loc[od_d2m_df.groupby(["id_proposal", "id_dosen"])["OD(D→M)"].idxmax()].reset_index(drop=True).sort_values(by=["id_proposal", "OD(D→M)"], ascending=[True, False])

# Directed D→M overlap scores; compute_od_d_to_m keeps the best score per (proposal, expert) pair
od_d2m_df = compute_od_d_to_m(proposal_df, expert_df)
# print(od_d2m_df[["id_proposal", "mahasiswa","id_penelitian", "dosen", "id_dosen","posisi_author","author_weight","OD(D→M)"]].head(5))

# od_d2m_df.to_csv("/kaggle/working/od_d2m_df.csv", index=False)
# od_d2m_df.to_csv("/content/drive/MyDrive/Skripsi4/topik/directed/14_baru/od_d2m_df_topik.csv", index=False)


# Overlap

In [None]:
# def combine_overlap_scores(df_m2d, df_d2m):
#     # Ambil Top-N dari masing-masing arah
#     top_m2d = df_m2d.groupby("id_proposal")[["id_proposal","mahasiswa", "id_dosen", "dosen", "TOD(M→D)"]]
#     top_d2m = df_d2m.groupby("id_proposal")[["id_proposal","mahasiswa", "id_dosen", "dosen", "OD(D→M)"]]

#     # Outer join untuk semua pasangan top-n
#     merged = pd.merge(top_m2d, top_d2m, on=["id_proposal", "id_dosen"], how="outer")

#     # Tambahkan kolom nama dosen jika hilang (dari M→D arah saja)
#     if "dosen_x" in merged.columns:
#         merged["dosen"] = merged["dosen_x"].combine_first(merged.get("dosen_y"))
#     elif "dosen" not in merged.columns:
#         merged["dosen"] = None

#     if "mahasiswa_x" in merged.columns:
#         merged["mahasiswa"] = merged["mahasiswa_x"].combine_first(merged.get("mahasiswa_y"))
#     elif "mahasiswa" not in merged.columns:
#         merged["mahasiswa"] = None


#     # Ganti NaN skor dengan 0 agar bisa dihitung rata-ratanya
#     merged["TOD(M→D)"] = merged["TOD(M→D)"].fillna(0)
#     merged["OD(D→M)"] = merged["OD(D→M)"].fillna(0)

#     # Tandai overlap jika dosen muncul di kedua arah
#     merged["overlap"] = (merged["TOD(M→D)"] > 0) & (merged["OD(D→M)"] > 0)

#     # Hitung skor rata-rata (hanya jika overlap)
#     merged["skor_rata2"] = merged.apply(
#         lambda row: (row["TOD(M→D)"] + row["OD(D→M)"]) / 2 if row["overlap"] else 0, axis=1
#     )

#     # Ambil skor tertinggi per proposal dan dosen
#     final_scores = merged.sort_values(by=["id_proposal", "skor_rata2"], ascending=[True, False])
#     final_scores = final_scores[["id_proposal", "mahasiswa","dosen", "id_dosen", "TOD(M→D)","OD(D→M)", "skor_rata2", "overlap"]]

#     return final_scores

# df_final = combine_overlap_scores(tod_m2d_df, od_d2m_df)
# # df_final = combine_overlap_scores(tod_m2d_before_sorted, od_d2m_before_sorted, top_n=10)

# # idx = df_final.groupby(["id_proposal", "id_dosen"])["skor_rata2"].idxmax()
# # df_final = df_final.loc[idx]

# # print(df_final.head(5))

# df_final.to_csv("/content/drive/MyDrive/Skripsi3/topik/directedbaru/30/overlap_directed_topik.csv", index=False)

In [None]:
import pandas as pd

def combine_overlap_scores(df_m2d, df_d2m):
    """Merge the M→D and D→M scores and average them where both directions agree.

    Args:
        df_m2d: DataFrame with ``id_proposal``, ``mahasiswa``, ``id_dosen``,
            ``dosen`` and ``TOD(M→D)`` columns.
        df_d2m: DataFrame with ``id_proposal``, ``mahasiswa``, ``id_dosen``,
            ``dosen`` and ``OD(D→M)`` columns.

    Returns:
        DataFrame with one row per (proposal, expert) pair found in either
        input, sorted by ``id_proposal`` ascending and ``skor_rata2``
        descending. Missing scores are filled with 0, ``overlap`` is True
        when both scores are greater than 0, and ``skor_rata2`` is the mean
        of the two scores for overlapping pairs and 0 otherwise.
    """
    # Columns needed from each direction
    top_m2d = df_m2d[["id_proposal", "mahasiswa", "id_dosen", "dosen", "TOD(M→D)"]]
    top_d2m = df_d2m[["id_proposal", "mahasiswa", "id_dosen", "dosen", "OD(D→M)"]]

    # Outer join so pairs present in only one direction are kept
    merged = pd.merge(top_m2d, top_d2m, on=["id_proposal", "id_dosen"], how="outer", suffixes=('_m2d', '_d2m'))

    # Student and expert names: take the M→D side, falling back to the D→M side
    merged["mahasiswa"] = merged["mahasiswa_m2d"].combine_first(merged["mahasiswa_d2m"])
    merged["dosen"] = merged["dosen_m2d"].combine_first(merged["dosen_d2m"])

    # A score missing from one direction counts as 0
    merged["TOD(M→D)"] = merged["TOD(M→D)"].fillna(0)
    merged["OD(D→M)"] = merged["OD(D→M)"].fillna(0)

    # Overlap: the pair has a positive score in both directions
    merged["overlap"] = (merged["TOD(M→D)"] > 0) & (merged["OD(D→M)"] > 0)

    # Vectorised mean instead of a row-wise apply, so an empty merge is handled too
    mean_score = (merged["TOD(M→D)"] + merged["OD(D→M)"]) / 2
    merged["skor_rata2"] = mean_score.where(merged["overlap"], 0.0)

    # Highest average score first within each proposal
    final_scores = merged.sort_values(by=["id_proposal", "skor_rata2"], ascending=[True, False])

    return final_scores[[
        "id_proposal", "mahasiswa", "dosen", "id_dosen",
        "TOD(M→D)", "OD(D→M)", "skor_rata2", "overlap"
    ]]

# Contoh penggunaan:
df_final = combine_overlap_scores(tod_m2d_df, od_d2m_df)
# df_final.to_csv("/content/drive/MyDrive/Skripsi4/topik/directed/14_baru/overlap_directed_topik.csv", index=False)


# Ranking

In [None]:
def combine_overlap_scores_with_ranking(df_m2d, df_d2m):
    """Merge the M→D and D→M scores, average overlapping pairs, and rank experts per proposal.

    Args:
        df_m2d: DataFrame with ``id_proposal``, ``id_dosen``, ``mahasiswa``,
            ``dosen`` and ``TOD(M→D)`` columns.
        df_d2m: DataFrame with ``id_proposal``, ``id_dosen``, ``mahasiswa``,
            ``dosen`` and ``OD(D→M)`` columns.

    Returns:
        DataFrame sorted by ``id_proposal`` and ``rank``. Missing scores are
        filled with 0, ``overlap`` is True when both scores are positive,
        ``skor_rata2`` is the mean of both scores for overlapping pairs and 0
        otherwise, and ``rank`` is the dense rank of ``skor_rata2`` within each
        proposal (1 = highest; equal scores share a rank).
    """
    m2d = df_m2d[["id_proposal", "id_dosen", "mahasiswa", "dosen", "TOD(M→D)"]]
    d2m = df_d2m[["id_proposal", "id_dosen", "mahasiswa", "dosen", "OD(D→M)"]]

    # Outer join so every pair from either direction appears. Both inputs carry
    # "dosen" and "mahasiswa", so the merge always yields the _x/_y variants.
    merged = pd.merge(m2d, d2m, on=["id_proposal", "id_dosen"], how="outer", suffixes=("_x", "_y"))

    # Names: take the M→D side, falling back to the D→M side
    merged["dosen"] = merged["dosen_x"].combine_first(merged["dosen_y"])
    merged["mahasiswa"] = merged["mahasiswa_x"].combine_first(merged["mahasiswa_y"])

    # A score missing from one direction counts as 0
    merged["TOD(M→D)"] = merged["TOD(M→D)"].fillna(0)
    merged["OD(D→M)"] = merged["OD(D→M)"].fillna(0)

    # Overlap: the pair has a positive score in both directions
    merged["overlap"] = (merged["TOD(M→D)"] > 0) & (merged["OD(D→M)"] > 0)

    # Vectorised mean instead of a row-wise apply, so an empty merge is handled too
    mean_score = (merged["TOD(M→D)"] + merged["OD(D→M)"]) / 2
    merged["skor_rata2"] = mean_score.where(merged["overlap"], 0.0)

    # Dense rank of the average score within each proposal
    merged["rank"] = (
        merged.groupby("id_proposal")["skor_rata2"]
        .rank(ascending=False, method="dense")
        .astype(int)
    )

    # Sort and select once (the earlier version repeated this step on its own result)
    return merged.sort_values(by=["id_proposal", "rank"])[
        ["id_proposal", "mahasiswa", "dosen", "id_dosen", "TOD(M→D)", "OD(D→M)", "skor_rata2", "overlap", "rank"]
    ]


# Rank the experts for every proposal using the combined directed scores
df_peringkat = combine_overlap_scores_with_ranking(tod_m2d_df, od_d2m_df)
# Keep only the pairs that have a positive score in both directions (overlap == True)
df_overlap_true = df_peringkat[df_peringkat["overlap"] == True]
# print(df_overlap_true.head(20))

# df_overlap_true.to_csv("/content/drive/MyDrive/Skripsi4/topik/directed/14_baru/rank_overlap_true_kata.csv", index=False)


In [None]:
from collections import defaultdict

# A lecturer may be the rank-1 recommendation for at most this many proposals.
MAX_RANK1_LOAD = 15
# Number of recommendation slots kept per proposal (rank 1 .. MAX_RANK).
MAX_RANK = 17

# 1. Per-lecturer counter of rank-1 assignments and the collected rank-1 rows.
rank1_count_directed = defaultdict(int)
final_assignment_directed = []

# Split the table by proposal once instead of re-filtering the whole frame for every proposal.
# sort=False keeps the proposals in order of first appearance (the order .unique() would give),
# which matters because the capacity-limited assignment below is greedy.
proposal_groups = list(df_overlap_true.groupby("id_proposal", sort=False))
rank1_dosen_by_proposal = {}

# 2. Rank 1: the best-scoring lecturer who still has capacity left.
for pid, group in proposal_groups:
    candidates = group.sort_values(by="skor_rata2", ascending=False)

    chosen = None
    for _, row in candidates.iterrows():
        if rank1_count_directed[row["id_dosen"]] < MAX_RANK1_LOAD:
            chosen = row
            break
    if chosen is None:
        # Every candidate is already at capacity: fall back to the top scorer anyway.
        chosen = candidates.iloc[0]

    dosen_id = chosen["id_dosen"]
    rank1_count_directed[dosen_id] += 1
    row_data = chosen.to_dict()
    row_data["rank"] = 1
    row_data["beban"] = rank1_count_directed[dosen_id]
    final_assignment_directed.append(row_data)
    rank1_dosen_by_proposal[pid] = dosen_id

# 3. Rank-1 rows as a DataFrame.
rank1_directed = pd.DataFrame(final_assignment_directed)

# 4. Ranks 2..MAX_RANK: remaining lecturers by descending score, excluding the lecturer that
#    already holds rank 1 for the same proposal.
other_ranks = []

for pid, group in proposal_groups:
    used_dosen = [rank1_dosen_by_proposal[pid]]
    runners_up = group[~group["id_dosen"].isin(used_dosen)]
    runners_up = runners_up.sort_values(by="skor_rata2", ascending=False).head(MAX_RANK - 1)

    for rank, (_, row) in enumerate(runners_up.iterrows(), start=2):
        row_data = row.to_dict()
        row_data["rank"] = rank
        # "beban" (load) only counts rank-1 assignments.
        row_data["beban"] = rank1_count_directed[row_data["id_dosen"]]
        other_ranks.append(row_data)

# 5. Combine rank 1 with the remaining ranks.
df_ranked_filtered = pd.concat([rank1_directed, pd.DataFrame(other_ranks)], ignore_index=True)
df_ranked_filtered = df_ranked_filtered.sort_values(by=["id_proposal", "rank"])

# Defensive cap at MAX_RANK lecturers per proposal (the loop above already stops there).
df_ranked_filtered = df_ranked_filtered[df_ranked_filtered["rank"] <= MAX_RANK]

# print(df_ranked_filtered)

# df_ranked_filtered.to_csv("/kaggle/working/df_ranked_filtered_coba.csv", index=False)
# df_ranked_filtered.to_csv("/content/drive/MyDrive/Skripsi4/topik/directed/14_baru/beban_directed_topik.csv", index=False)

# Evaluasi

In [None]:
from sklearn.metrics import precision_score, recall_score

# Ground truth: the examiners that were actually assigned to each proposal.
true_label_df = pd.read_csv("/content/drive/MyDrive/Skripsi3/Dataset/true_labels.csv")

# (disabled) Collect the three true examiners into one list column:
# true_label_df["true_dosens"] = true_label_df.apply(lambda row: [
#     row["author1"], row["author2"], row["author3"]
# ], axis=1)


# Copy examiner_1..3 into author1..3 as whitespace-stripped strings (no case change).
# NOTE(review): astype(str) turns a missing examiner into the literal string "nan" - confirm
# the label file never has empty examiner cells.
for slot in (1, 2, 3):
    true_label_df[f"author{slot}"] = true_label_df[f"examiner_{slot}"].astype(str).str.strip()



coba

## Evaluasi Baru

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Contoh struktur dummy data untuk demonstrasi perbaikan fungsi (tidak dijalankan secara nyata di sini)
# similarity_cosine_df = pd.read_csv(...)  # Format: id_proposal, rank, similarity_score_akhir, dosen
# true_label_df = pd.read_csv(...)  # Format: proposal_id, author1, author2, author3

def evaluate_ordered_recommendation_directed(rank_all_df, true_label_df, top_ns=(3, 5, 7, 10)):
    """Summarise how well ranked examiner recommendations match the true examiners.

    For every cut-off N in ``top_ns`` the recommendations with ``rank <= N`` are pivoted to one
    row per proposal (columns ``rec_<rank>``), left-joined with the ground truth and scored.

    Parameters
    ----------
    rank_all_df : pandas.DataFrame
        Must contain ``id_proposal``, ``rank``, ``skor_rata2`` and ``dosen``.
    true_label_df : pandas.DataFrame
        Must contain ``proposal_id`` (or ``id_proposal``) and ``author1`` .. ``author3``.
    top_ns : iterable of int
        Cut-offs to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per cut-off with the columns ``Top-N``, ``Mean_Recall_Existence`` (true examiners
        found anywhere in the top N, always divided by 3), ``Recall_Pos_{1,2,3}_Ordered`` (share
        of proposals where ``rec_k`` equals ``author_k``) and ``Avg_Normalized_Euclidean``
        (positional distance, min-max scaled over the evaluated proposals).
    """
    summary = []
    # The ground-truth table is the same for every cut-off, so rename its key only once.
    truth_df = true_label_df.rename(columns={"proposal_id": "id_proposal"})

    for TOP_N in top_ns:
        top_n_df = rank_all_df[rank_all_df["rank"] <= TOP_N]
        rec_df = top_n_df.sort_values(
            by=["id_proposal", "rank", "skor_rata2"], ascending=[True, True, False]
        )
        # pivot() raises on a repeated (id_proposal, rank) pair, so keep exactly one row per
        # pair: the best-scoring one, or the first listed when tied ranks share a score.
        rec_df = rec_df.drop_duplicates(subset=["id_proposal", "rank"])
        rec_pivot = rec_df.pivot(index="id_proposal", columns="rank", values="dosen").reset_index()
        rec_pivot.columns.name = None
        # Name each column after its actual rank so a gap in the ranks cannot shift the labels.
        rec_pivot.columns = ["id_proposal"] + [f"rec_{int(rank)}" for rank in rec_pivot.columns[1:]]

        merged_df = pd.merge(rec_pivot, truth_df, on="id_proposal", how="left")

        existence = []
        recall_pos = {1: [], 2: [], 3: []}
        distances = []
        for _, row in merged_df.iterrows():
            true_authors = [row.get(f"author{i}") for i in (1, 2, 3)]
            # Missing recommendations (NaN / absent column) are normalised to None.
            pred_authors = [row.get(f"rec_{i}") for i in range(1, TOP_N + 1)]
            pred_authors = [p if pd.notna(p) else None for p in pred_authors]

            # Existence recall (order ignored). Missing values are removed from both sides so
            # that a missing label can never "match" a missing recommendation.
            true_set = {a for a in true_authors if pd.notna(a) and a != ""}
            pred_set = {p for p in pred_authors if p is not None and p != ""}
            existence.append(len(true_set & pred_set) / 3)

            # Ordered recall: rec_k must be exactly author_k.
            for pos in (1, 2, 3):
                examiner = true_authors[pos - 1]
                rec = pred_authors[pos - 1] if pos <= TOP_N else None
                recall_pos[pos].append(int(pd.notna(examiner) and rec is not None and examiner == rec))

            # Positional distance: squared offset between the true slot and the predicted slot,
            # with a penalty of TOP_N**2 for an examiner that was not recommended at all.
            distance = 0
            for i, true_author in enumerate(true_authors):
                if pd.isna(true_author) or true_author == "":
                    continue
                try:
                    distance += (pred_authors.index(true_author) - i) ** 2
                except ValueError:
                    distance += TOP_N ** 2
            distances.append(np.sqrt(distance))

        # Min-max scale to [0, 1] with the same arithmetic sklearn's MinMaxScaler uses
        # (x * scale - min * scale; a zero range is treated as 1 so all values become 0).
        dist_arr = np.asarray(distances, dtype=float)
        span = dist_arr.max() - dist_arr.min()
        scale = 1.0 / (span if span >= 10 * np.finfo(float).eps else 1.0)
        norm_dists = dist_arr * scale - dist_arr.min() * scale

        summary.append({
            'Top-N': TOP_N,
            'Mean_Recall_Existence': np.mean(existence),
            'Recall_Pos_1_Ordered': np.mean(recall_pos[1]),
            'Recall_Pos_2_Ordered': np.mean(recall_pos[2]),
            'Recall_Pos_3_Ordered': np.mean(recall_pos[3]),
            'Avg_Normalized_Euclidean': np.mean(norm_dists)
        })

    return pd.DataFrame(summary)

# Evaluate the directed ranking (overlap pairs only) at the default cut-offs.
result_df_directed = evaluate_ordered_recommendation_directed(
    rank_all_df=df_overlap_true,
    true_label_df=true_label_df,
)
# Alternative: evaluate the load-balanced ranking instead.
# result_df_directed = evaluate_ordered_recommendation_directed(df_ranked_filtered, true_label_df)
print(result_df_directed)


# result_df_directed.to_csv("/content/drive/MyDrive/Skripsi4/topik/directed/14_baru/hasil_directed.csv", index=False)

   Top-N  Mean_Recall_Existence  Recall_Pos_1_Ordered  Recall_Pos_2_Ordered  \
0      3               0.302817               0.34507              0.056338   
1      5               0.443662               0.34507              0.056338   
2      7               0.605634               0.34507              0.056338   
3     10               0.774648               0.34507              0.056338   

   Recall_Pos_3_Ordered  Avg_Normalized_Euclidean  
0              0.091549                  0.842481  
1              0.091549                  0.757989  
2              0.091549                  0.668262  
3              0.091549                  0.551565  


## Evaluasi per proposal Baru

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def evaluate_per_proposal_directed(rank_all_df, true_label_df, top_n=3):
    """Score the top-``top_n`` examiner recommendations of every proposal individually.

    Parameters
    ----------
    rank_all_df : pandas.DataFrame
        Must contain ``id_proposal``, ``rank``, ``skor_rata2`` and ``dosen``.
    true_label_df : pandas.DataFrame
        Must contain ``proposal_id`` (or ``id_proposal``) and ``author1`` .. ``author3``.
    top_n : int
        Only recommendations with ``rank <= top_n`` are considered.

    Returns
    -------
    pandas.DataFrame
        One row per proposal with ``id_proposal``, ``recall_of_existence@N`` (true examiners
        found anywhere in the top N, always divided by 3), ``recall_pos{1,2,3}_ordered@N``
        (1 when ``rec_k`` equals ``author_k``) and ``norm_euclidean@N`` (positional distance,
        min-max scaled over the evaluated proposals).
    """
    top_n_df = rank_all_df[rank_all_df["rank"] <= top_n]
    rec_df = top_n_df.sort_values(
        by=["id_proposal", "rank", "skor_rata2"], ascending=[True, True, False]
    )
    # pivot() raises on a repeated (id_proposal, rank) pair, so keep exactly one row per pair:
    # the best-scoring one, or the first listed when tied ranks share a score.
    rec_df = rec_df.drop_duplicates(subset=["id_proposal", "rank"])
    rec_pivot = rec_df.pivot(index="id_proposal", columns="rank", values="dosen").reset_index()
    rec_pivot.columns.name = None
    # Name each column after its actual rank so a gap in the ranks cannot shift the labels.
    rec_pivot.columns = ["id_proposal"] + [f"rec_{int(rank)}" for rank in rec_pivot.columns[1:]]

    merged_df = pd.merge(
        rec_pivot,
        true_label_df.rename(columns={"proposal_id": "id_proposal"}),
        on="id_proposal",
        how="left"
    )

    existence = []
    recall_pos = {1: [], 2: [], 3: []}
    distances = []
    for _, row in merged_df.iterrows():
        true_authors = [row.get(f"author{i}") for i in (1, 2, 3)]
        # Missing recommendations (NaN / absent column) are normalised to None.
        pred_authors = [row.get(f"rec_{i}") for i in range(1, top_n + 1)]
        pred_authors = [p if pd.notna(p) else None for p in pred_authors]

        # Existence recall (order ignored). Missing values are removed from both sides so that
        # a missing label can never "match" a missing recommendation.
        true_set = {a for a in true_authors if pd.notna(a) and a != ""}
        pred_set = {p for p in pred_authors if p is not None and p != ""}
        existence.append(len(true_set & pred_set) / 3)

        # Ordered recall: rec_k must be exactly author_k.
        for pos in (1, 2, 3):
            examiner = true_authors[pos - 1]
            rec = pred_authors[pos - 1] if pos <= top_n else None
            recall_pos[pos].append(int(pd.notna(examiner) and rec is not None and examiner == rec))

        # Positional distance: squared offset between the true slot and the predicted slot,
        # with a penalty of top_n**2 for an examiner that was not recommended at all.
        distance = 0
        for i, true_author in enumerate(true_authors):
            if pd.isna(true_author) or true_author == "":
                continue
            try:
                distance += (pred_authors.index(true_author) - i) ** 2
            except ValueError:
                distance += top_n ** 2
        distances.append(np.sqrt(distance))

    # Min-max scale to [0, 1] with the same arithmetic sklearn's MinMaxScaler uses
    # (x * scale - min * scale; a zero range is treated as 1 so all values become 0).
    dist_arr = np.asarray(distances, dtype=float)
    span = dist_arr.max() - dist_arr.min()
    scale = 1.0 / (span if span >= 10 * np.finfo(float).eps else 1.0)
    norm_dists = dist_arr * scale - dist_arr.min() * scale

    merged_df[f'recall_of_existence@{top_n}'] = existence
    for pos in (1, 2, 3):
        merged_df[f'recall_pos{pos}_ordered@{top_n}'] = recall_pos[pos]
    merged_df[f'norm_euclidean@{top_n}'] = norm_dists

    # Return only the evaluation columns.
    result_df = merged_df[["id_proposal",
                           f'recall_of_existence@{top_n}',
                           f'recall_pos1_ordered@{top_n}',
                           f'recall_pos2_ordered@{top_n}',
                           f'recall_pos3_ordered@{top_n}',
                           f'norm_euclidean@{top_n}']].copy()

    return result_df


# Per-proposal evaluation of the directed ranking at each cut-off (3, 5, 7, 10).
(
    eval_per_proposal_directed_3,
    eval_per_proposal_directed_5,
    eval_per_proposal_directed_7,
    eval_per_proposal_directed_10,
) = (
    evaluate_per_proposal_directed(df_overlap_true, true_label_df, top_n=cutoff)
    for cutoff in (3, 5, 7, 10)
)

# eval_per_proposal_directed_3.to_csv("/content/drive/MyDrive/Skripsi4/topik/directed/14_baru/hasil_eval_3_directed.csv", index=False)
# eval_per_proposal_directed_5.to_csv("/content/drive/MyDrive/Skripsi4/topik/directed/14_baru/hasil_eval_5_directed.csv", index=False)
# eval_per_proposal_directed_7.to_csv("/content/drive/MyDrive/Skripsi4/topik/directed/14_baru/hasil_eval_7_directed.csv", index=False)
# eval_per_proposal_directed_10.to_csv("/content/drive/MyDrive/Skripsi4/topik/directed/14_baru/hasil_eval_10_directed.csv", index=False)

In [None]:
# Show every row that shares its (proposal, rank) pair with another row, i.e. tied ranks.
dupes = df_overlap_true.loc[
    df_overlap_true.duplicated(subset=["id_proposal", "rank"], keep=False)
]
print(dupes)


     id_proposal mahasiswa            dosen id_dosen  TOD(M→D)  OD(D→M)  \
383          P12       S12  Ardhi Wijayanto      D16    0.0177   0.0451   
388          P12       S12    Ristu Saptono       D6    0.0224   0.0404   
715         P139      S139          Winarno      D11    0.0142   0.0387   
721         P139      S139          Wiranto       D2    0.0142   0.0387   
731          P14       S14          Winarno      D11    0.0140   0.0399   
737          P14       S14          Wiranto       D2    0.0140   0.0399   
843          P18       S18          Winarno      D11    0.0141   0.0382   
849          P18       S18          Wiranto       D2    0.0141   0.0382   
1243         P40       S40          Winarno      D11    0.0169   0.0392   
1249         P40       S40          Wiranto       D2    0.0169   0.0392   
1307         P44       S44          Winarno      D11    0.0173   0.0406   
1313         P44       S44          Wiranto       D2    0.0173   0.0406   
1499         P55       S5

# Cosine

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Stack the per-document topic vectors into 2-D arrays (one row per document).
expert_vectors = np.vstack(expert_df["topic_vector"].tolist())
proposal_vectors = np.vstack(proposal_df["topic_vector"].tolist())

# Cosine similarity: rows are lecturer publications, columns are proposals.
similarity_matrix = cosine_similarity(expert_vectors, proposal_vectors)

# Row labels: "<name> (<expert_id>, <research_id>)" for every lecturer publication.
expert_labels = pd.Series(
    [
        f"{name} ({expert_id}, {research_id})"
        for name, expert_id, research_id in zip(
            expert_df["name"], expert_df["expert_id"], expert_df["research_id"]
        )
    ],
    index=expert_df.index,
)

# Column labels: the proposal ids.
proposal_labels = proposal_df["proposal_id"].values

# Labelled similarity matrix.
similarity_matrix_df = pd.DataFrame(
    data=similarity_matrix,
    index=expert_labels,
    columns=proposal_labels,
)

# Persist the matrix to CSV.
similarity_matrix_df.to_csv("/content/drive/MyDrive/Skripsi4/dictionary/hasil_cosine_matriks_14.csv")


In [None]:
# A publication is only considered if it is at most this many years older than the proposal.
MAX_YEAR_GAP = 5

# Materialise the needed columns of both frames once as plain dicts. Calling
# expert_df.iterrows() inside the proposal loop would rebuild every lecturer row
# for every single proposal.
proposal_rows = proposal_df[["proposal_id", "student_id", "tahun"]].to_dict("records")
expert_rows = expert_df[
    ["expert_id", "research_id", "pub_year", "name", "author_position"]
].to_dict("records")

# One record per (proposal, lecturer publication) pair, proposal-major order.
pair_records = []
for i, mahasiswa in enumerate(proposal_rows):
    for j, dosen in enumerate(expert_rows):
        # similarity_matrix rows are lecturer publications, columns are proposals.
        # Author-position weighting is currently disabled, so the final score is the raw score.
        score = similarity_matrix[j, i]
        pair_records.append({
            "id_proposal": mahasiswa["proposal_id"],
            "mahasiswa": mahasiswa["student_id"],
            "id_dosen": dosen["expert_id"],
            "id_penelitian": dosen["research_id"],
            "tahun_proposal": mahasiswa["tahun"],
            "tahun_penelitian": dosen["pub_year"],
            "selisih_tahun": mahasiswa["tahun"] - dosen["pub_year"],
            "dosen": dosen["name"],
            "author_position": dosen["author_position"],
            "similarity_score": score,
            "similarity_score_akhir": score,
        })

similarity_cosine_df = pd.DataFrame(pair_records)

# Keep only publications that predate the proposal by at most MAX_YEAR_GAP years.
similarity_cosine_df = similarity_cosine_df[
    (similarity_cosine_df["tahun_proposal"] > similarity_cosine_df["tahun_penelitian"]) &
    (similarity_cosine_df["selisih_tahun"] <= MAX_YEAR_GAP)
].copy()


# print(similarity_cosine_df[["id_proposal","mahasiswa","id_penelitian", "id_dosen","selisih_tahun", "dosen", "similarity_score" ,"similarity_score_akhir",]])
# similarity_cosine_df.to_csv("/content/drive/MyDrive/Skripsi4/topik/cosine/14_baru/hasil_cosine_topik.csv", index=False)


# Pemeringkatan

In [None]:
# For every (proposal, lecturer) pair keep only the publication with the highest similarity.
best_row_labels = (
    similarity_cosine_df
    .groupby(["id_proposal", "id_dosen"])["similarity_score_akhir"]
    .idxmax()
)
similarity_cosine_df = similarity_cosine_df.loc[best_row_labels]

# Rank the lecturers within each proposal; ties are broken by row order (method="first").
scores_per_proposal = similarity_cosine_df.groupby("id_proposal")["similarity_score_akhir"]
similarity_cosine_df["rank"] = scores_per_proposal.rank(method="first", ascending=False).astype(int)

# Order by proposal and rank, then keep the 17 best lecturers per proposal.
similarity_cosine_df = (
    similarity_cosine_df
    .sort_values(["id_proposal", "rank"])
    .loc[lambda frame: frame["rank"] <= 17]
)


# print(similarity_cosine_df[["id_proposal", "mahasiswa", "id_dosen", "dosen", "similarity_score","similarity_score_akhir", "rank"]])
# similarity_cosine_df.to_csv("/content/drive/MyDrive/Skripsi4/topik/cosine/14_baru/hasil_rank_cosine_topik.csv", index=False)

In [None]:
from collections import defaultdict

# A lecturer may be the rank-1 recommendation for at most this many proposals.
MAX_RANK1_LOAD = 15
# Number of recommendation slots kept per proposal (rank 1 .. MAX_RANK).
MAX_RANK = 17

# 1. Per-lecturer counter of rank-1 assignments and the collected rank-1 rows.
rank1_count = defaultdict(int)
final_assignments = []

# Split the table by proposal once instead of re-filtering the whole frame for every proposal.
# sort=False keeps the proposals in order of first appearance (the order .unique() would give),
# which matters because the capacity-limited assignment below is greedy.
proposal_groups = list(similarity_cosine_df.groupby("id_proposal", sort=False))
rank1_dosen_by_proposal = {}

# 2. Rank 1: the most similar lecturer who still has capacity left.
for pid, group in proposal_groups:
    candidates = group.sort_values(by="similarity_score_akhir", ascending=False)

    chosen = None
    for _, row in candidates.iterrows():
        if rank1_count[row["id_dosen"]] < MAX_RANK1_LOAD:
            chosen = row
            break
    if chosen is None:
        # Every candidate is already at capacity: fall back to the top scorer anyway.
        chosen = candidates.iloc[0]

    dosen_id = chosen["id_dosen"]
    rank1_count[dosen_id] += 1
    row_data = chosen.to_dict()
    row_data["rank"] = 1
    row_data["beban"] = rank1_count[dosen_id]
    final_assignments.append(row_data)
    rank1_dosen_by_proposal[pid] = dosen_id

# 3. Rank-1 rows as a DataFrame.
rank1_df = pd.DataFrame(final_assignments)

# 4. Ranks 2..MAX_RANK: remaining lecturers by descending similarity, excluding the lecturer
#    that already holds rank 1 for the same proposal.
other_ranks = []

for pid, group in proposal_groups:
    used_dosen = [rank1_dosen_by_proposal[pid]]
    runners_up = group[~group["id_dosen"].isin(used_dosen)]
    runners_up = runners_up.sort_values(by="similarity_score_akhir", ascending=False).head(MAX_RANK - 1)

    for rank, (_, row) in enumerate(runners_up.iterrows(), start=2):
        row_data = row.to_dict()
        row_data["rank"] = rank
        # "beban" (load) only counts rank-1 assignments.
        row_data["beban"] = rank1_count[row_data["id_dosen"]]
        other_ranks.append(row_data)

# 5. Combine rank 1 with the remaining ranks.
rank_all_df = pd.concat([rank1_df, pd.DataFrame(other_ranks)], ignore_index=True)
rank_all_df = rank_all_df.sort_values(by=["id_proposal", "rank"])

# 6. Inspect / save the result.
# print(rank_all_df[["id_proposal", "mahasiswa", "id_dosen", "dosen", "similarity_score","similarity_score_akhir", "rank", "beban"]])
# rank_all_df.to_csv("/content/drive/MyDrive/Skripsi4/topik/cosine/14_baru/beban_cosine_topik.csv", index=False)

In [None]:
# Re-rank within each proposal: highest similarity first, ties resolved in favour of the
# smaller author_position; the rank is the 1-based position inside the proposal.
similarity_cosine_df_baru = (
    similarity_cosine_df
    .sort_values(
        by=["id_proposal", "similarity_score_akhir", "author_position"],
        ascending=[True, False, True],
    )
    .assign(rank=lambda frame: frame.groupby("id_proposal").cumcount() + 1)
)

# Save to CSV
# similarity_cosine_df_baru.to_csv(
#     "/content/drive/MyDrive/Skripsi4/topik/cosine/14_baru/beban_cosine_topik_sort.csv",
#     index=False
# )



# Evaluasi Cosine

In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import MinMaxScaler

# # Fungsi evaluasi rekomendasi dosen penguji

# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import MinMaxScaler

# # Fungsi evaluasi rekomendasi dosen penguji
# def evaluate_ordered_recommendation_cosine(rank_all_df, true_label_df, top_ns=[3, 5, 7, 10]):
#     summary = []

#     for TOP_N in top_ns:
#         # Ambil Top-N dosen per proposal
#         top_n_df = rank_all_df[rank_all_df["rank"] <= TOP_N]

#         # Ubah menjadi format pivot untuk gabung dengan true_label_df
#         rec_df = top_n_df.sort_values(by=["id_proposal", "rank", "similarity_score_akhir"], ascending=[True, True, False])
#         rec_df = rec_df.drop_duplicates(subset=["id_proposal", "rank"])
#         rec_pivot = rec_df.pivot(index="id_proposal", columns="rank", values="dosen").reset_index()
#         rec_pivot.columns.name = None
#         rec_pivot.columns = ["id_proposal"] + [f"rec_{i}" for i in range(1, len(rec_pivot.columns))]

#         # Gabungkan dengan label ground truth
#         merged_df = pd.merge(
#             rec_pivot,
#             true_label_df.rename(columns={"proposal_id": "id_proposal"}),
#             on="id_proposal",
#             how="left"
#         )

#         # Recall keberadaan (tanpa urutan)
#         def recall_of_existence(row):
#             true_set = {row.get("author1"), row.get("author2"), row.get("author3")}
#             pred_set = {row.get(f"rec_{i}") for i in range(1, TOP_N+1) if row.get(f"rec_{i}")}
#             return len(true_set.intersection(pred_set)) / 3

#         merged_df[f'recall_of_existence@{TOP_N}'] = merged_df.apply(recall_of_existence, axis=1)

#         # Recall posisi urut (ordered position recall)
#         recall_pos = {1: [], 2: [], 3: []}
#         for _, row in merged_df.iterrows():
#             gt = [row.get(f'author{i}') for i in range(1, 4)]
#             pred = [row.get(f'rec_{i}') for i in range(1, TOP_N + 1)]

#             # Posisi kemunculan tiap GT di pred
#             positions = {}
#             for i, g in enumerate(gt):
#                 try:
#                     positions[i + 1] = pred.index(g)
#                 except ValueError:
#                     positions[i + 1] = None

#             # Cek urutan valid (misalnya: pos(author1) < pos(author2) < pos(author3))
#             valid_order = True
#             for i in range(1, 3):
#                 if positions.get(i) is not None and positions.get(i + 1) is not None:
#                     if positions[i] >= positions[i + 1]:
#                         valid_order = False
#                         break

#             for pos in [1, 2, 3]:
#                 hit = int(positions.get(pos) is not None and valid_order)
#                 recall_pos[pos].append(hit)

#         for pos in [1, 2, 3]:
#             merged_df[f'recall_pos{pos}_ordered@{TOP_N}'] = recall_pos[pos]

#         recall_pos_mean = {
#             pos: np.mean(recall_pos[pos]) if recall_pos[pos] else None
#             for pos in [1, 2, 3]
#         }

#         # Euclidean distance antar posisi
#         distances = []
#         for _, row in merged_df.iterrows():
#             true_authors = [row.get(f'author{i}') for i in [1, 2, 3]]
#             pred_authors = [row.get(f'rec_{i}', None) for i in range(1, TOP_N + 1)]
#             distance = 0
#             max_penalty = TOP_N
#             for i, true_author in enumerate(true_authors):
#                 if pd.isna(true_author) or true_author == '':
#                     continue
#                 try:
#                     pred_pos = pred_authors.index(true_author)
#                     pos_diff = pred_pos - i
#                     distance += pos_diff ** 2
#                 except ValueError:
#                     distance += max_penalty ** 2
#             distances.append(np.sqrt(distance))

#         scaler = MinMaxScaler()
#         norm_dists = scaler.fit_transform(np.array(distances).reshape(-1, 1)).flatten()
#         merged_df[f'norm_euclidean@{TOP_N}'] = norm_dists
#         avg_dist = np.mean(norm_dists)
#         avg_recall_exist = merged_df[f'recall_of_existence@{TOP_N}'].mean()

#         summary.append({
#             'Top-N': TOP_N,
#             'Mean_Recall_Existence': avg_recall_exist,
#             'Recall_Pos_1_Ordered': recall_pos_mean[1],
#             'Recall_Pos_2_Ordered': recall_pos_mean[2],
#             'Recall_Pos_3_Ordered': recall_pos_mean[3],
#             'Avg_Normalized_Euclidean': avg_dist
#         })

#     return pd.DataFrame(summary)


# result_eval_cosine = evaluate_ordered_recommendation_cosine(similarity_cosine_df, true_label_df)
# print(result_eval_cosine)



coba

## Evaluasi Baru

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def evaluate_ordered_recommendation_cosine(rank_all_df, true_label_df, top_ns=(3, 5, 7, 10)):
    """Summarise how well cosine-ranked examiner recommendations match the true examiners.

    For every cut-off N in ``top_ns`` the recommendations with ``rank <= N`` are pivoted to one
    row per proposal (columns ``rec_<rank>``), left-joined with the ground truth and scored.

    Parameters
    ----------
    rank_all_df : pandas.DataFrame
        Must contain ``id_proposal``, ``rank``, ``similarity_score_akhir`` and ``dosen``.
    true_label_df : pandas.DataFrame
        Must contain ``proposal_id`` (or ``id_proposal``) and ``author1`` .. ``author3``.
    top_ns : iterable of int
        Cut-offs to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per cut-off with the columns ``Top-N``, ``Mean_Recall_Existence`` (true examiners
        found anywhere in the top N, always divided by 3), ``Recall_Pos_{1,2,3}_Ordered`` (share
        of proposals where ``rec_k`` equals ``author_k``) and ``Avg_Normalized_Euclidean``
        (positional distance, min-max scaled over the evaluated proposals).
    """
    summary = []
    # The ground-truth table is the same for every cut-off, so rename its key only once.
    truth_df = true_label_df.rename(columns={"proposal_id": "id_proposal"})

    for TOP_N in top_ns:
        top_n_df = rank_all_df[rank_all_df["rank"] <= TOP_N]
        rec_df = top_n_df.sort_values(
            by=["id_proposal", "rank", "similarity_score_akhir"], ascending=[True, True, False]
        )
        # pivot() raises on a repeated (id_proposal, rank) pair, so keep exactly one row per
        # pair: the best-scoring one.
        rec_df = rec_df.drop_duplicates(subset=["id_proposal", "rank"])
        rec_pivot = rec_df.pivot(index="id_proposal", columns="rank", values="dosen").reset_index()
        rec_pivot.columns.name = None
        # Name each column after its actual rank so a gap in the ranks cannot shift the labels.
        rec_pivot.columns = ["id_proposal"] + [f"rec_{int(rank)}" for rank in rec_pivot.columns[1:]]

        merged_df = pd.merge(rec_pivot, truth_df, on="id_proposal", how="left")

        existence = []
        recall_pos = {1: [], 2: [], 3: []}
        distances = []
        for _, row in merged_df.iterrows():
            true_authors = [row.get(f"author{i}") for i in (1, 2, 3)]
            # Missing recommendations (NaN / absent column) are normalised to None.
            pred_authors = [row.get(f"rec_{i}") for i in range(1, TOP_N + 1)]
            pred_authors = [p if pd.notna(p) else None for p in pred_authors]

            # Existence recall (order ignored). Missing values are removed from both sides so
            # that a missing label can never "match" a missing recommendation.
            true_set = {a for a in true_authors if pd.notna(a) and a != ""}
            pred_set = {p for p in pred_authors if p is not None and p != ""}
            existence.append(len(true_set & pred_set) / 3)

            # Ordered recall: rec_k must be exactly author_k.
            for pos in (1, 2, 3):
                examiner = true_authors[pos - 1]
                rec = pred_authors[pos - 1] if pos <= TOP_N else None
                recall_pos[pos].append(int(pd.notna(examiner) and rec is not None and examiner == rec))

            # Positional distance: squared offset between the true slot and the predicted slot,
            # with a penalty of TOP_N**2 for an examiner that was not recommended at all.
            distance = 0
            for i, true_author in enumerate(true_authors):
                if pd.isna(true_author) or true_author == "":
                    continue
                try:
                    distance += (pred_authors.index(true_author) - i) ** 2
                except ValueError:
                    distance += TOP_N ** 2
            distances.append(np.sqrt(distance))

        # Min-max scale to [0, 1] with the same arithmetic sklearn's MinMaxScaler uses
        # (x * scale - min * scale; a zero range is treated as 1 so all values become 0).
        dist_arr = np.asarray(distances, dtype=float)
        span = dist_arr.max() - dist_arr.min()
        scale = 1.0 / (span if span >= 10 * np.finfo(float).eps else 1.0)
        norm_dists = dist_arr * scale - dist_arr.min() * scale

        summary.append({
            'Top-N': TOP_N,
            'Mean_Recall_Existence': np.mean(existence),
            'Recall_Pos_1_Ordered': np.mean(recall_pos[1]),
            'Recall_Pos_2_Ordered': np.mean(recall_pos[2]),
            'Recall_Pos_3_Ordered': np.mean(recall_pos[3]),
            'Avg_Normalized_Euclidean': np.mean(norm_dists)
        })

    return pd.DataFrame(summary)

# Evaluate the cosine ranking at the default cut-offs.
result_df = evaluate_ordered_recommendation_cosine(
    rank_all_df=similarity_cosine_df,
    true_label_df=true_label_df,
)
print(result_df)
# result_df.to_csv("/content/drive/MyDrive/Skripsi4/topik/cosine/14_baru/result_df.csv", index=False)


   Top-N  Mean_Recall_Existence  Recall_Pos_1_Ordered  Recall_Pos_2_Ordered  \
0      3               0.345070              0.260563              0.105634   
1      5               0.460094              0.260563              0.105634   
2      7               0.593897              0.260563              0.105634   
3     10               0.753521              0.260563              0.105634   

   Recall_Pos_3_Ordered  Avg_Normalized_Euclidean  
0              0.077465                  0.807783  
1              0.077465                  0.734154  
2              0.077465                  0.659702  
3              0.077465                  0.554564  


## Evaluasi per Proposal Baru (Cosine)

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def evaluate_per_proposal_cosine(rank_all_df, true_label_df, top_n=3):
    """Score each proposal's top-`top_n` recommendations against its labelled authors.

    Parameters
    ----------
    rank_all_df : pandas.DataFrame
        Ranked recommendations. The columns read here are ``id_proposal``,
        ``rank``, ``dosen`` and ``similarity_score_akhir``.
    true_label_df : pandas.DataFrame
        Ground truth. The columns read here are ``proposal_id`` and
        ``author1`` .. ``author3``.
    top_n : int, default 3
        Only rows with ``rank <= top_n`` are evaluated.

    Returns
    -------
    pandas.DataFrame
        One row per proposal with the columns ``id_proposal``,
        ``recall_of_existence@{top_n}``, ``recall_pos{1,2,3}_ordered@{top_n}``
        and ``norm_euclidean@{top_n}``.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.pivot`` when an ``(id_proposal, rank)`` pair
        occurs more than once among the selected rows.

    Notes
    -----
    * A missing label or recommendation (``None``, NaN/NA or ``''``) never
      counts as a hit in any metric.
    * ``recall_of_existence`` is always divided by 3 (the number of label
      slots), so a proposal with fewer labelled authors cannot reach 1.0.
    * ``norm_euclidean`` is min-max rescaled over the proposals of this call,
      so its values are relative to the evaluated batch.
    """
    label_slots = (1, 2, 3)

    def is_missing(value):
        # One definition of "no value" shared by every metric below.
        if value is None:
            return True
        if isinstance(value, str):
            return value == ''
        return bool(pd.isna(value))

    # Filter and sort
    top_n_df = rank_all_df[rank_all_df["rank"] <= top_n]
    rec_df = top_n_df.sort_values(by=["id_proposal", "rank", "similarity_score_akhir"], ascending=[True, True, False])
    rec_pivot = rec_df.pivot(index="id_proposal", columns="rank", values="dosen").reset_index()
    rec_pivot.columns.name = None
    # Positional renaming: the k-th rank column becomes rec_k.
    # NOTE(review): this assumes the ranks present are 1..k without gaps - confirm upstream.
    rec_pivot.columns = ["id_proposal"] + [f"rec_{i}" for i in range(1, len(rec_pivot.columns))]

    # Join with the ground-truth labels
    merged_df = pd.merge(
        rec_pivot,
        true_label_df.rename(columns={"proposal_id": "id_proposal"}),
        on="id_proposal",
        how="left"
    )

    # Plain dict rows: one pass serves all metrics and also works on an empty frame.
    rows = merged_df.to_dict("records")

    # Recall of existence (order is ignored)
    def recall_of_existence(row):
        # Drop missing entries on BOTH sides: a NaN label must not be
        # "found" in a NaN recommendation slot.
        true_set = {row.get(f"author{i}") for i in label_slots
                    if not is_missing(row.get(f"author{i}"))}
        pred_set = {row.get(f"rec_{i}") for i in range(1, top_n + 1)
                    if not is_missing(row.get(f"rec_{i}"))}
        return len(true_set & pred_set) / 3

    # Positional recall (the author must sit at the very same rank)
    def position_hit(row, pos):
        if pos > top_n:
            return 0
        author = row.get(f'author{pos}')
        rec = row.get(f'rec_{pos}')
        return int(not is_missing(author) and not is_missing(rec) and author == rec)

    # Euclidean distance over position offsets
    def position_distance(row):
        pred_authors = [row.get(f'rec_{i}', None) for i in range(1, top_n + 1)]
        max_penalty = top_n
        total = 0
        for i, slot in enumerate(label_slots):
            true_author = row.get(f'author{slot}')
            if is_missing(true_author):
                continue
            try:
                pos_diff = pred_authors.index(true_author) - i
            except ValueError:
                # Author not recommended at all: charge the maximum offset.
                pos_diff = max_penalty
            total += pos_diff ** 2
        return np.sqrt(total)

    merged_df[f'recall_of_existence@{top_n}'] = [recall_of_existence(row) for row in rows]
    for pos in label_slots:
        merged_df[f'recall_pos{pos}_ordered@{top_n}'] = [position_hit(row, pos) for row in rows]

    # Min-max normalisation of the distances
    distances = np.asarray([position_distance(row) for row in rows], dtype=float)
    if distances.size:
        lowest = distances.min()
        spread = distances.max() - lowest
        # Identical distances carry no ranking information; map them all to 0.
        norm_dists = (distances - lowest) / spread if spread > 0 else np.zeros_like(distances)
    else:
        norm_dists = distances
    merged_df[f'norm_euclidean@{top_n}'] = norm_dists

    # Keep only the evaluation columns
    result_df = merged_df[["id_proposal",
                           f'recall_of_existence@{top_n}',
                           f'recall_pos1_ordered@{top_n}',
                           f'recall_pos2_ordered@{top_n}',
                           f'recall_pos3_ordered@{top_n}',
                           f'norm_euclidean@{top_n}']].copy()

    return result_df


# Per-proposal evaluation at each Top-N cut-off; the four result frames keep
# their individual names so later cells can still refer to them.
(
    eval_per_proposal_cosine_3,
    eval_per_proposal_cosine_5,
    eval_per_proposal_cosine_7,
    eval_per_proposal_cosine_10,
) = [
    evaluate_per_proposal_cosine(similarity_cosine_df, true_label_df, top_n=cutoff)
    for cutoff in (3, 5, 7, 10)
]

# eval_per_proposal_cosine_3.to_csv("/content/drive/MyDrive/Skripsi4/topik/cosine/14_baru/hasil_eval_3_cosine_topik.csv", index=False)
# eval_per_proposal_cosine_5.to_csv("/content/drive/MyDrive/Skripsi4/topik/cosine/14_baru/hasil_eval_5_cosine_topik.csv", index=False)
# eval_per_proposal_cosine_7.to_csv("/content/drive/MyDrive/Skripsi4/topik/cosine/14_baru/hasil_eval_7_cosine_topik.csv", index=False)
# eval_per_proposal_cosine_10.to_csv("/content/drive/MyDrive/Skripsi4/topik/cosine/14_baru/hasil_eval_10_cosine_topik.csv", index=False)