In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# 1. Read Data

In [3]:
mhs_data = pd.read_csv('../MHS.csv')

krs_data = pd.read_csv('../KRS.csv')

In [4]:
mhs_data.head()

Unnamed: 0,npm_mahasiswa,nama_mahasiswa,prodi_mahasiswa,angkatan_mahasiswa,ipk_mahasiswa,status_mahasiswa,pembimbing_tugas_akhir
0,1931173,Celine Te,Sistem Informasi,2019,3.87,Lulus,5120028.0
1,1931001,Rido Renando,Sistem Informasi,2019,3.72,Lulus,1170002.0
2,1931009,Richie Marlon,Sistem Informasi,2019,3.9,Lulus,5120028.0
3,1931083,Elvin Whang,Sistem Informasi,2019,3.68,Lulus,6200076.0
4,2031001,Robin Sunjaya,Sistem Informasi,2020,3.88,Aktif,9180074.0


In [5]:
krs_data.head()

Unnamed: 0,npm_mahasiswa,jenis_semester,tahun_semester,kode_kelas,kode_matkul,nama_matkul,sks_matakuliah,total_hadir,total_pertemuan,total_terlaksana,total_tidak_hadir,kode_nilai,kategori_matakuliah
0,1931001,ganjil,2019,1BUMA,SI00194,Agama,2,0,0,0,0,A,Agama
1,1931001,ganjil,2019,1SIMA,SI00153,Arsitektur dan Organisasi Komputer,3,0,0,0,0,A,Computer Hardware
2,1931001,ganjil,2019,1VUMD,SI00192,Bahasa Inggris I,2,0,0,0,0,A,Bahasa
3,1931001,ganjil,2019,1SIMA,SIL0085,Lab. Teknik Pemograman,1,0,0,0,0,A,Basic Programming
4,1931001,ganjil,2019,1SIMA,SI00151,Pengantar Sistem Informasi,3,0,0,0,0,B,Manajemen


# 2. Data PreProcessing

In [6]:
mhs_data = mhs_data[~mhs_data['status_mahasiswa'].isin(['Keluar', 'Drop-out (putus studi)'])]

merged_data = pd.merge(mhs_data, krs_data, on="npm_mahasiswa")
merged_data = merged_data[merged_data['total_terlaksana'] != 0]

merged_data.head()

Unnamed: 0,npm_mahasiswa,nama_mahasiswa,prodi_mahasiswa,angkatan_mahasiswa,ipk_mahasiswa,status_mahasiswa,pembimbing_tugas_akhir,jenis_semester,tahun_semester,kode_kelas,kode_matkul,nama_matkul,sks_matakuliah,total_hadir,total_pertemuan,total_terlaksana,total_tidak_hadir,kode_nilai,kategori_matakuliah
29,1931173,Celine Te,Sistem Informasi,2019,3.87,Lulus,5120028.0,genap,2020,2BAMB,SI00260,Bahasa Mandarin,3,14,14,14,0,B,Bahasa
30,1931173,Celine Te,Sistem Informasi,2019,3.87,Lulus,5120028.0,genap,2020,4DKVB,SI00268,Statistik Untuk Riset,3,14,14,14,0,A,Tugas Akhir
31,1931173,Celine Te,Sistem Informasi,2019,3.87,Lulus,5120028.0,genap,2020,4DKVB,SI00163,Teknologi Game,3,14,14,14,0,A,Game Making
32,1931173,Celine Te,Sistem Informasi,2019,3.87,Lulus,5120028.0,genap,2020,4DKVB,SI00233,Desain Kreatif,3,14,14,14,0,A,Design
33,1931173,Celine Te,Sistem Informasi,2019,3.87,Lulus,5120028.0,genap,2020,4DKVB,SI00208,Teknik Pemasaran Digital,3,14,14,14,0,A,Marketing


# 3. Feature Engineering

In [7]:
grade_map = {'A': 4, 'B': 3, 'C': 2, 'D': 1, 'E': 0}
data = merged_data
data['kode_nilai'] = merged_data['kode_nilai'].map(grade_map)

category_scores = data.groupby(['npm_mahasiswa', 'kategori_matakuliah']).agg(
    avg_ipk=('kode_nilai', 'mean')  # Average IPK
).reset_index()

# Add rank per kategori_matakuliah (ties get the same rank)
category_scores['rank'] = category_scores.groupby('kategori_matakuliah')['avg_ipk'].rank(ascending=False, method='min')

# Count the number of students sharing the same rank
category_scores['rank_density'] = category_scores.groupby(['kategori_matakuliah', 'rank'])['npm_mahasiswa'].transform('count')

# Verify
print(category_scores.head())

   npm_mahasiswa kategori_matakuliah  avg_ipk  rank  rank_density
0        1931001           Animation      4.0   1.0         337.0
1        1931001              Bahasa      4.0   1.0         218.0
2        1931001              Design      4.0   1.0         359.0
3        1931001              Ethics      4.0   1.0         408.0
4        1931001         Game Making      4.0   1.0         167.0


In [8]:
# Compute weights (inverse of rank density)
category_scores['weight'] = 1 / category_scores['rank_density']

# Normalize weights within each kategori_matakuliah
category_scores['normalized_weight'] = category_scores.groupby('kategori_matakuliah')['weight'].transform(lambda x: x / x.sum())

# Verify
print(category_scores.head())


   npm_mahasiswa kategori_matakuliah  avg_ipk  rank  rank_density    weight  \
0        1931001           Animation      4.0   1.0         337.0  0.002967   
1        1931001              Bahasa      4.0   1.0         218.0  0.004587   
2        1931001              Design      4.0   1.0         359.0  0.002786   
3        1931001              Ethics      4.0   1.0         408.0  0.002451   
4        1931001         Game Making      4.0   1.0         167.0  0.005988   

   normalized_weight  
0           0.000156  
1           0.000164  
2           0.000174  
3           0.000163  
4           0.000315  


In [9]:
student_category_weights = category_scores.groupby(['npm_mahasiswa', 'kategori_matakuliah']).agg(
    total_weight=('normalized_weight', 'sum')
).reset_index()

# Determine favorite category per student
student_favorites = student_category_weights.loc[
    student_category_weights.groupby('npm_mahasiswa')['total_weight'].idxmax()
]

# Verify
print(student_favorites)

       npm_mahasiswa kategori_matakuliah  total_weight
11           1931001             Startup      0.012821
20           1931002           Manajemen      0.002331
32           1931003           Manajemen      0.003497
50           1931005          Pariwisata      0.142857
60           1931006           Manajemen      0.002331
...              ...                 ...           ...
10993        2231207   Computer Hardware      0.001592
11016        2231208     Kewarganegaraan      0.031250
11023        2231210   Basic Programming      0.009524
11043        2231212          Basis Data      0.041667
11059        2231213          Basis Data      0.015625

[636 rows x 3 columns]


In [10]:
# Calculate the attendance percentage
merged_data['attendance_percentage'] = (merged_data['total_hadir'] / merged_data['total_pertemuan']) * 100

# Check the calculated attendance percentage
print(merged_data[['npm_mahasiswa', 'total_hadir', 'total_pertemuan', 'attendance_percentage']].head())



    npm_mahasiswa  total_hadir  total_pertemuan  attendance_percentage
29        1931173           14               14                  100.0
30        1931173           14               14                  100.0
31        1931173           14               14                  100.0
32        1931173           14               14                  100.0
33        1931173           14               14                  100.0


In [11]:
# Calculate the average IPK and attendance percentage by kategori_matakuliah and npm_mahasiswa
average_ipk_attendance = merged_data.groupby(['npm_mahasiswa', 'kategori_matakuliah'])[['kode_nilai', 'attendance_percentage']].mean().reset_index()

# Check the result
print(average_ipk_attendance.head())

   npm_mahasiswa kategori_matakuliah  kode_nilai  attendance_percentage
0        1931001           Animation         4.0                  100.0
1        1931001              Bahasa         4.0                  100.0
2        1931001              Design         4.0                  100.0
3        1931001              Ethics         4.0                  100.0
4        1931001         Game Making         4.0                  100.0


In [12]:
# Pivot the data to create separate columns for each kategori_matakuliah
ipk_pivot = average_ipk_attendance.pivot(index='npm_mahasiswa', columns='kategori_matakuliah', values='kode_nilai')
attendance_pivot = average_ipk_attendance.pivot(index='npm_mahasiswa', columns='kategori_matakuliah', values='attendance_percentage')

# Flatten the columns to avoid multi-index
ipk_pivot.columns = [f"ipk_{col}" for col in ipk_pivot.columns]
attendance_pivot.columns = [f"attendance_rate_{col}" for col in attendance_pivot.columns]

# Check the pivoted data
print(ipk_pivot.head())
print(attendance_pivot.head())

               ipk_Agama  ipk_Animation  ipk_Bahasa  ipk_Basic Programming  \
npm_mahasiswa                                                                
1931001              NaN            4.0         4.0                    NaN   
1931002              NaN            4.0         3.5                    NaN   
1931003              NaN            NaN         4.0                    3.5   
1931005              NaN            4.0         3.0                    NaN   
1931006              NaN            4.0         3.5                    NaN   

               ipk_Basis Data  ipk_Computer Hardware  ipk_Design  ipk_Ethics  \
npm_mahasiswa                                                                  
1931001                   NaN                    NaN         4.0    4.000000   
1931002                   NaN                    NaN         4.0    3.666667   
1931003                   NaN                    NaN         NaN    4.000000   
1931005                   NaN                    NaN 

In [13]:
# Merge the pivoted data into MHS dataframe
merged_mhs_data = pd.merge(mhs_data, ipk_pivot, left_on='npm_mahasiswa', right_index=True, how='left')
merged_mhs_data = pd.merge(merged_mhs_data, attendance_pivot, left_on='npm_mahasiswa', right_index=True, how='left')

# Check the final result
print(merged_mhs_data.head())


   npm_mahasiswa nama_mahasiswa   prodi_mahasiswa  angkatan_mahasiswa  \
0        1931173      Celine Te  Sistem Informasi                2019   
1        1931001   Rido Renando  Sistem Informasi                2019   
2        1931009  Richie Marlon  Sistem Informasi                2019   
3        1931083    Elvin Whang  Sistem Informasi                2019   
4        2031001  Robin Sunjaya  Sistem Informasi                2020   

   ipk_mahasiswa status_mahasiswa  pembimbing_tugas_akhir  ipk_Agama  \
0           3.87            Lulus               5120028.0        NaN   
1           3.72            Lulus               1170002.0        NaN   
2           3.90            Lulus               5120028.0        NaN   
3           3.68            Lulus               6200076.0        NaN   
4           3.88            Aktif               9180074.0        NaN   

   ipk_Animation  ipk_Bahasa  ...  attendance_rate_Manajemen   \
0            4.0         3.5  ...                  100.000000  

In [14]:
# Fill NaN values with 0 in the merged dataframe
merged_mhs_data = merged_mhs_data.fillna(0)

# Check the result
print(merged_mhs_data.head())


   npm_mahasiswa nama_mahasiswa   prodi_mahasiswa  angkatan_mahasiswa  \
0        1931173      Celine Te  Sistem Informasi                2019   
1        1931001   Rido Renando  Sistem Informasi                2019   
2        1931009  Richie Marlon  Sistem Informasi                2019   
3        1931083    Elvin Whang  Sistem Informasi                2019   
4        2031001  Robin Sunjaya  Sistem Informasi                2020   

   ipk_mahasiswa status_mahasiswa  pembimbing_tugas_akhir  ipk_Agama  \
0           3.87            Lulus               5120028.0        0.0   
1           3.72            Lulus               1170002.0        0.0   
2           3.90            Lulus               5120028.0        0.0   
3           3.68            Lulus               6200076.0        0.0   
4           3.88            Aktif               9180074.0        0.0   

   ipk_Animation  ipk_Bahasa  ...  attendance_rate_Manajemen   \
0            4.0         3.5  ...                  100.000000  

In [15]:
# Merge student favorites with the existing dataset (merged_data)
the_data = pd.merge(merged_mhs_data, student_favorites, on='npm_mahasiswa', how='left')

print(the_data.head())

   npm_mahasiswa nama_mahasiswa   prodi_mahasiswa  angkatan_mahasiswa  \
0        1931173      Celine Te  Sistem Informasi                2019   
1        1931001   Rido Renando  Sistem Informasi                2019   
2        1931009  Richie Marlon  Sistem Informasi                2019   
3        1931083    Elvin Whang  Sistem Informasi                2019   
4        2031001  Robin Sunjaya  Sistem Informasi                2020   

   ipk_mahasiswa status_mahasiswa  pembimbing_tugas_akhir  ipk_Agama  \
0           3.87            Lulus               5120028.0        0.0   
1           3.72            Lulus               1170002.0        0.0   
2           3.90            Lulus               5120028.0        0.0   
3           3.68            Lulus               6200076.0        0.0   
4           3.88            Aktif               9180074.0        0.0   

   ipk_Animation  ipk_Bahasa  ...  attendance_rate_Mobile Development  \
0            4.0         3.5  ...                      

# 4. Model

In [16]:
# Define the target variable y as kategori_matakuliah
y = the_data['kategori_matakuliah']

# Select all columns starting with 'ipk_' or 'attendance_rate_' for features
X = the_data.filter(like='ipk_').join(the_data.filter(like='attendance_rate_'))

# Check the shapes of X and y
X.head()

Unnamed: 0,ipk_mahasiswa,ipk_Agama,ipk_Animation,ipk_Bahasa,ipk_Basic Programming,ipk_Basis Data,ipk_Computer Hardware,ipk_Design,ipk_Ethics,ipk_Game Making,...,attendance_rate_Manajemen,attendance_rate_Marketing,attendance_rate_Mobile Development,attendance_rate_Modelling,attendance_rate_Movie Making,attendance_rate_Multimedia,attendance_rate_Pariwisata,attendance_rate_Startup,attendance_rate_Tugas Akhir,attendance_rate_Website Making
0,3.87,0.0,4.0,3.5,0.0,0.0,0.0,4.0,4.0,4.0,...,100.0,100.0,0.0,0.0,0.0,0.0,0.0,35.714286,95.238095,0.0
1,3.72,0.0,4.0,4.0,0.0,0.0,0.0,4.0,4.0,4.0,...,100.0,100.0,0.0,100.0,0.0,0.0,0.0,42.857143,95.238095,0.0
2,3.9,0.0,4.0,4.0,0.0,0.0,0.0,4.0,4.0,4.0,...,78.571429,100.0,0.0,100.0,0.0,0.0,0.0,35.714286,92.857143,0.0
3,3.68,0.0,4.0,3.5,0.0,0.0,0.0,4.0,3.666667,4.0,...,100.0,100.0,0.0,100.0,92.857143,0.0,0.0,28.571429,95.238095,0.0
4,3.88,0.0,4.0,3.5,4.0,4.0,0.0,3.75,4.0,4.0,...,100.0,100.0,0.0,100.0,82.142857,0.0,0.0,75.0,100.0,100.0


In [21]:
# Convert categorical target to numeric
# y = pd.Categorical(y).codes
# Define the target variable y as kategori_matakuliah

merged_data_cleaned = the_data.dropna(subset=['kategori_matakuliah'])

y = merged_data_cleaned['kategori_matakuliah']

# Select all columns starting with 'ipk_' or 'attendance_rate_' for features
X = merged_data_cleaned.filter(like='ipk_').join(merged_data_cleaned.filter(like='attendance_rate_'))
X = X.fillna(0)

# Scale the features
scaler = StandardScaler()


feature_names = X.columns
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=feature_names)

# print(X_scaled)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# model = xgb.XGBClassifier(eval_metric='mlogloss')

model = RandomForestClassifier(
    random_state=42,
    n_jobs=3,
    class_weight='balanced',
)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)


# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)
# 
# 
# model.fit(X_scaled, y_train)


# 5. Evaluation

In [22]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Classification report
report = classification_report(y_test, y_pred, zero_division=1)
print("Classification Report:")
print(report)

Accuracy: 0.5288
Classification Report:
                    precision    recall  f1-score   support

             Agama       0.50      0.50      0.50         2
         Animation       0.29      0.29      0.29         7
            Bahasa       0.00      0.00      0.00         3
 Basic Programming       0.75      0.50      0.60         6
        Basis Data       0.50      1.00      0.67         1
 Computer Hardware       0.55      1.00      0.71        11
            Design       0.25      0.50      0.33         4
            Ethics       0.62      0.50      0.56        10
       Game Making       0.64      0.69      0.67        13
          Hardware       1.00      0.00      0.00         4
             Hukum       0.60      0.46      0.52        13
          Jaringan       1.00      1.00      1.00         5
   Kewarganegaraan       0.50      0.20      0.29         5
  Logical Thinking       0.29      0.25      0.27         8
  Machine Learning       0.56      0.69      0.62        13

In [23]:
dump(model, 'course-preference-classification.joblib')

['course-preference-classification.joblib']

In [30]:
model = load('course-preference-classification.joblib')

def predict_favourite_course(npm, data):

    data_copy = data.copy()

    # Extract features for prediction
    features = data_copy.filter(like='ipk_').join(data_copy.filter(like='attendance_rate_'))
    features = features.fillna(0)  # Handle missing values

    # Get predictions for all students
    predictions = model.predict(features)

    # Add predictions back to the copied data
    data_copy['prediction'] = predictions

    # matched_rows = data_copy[data_copy['prediction'] == data_copy['kategori_matakuliah']]
    # 
    # return matched_rows

    # Rank students based on predictions
    data_copy['rank'] = data_copy['prediction'].rank(method='min', ascending=False)

    # Fetch the row corresponding to the given NPM
    row = data_copy[data_copy['npm_mahasiswa'] == npm]

    if row.empty:
        return f"Data {npm} tidak ditemukan."

    return row['prediction']

test = predict_favourite_course(2231152, merged_data_cleaned)

# test.head()
print(test)

576    Computer Hardware
Name: prediction, dtype: object
