In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# 1. Read Data

In [11]:
mhs_data = pd.read_csv('../MHS.csv')
krs_data = pd.read_csv('../KRS.csv')

In [12]:
mhs_data.head()

Unnamed: 0,npm_mahasiswa,nama_mahasiswa,prodi_mahasiswa,angkatan_mahasiswa,ipk_mahasiswa,status_mahasiswa,pembimbing_tugas_akhir
0,1931173,Celine Te,Sistem Informasi,2019,3.87,Lulus,5120028.0
1,1931001,Rido Renando,Sistem Informasi,2019,3.72,Lulus,1170002.0
2,1931009,Richie Marlon,Sistem Informasi,2019,3.9,Lulus,5120028.0
3,1931083,Elvin Whang,Sistem Informasi,2019,3.68,Lulus,6200076.0
4,2031001,Robin Sunjaya,Sistem Informasi,2020,3.88,Aktif,9180074.0


In [13]:
krs_data.head()

Unnamed: 0,npm_mahasiswa,jenis_semester,tahun_semester,kode_kelas,kode_matkul,nama_matkul,sks_matakuliah,total_hadir,total_pertemuan,total_terlaksana,total_tidak_hadir,kode_nilai,kategori_matakuliah
0,1931001,ganjil,2019,1BUMA,SI00194,Agama,2,0,0,0,0,A,Agama
1,1931001,ganjil,2019,1SIMA,SI00153,Arsitektur dan Organisasi Komputer,3,0,0,0,0,A,Computer Hardware
2,1931001,ganjil,2019,1VUMD,SI00192,Bahasa Inggris I,2,0,0,0,0,A,Bahasa
3,1931001,ganjil,2019,1SIMA,SIL0085,Lab. Teknik Pemograman,1,0,0,0,0,A,Basic Programming
4,1931001,ganjil,2019,1SIMA,SI00151,Pengantar Sistem Informasi,3,0,0,0,0,B,Manajemen


# 2. Data PreProcessing

In [14]:
merged_data = pd.merge(mhs_data, krs_data, on="npm_mahasiswa")
merged_data = merged_data[merged_data['total_terlaksana'] != 0]

merged_data.head()

Unnamed: 0,npm_mahasiswa,nama_mahasiswa,prodi_mahasiswa,angkatan_mahasiswa,ipk_mahasiswa,status_mahasiswa,pembimbing_tugas_akhir,jenis_semester,tahun_semester,kode_kelas,kode_matkul,nama_matkul,sks_matakuliah,total_hadir,total_pertemuan,total_terlaksana,total_tidak_hadir,kode_nilai,kategori_matakuliah
29,1931173,Celine Te,Sistem Informasi,2019,3.87,Lulus,5120028.0,genap,2020,2BAMB,SI00260,Bahasa Mandarin,3,14,14,14,0,B,Bahasa
30,1931173,Celine Te,Sistem Informasi,2019,3.87,Lulus,5120028.0,genap,2020,4DKVB,SI00268,Statistik Untuk Riset,3,14,14,14,0,A,Tugas AKhir
31,1931173,Celine Te,Sistem Informasi,2019,3.87,Lulus,5120028.0,genap,2020,4DKVB,SI00163,Teknologi Game,3,14,14,14,0,A,Game Making
32,1931173,Celine Te,Sistem Informasi,2019,3.87,Lulus,5120028.0,genap,2020,4DKVB,SI00233,Desain Kreatif,3,14,14,14,0,A,Design
33,1931173,Celine Te,Sistem Informasi,2019,3.87,Lulus,5120028.0,genap,2020,4DKVB,SI00208,Teknik Pemasaran Digital,3,14,14,14,0,A,Marketing


# 3. Feature Engineering

In [15]:
merged_data['kode_nilai'] = pd.Categorical(merged_data['kode_nilai']).codes

X = merged_data[['sks_matakuliah', 'total_hadir', 'total_terlaksana', 'kode_nilai']]
y = merged_data['kategori_matakuliah']

# 4. Model

In [16]:
# Convert categorical target to numeric
y = pd.Categorical(y).codes

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(X_scaled)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# model = xgb.XGBClassifier(eval_metric='mlogloss')

# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [3, 4, 5, 6],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'subsample': [0.8, 1.0],
#     'colsample_bytree': [0.8, 1.0]
# }

# # Create a GridSearchCV object
# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
#                         scoring='accuracy', cv=3, verbose=1, n_jobs=-1)

# # Fit the grid search to the data
# grid_search.fit(X_train, y_train)

# # Print the best parameters and best score
# print(f"Best parameters: {grid_search.best_params_}")
# print(f"Best cross-validation accuracy: {grid_search.best_score_}")

# # Use the best estimator to predict
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_test)

model = RandomForestClassifier(random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 3, 4, 5, 6],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
                        scoring='accuracy', cv=3, verbose=1, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_}")

# Use the best estimator to predict
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)


# # Train the model
# model.fit(X_train, y_train)

# # Predict on the test set
# y_pred = model.predict(X_test)


[[ 0.50328813  0.55993311  0.43227493  0.81952125]
 [ 0.50328813  0.55993311  0.43227493 -0.36302872]
 [ 0.50328813  0.55993311  0.43227493 -0.36302872]
 ...
 [ 0.50328813  0.55993311  0.43227493 -1.54557868]
 [-2.5515695  -2.16819807 -2.59860948  3.18462118]
 [ 0.50328813  0.55993311  0.43227493  2.00207121]]
Fitting 3 folds for each of 405 candidates, totalling 1215 fits
Best parameters: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Best cross-validation accuracy: 0.26448462979879545


# 5. Evaluation

In [17]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Accuracy: 0.2627
Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.02      0.03        66
           1       0.37      0.14      0.20       190
           2       0.57      0.64      0.60       400
           3       0.13      0.72      0.22       394
           4       0.17      0.08      0.11       186
           5       0.60      0.16      0.25        56
           6       0.51      0.24      0.33       256
           7       0.10      0.05      0.07       267
           8       0.37      0.32      0.34       227
           9       0.00      0.00      0.00       121
          10       0.56      0.20      0.29        75
          11       0.00      0.00      0.00         4
          12       0.78      0.48      0.59       189
          13       0.37      0.41      0.39       323
          14       0.60      0.11      0.18       140
          15       0.69      0.16      0.25        58
          16       0.09      0.12      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
