In [53]:
import pandas as pd 
import psycopg2 as png
from getpass import getpass
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import pickle

In [54]:
myCon = png.connect(
    dbname='examdb',
    user='postgres',
    password='riska06',
    host='localhost',
)

In [55]:
engine = create_engine('postgresql://postgres:riska06@localhost:5432/examdb')

query = """
SELECT
    s.stu_id,                           -- ID pengguna
    s.name AS student_name,              -- Nama pengguna
    s.gender,                            -- Gender pengguna
    e.grade,                             -- Nilai terakhir pengguna
    it.interes_name,                     -- Minat pengguna
    ca.activity_name,                    -- Nama aktivitas yang diikuti
    ca.activity_start_date,              -- Tanggal mulai aktivitas
    ca.activity_end_date,                -- Tanggal selesai aktivitas
    c.course_name,                       -- Nama kursus
    c.course_id,                         -- ID kursus
    ca.type_id,                           -- ID jenis aktivitas, diambil dari course_activity (ca)
    at.type_name                         -- Jenis aktivitas (misalnya, kuis, tugas, dll.)
FROM 
    student s
JOIN 
    enrollment e ON s.stu_id = e.stu_id
JOIN 
    course c ON e.course_id = c.course_id
JOIN 
    course_activity ca ON c.course_id = ca.course_id
JOIN 
    activity_type at ON ca.type_id = at.type_id
JOIN 
    interes_type it ON ca.activity_id = it.activity_id  -- Menggunakan activity_id
WHERE
    e.grade IS NOT NULL                 -- Pastikan hanya pengguna dengan nilai yang ada
ORDER BY 
    s.stu_id, ca.activity_start_date;

"""


df = pd.read_sql(query,engine)
df


Unnamed: 0,stu_id,student_name,gender,grade,interes_name,activity_name,activity_start_date,activity_end_date,course_name,course_id,type_id,type_name
0,1,Tara Johnson,Female,80,Teamwork,Course 2 - Group,2025-04-19 13:31:10,2025-04-19 14:30:10,Course 2,2,4,Forum
1,1,Tara Johnson,Female,80,Empathy & Social,Course 2 - Group,2025-04-19 13:31:10,2025-04-19 14:30:10,Course 2,2,4,Forum
2,1,Tara Johnson,Female,79,Teamwork,Course 3 - Do,2025-04-20 04:23:10,2025-04-20 05:20:10,Course 3,3,4,Forum
3,1,Tara Johnson,Female,79,Problem Solving,Course 3 - Do,2025-04-20 04:23:10,2025-04-20 05:20:10,Course 3,3,4,Forum
4,1,Tara Johnson,Female,59,Teamwork,Course 1 - Summer,2025-04-22 23:25:54,2025-04-23 00:28:54,Course 1,1,3,Group Assignment
...,...,...,...,...,...,...,...,...,...,...,...,...
2695,100,Anna Reid,Female,62,Time Management,Course 3 - Week,2025-05-05 08:31:09,2025-05-05 09:48:09,Course 3,3,3,Group Assignment
2696,100,Anna Reid,Female,68,Leadership,Course 4 - Purpose,2025-05-05 17:42:06,2025-05-05 18:54:06,Course 4,4,2,Individual Assignment
2697,100,Anna Reid,Female,68,Purpose,Course 4 - Purpose,2025-05-05 17:42:06,2025-05-05 18:54:06,Course 4,4,2,Individual Assignment
2698,100,Anna Reid,Female,68,Problem Solving,Course 4 - Look,2025-05-09 09:00:39,2025-05-09 09:48:39,Course 4,4,4,Forum


In [56]:
myCon.close()

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2700 entries, 0 to 2699
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   stu_id               2700 non-null   int64         
 1   student_name         2700 non-null   object        
 2   gender               2700 non-null   object        
 3   grade                2700 non-null   int64         
 4   interes_name         2700 non-null   object        
 5   activity_name        2700 non-null   object        
 6   activity_start_date  2700 non-null   datetime64[ns]
 7   activity_end_date    2700 non-null   datetime64[ns]
 8   course_name          2700 non-null   object        
 9   course_id            2700 non-null   int64         
 10  type_id              2700 non-null   int64         
 11  type_name            2700 non-null   object        
dtypes: datetime64[ns](2), int64(4), object(6)
memory usage: 253.2+ KB


In [58]:
print(df['gender'].isnull().sum())  # Cek jumlah nilai NaN di kolom gender
print(df['gender'].unique())  # Cek nilai unik pada kolom gender

0
['Female' 'Male']


In [59]:
# Mengonversi data kategorikal menjadi numerik
df['gender'] = df['gender'].map({'Male': 0, 'Female': 1})
df['interes_name'] = df['interes_name'].astype('category').cat.codes  # Label encode minat
df['type_name'] = df['type_name'].astype('category').cat.codes  # Label encode jenis aktivitas


In [60]:
# Menghitung 'days_since_activity' (berapa hari sejak aktivitas dimulai)
df['days_since_activity'] = (pd.to_datetime('now') - pd.to_datetime(df['activity_start_date'])).dt.days


In [61]:
# Pilih fitur yang relevan untuk model
X = df[['gender', 'grade', 'interes_name', 'type_name', 'days_since_activity']]
y = df['course_name']  # Menggunakan nilai (grade) sebagai target


In [62]:
# Membagi data menjadi training dan testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Cek hasil preprocessing
print(X.head())

   gender  grade  interes_name  type_name  days_since_activity
0       1     80             6          0                   36
1       1     80             1          0                   36
2       1     79             6          0                   35
3       1     79             3          0                   35
4       1     59             6          1                   33


In [63]:
# Latih model Random Forest
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

# Prediksi menggunakan model yang dilatih
y_pred_rf = model_rf.predict(X_test)

# Latih model Gradient Boosting
model_gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
model_gb.fit(X_train, y_train)

# Prediksi menggunakan model yang dilatih
y_pred_gb = model_gb.predict(X_test)

# Prediksi menggunakan model yang dilatih
y_pred_rf = model_rf.predict(X_test)

# Latih model Gradient Boosting
model_gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
model_gb.fit(X_train, y_train)

# Prediksi menggunakan model yang dilatih
y_pred_gb = model_gb.predict(X_test)

In [64]:
from sklearn.metrics import classification_report

# Laporan klasifikasi untuk Random Forest
print("Classification Report untuk Random Forest:")
print(classification_report(y_test, y_pred_rf))




Classification Report untuk Random Forest:
              precision    recall  f1-score   support

    Course 1       1.00      1.00      1.00        80
    Course 2       1.00      1.00      1.00       109
    Course 3       1.00      1.00      1.00       106
    Course 4       1.00      1.00      1.00       124
    Course 5       1.00      1.00      1.00       121

    accuracy                           1.00       540
   macro avg       1.00      1.00      1.00       540
weighted avg       1.00      1.00      1.00       540



In [65]:
# Laporan klasifikasi untuk Gradient Boosting
print("Classification Report untuk Gradient Boosting:")
print(classification_report(y_test, y_pred_gb))

Classification Report untuk Gradient Boosting:
              precision    recall  f1-score   support

    Course 1       1.00      1.00      1.00        80
    Course 2       1.00      1.00      1.00       109
    Course 3       1.00      1.00      1.00       106
    Course 4       1.00      1.00      1.00       124
    Course 5       1.00      1.00      1.00       121

    accuracy                           1.00       540
   macro avg       1.00      1.00      1.00       540
weighted avg       1.00      1.00      1.00       540



In [66]:
from sklearn.metrics import matthews_corrcoef

# Hitung MCC untuk model Random Forest
mcc_rf = matthews_corrcoef(y_test, y_pred_rf)

# Hitung MCC untuk model Gradient Boosting
mcc_gb = matthews_corrcoef(y_test, y_pred_gb)

# Tampilkan hasilnya
print(f"MCC untuk Random Forest: {mcc_rf}")
print(f"MCC untuk Gradient Boosting: {mcc_gb}")


MCC untuk Random Forest: 1.0
MCC untuk Gradient Boosting: 1.0
