# **Prediksi Nilai Akademik Mahasiswa Menggunakan Algoritma K-Nearest Neighbors (KNN) Berdasarkan Pola Perilaku Belajar**

## Read the Data


In [None]:
import os
import gdown

# Working directory that will hold the downloaded dataset
base_dir = '/content/ai/'
os.makedirs(base_dir, exist_ok=True)

# Google Drive file ID of the dataset
ID = "17gsRCAMmPSHFrP29rn8_WTo1qTQVQc4-"
import_url = f'https://drive.google.com/uc?id={ID}'

# Download the CSV file
output_path = os.path.join(base_dir, 'student_performance_large_dataset.csv')
downloaded_path = gdown.download(import_url, output_path, quiet=False)

# Do not report success when nothing was saved: depending on the gdown
# version a failed download may return None instead of raising, so both the
# return value and the file on disk are checked.
if downloaded_path is None or not os.path.isfile(output_path):
    raise RuntimeError(f"Download failed for {import_url}")

print(f"File downloaded to: {output_path}")

In [None]:
import pandas as pd

# Load the previously downloaded CSV file into a DataFrame and display it
df = pd.read_csv(
    filepath_or_buffer='/content/ai/student_performance_large_dataset.csv'
)
df

## Explore the Data




### Descriptive Statistics

In [None]:
# Summary statistics, transposed so that each row is one column of df
df.describe().T

### Check unique values

In [None]:
import pandas as pd

# One record per column: the column name plus all of its distinct values
# joined into a single string so that they fit into one table cell
data = [
    {
        'Column': col,
        'Unique_Values': ', '.join(str(value) for value in df[col].unique()),
    }
    for col in df.columns
]

# Turn the records into a DataFrame and display it
unique_df = pd.DataFrame(data)
unique_df

## Cleaning Data

### Cek null

In [None]:
# Number of missing values in each column
df.isna().sum()

### Membersihkan Nilai Kategorikal yang Tidak Diinginkan

In [None]:
# Remove the rows whose Gender is 'Other'
print("Nilai unik di Gender sebelum:", df['Gender'].unique())
# .copy() detaches the filtered rows from the original frame, so later
# modifications do not act on a slice (avoids SettingWithCopyWarning).
df = df[df['Gender'] != 'Other'].copy()
print("Nilai unik di Gender sesudah:", df['Gender'].unique())

# Remove the 'Student_ID' column from df. errors='ignore' keeps the cell
# re-runnable: once the column is gone a second run is a no-op rather than
# a KeyError.
df = df.drop(columns='Student_ID', errors='ignore')


In [None]:
# Count how many times each distinct label occurs in the 'Final_Grade' column
df.loc[:, 'Final_Grade'].value_counts()

### Check Data Type

In [None]:
# Print pandas' summary of df (columns, non-null counts and dtypes)
df.info()

## Preprocess the Data

In [None]:
def preprocess(df):
    """Encode the categorical columns and min-max scale the numeric ones.

    The function works on a copy, so the frame passed in is left untouched.
    Any known column that is absent from ``df`` is skipped, for the
    categorical encodings as well as for the numeric scaling.

    Rows whose categorical value has no entry in the encoding tables become
    NaN during mapping and are dropped. This happens *before* scaling so that
    the min/max used for normalisation come only from rows that are kept.

    Args:
        df: DataFrame with the raw student records.

    Returns:
        A new DataFrame with integer-coded categorical columns and numeric
        columns rescaled to the [0, 1] range. Columns not listed below
        (for example ``Final_Grade``) are passed through unchanged.
    """
    df = df.copy()

    # --- Categorical encoding tables ---
    category_codes = {
        'Gender': {'Male': 1, 'Female': 0},
        'Participation_in_Discussions': {'Yes': 1, 'No': 0},
        'Use_of_Educational_Tech': {'Yes': 1, 'No': 0},
        'Self_Reported_Stress_Level': {'Low': 0, 'Medium': 1, 'High': 2},
        'Preferred_Learning_Style': {
            'Kinesthetic': 0,
            'Reading/Writing': 1,
            'Auditory': 2,
            'Visual': 3,
        },
    }
    encoded_cols = [col for col in category_codes if col in df.columns]
    for col in encoded_cols:
        df[col] = df[col].map(category_codes[col])

    # --- Drop rows left with NaN by the encoding (encoded columns only) ---
    if encoded_cols:
        df.dropna(subset=encoded_cols, inplace=True)

    # --- Min-max scaling of the numeric columns ---
    numerical_cols = [
        'Age', 'Study_Hours_per_Week', 'Online_Courses_Completed',
        'Assignment_Completion_Rate (%)', 'Exam_Score (%)',
        'Attendance_Rate (%)', 'Time_Spent_on_Social_Media (hours/week)',
        'Sleep_Hours_per_Night',
    ]
    for col in numerical_cols:
        if col not in df.columns:
            continue
        min_val = df[col].min()
        max_val = df[col].max()
        # A constant column has zero range; leave it as is instead of
        # dividing by zero.
        if max_val != min_val:
            df[col] = (df[col] - min_val) / (max_val - min_val)

    return df

In [None]:
# Apply the preprocessing, then show the first rows and the resulting shape
df_processed = preprocess(df)
print(df_processed.head(n=5))
print(df_processed.shape)


In [None]:
# Summary statistics of the preprocessed data, one row per column
df_processed.describe().T

### Correlation

In [None]:
# Pearson correlation needs numeric input, and preprocess() leaves
# 'Final_Grade' untouched. NOTE(review): this assumes the grades may be stored
# as labels (e.g. letters) - confirm against the dataset. When they are not
# numeric they are replaced, on a copy, by their category codes, which follow
# the sorted order of the labels.
corr_input = df_processed.copy()
if not pd.api.types.is_numeric_dtype(corr_input['Final_Grade']):
    corr_input['Final_Grade'] = corr_input['Final_Grade'].astype('category').cat.codes

# Restrict to numeric columns so a stray text column cannot break corr()
correlation = corr_input.select_dtypes(include='number').corr()
correlation['Final_Grade'].sort_values(ascending=False)

## Split the Data

In [None]:
from sklearn.model_selection import train_test_split

# Features (X): every preprocessed column except Final_Grade.
# df_processed is used instead of the raw df so that the model receives the
# encoded and scaled values; the raw frame still holds text categories, which
# cannot be used in a Euclidean distance.
X = df_processed.drop(columns=['Final_Grade'])

# Target (y)
y = df_processed['Final_Grade']

# Split the data: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Size training set:", X_train.shape)
print("Size test set:", X_test.shape)

In [None]:
# Display the training data produced by the split in the previous step
X_train

In [None]:
# Display the test data produced by the split in the previous step
X_test

## Pemodelan

### Algoritma KNN

✅ Langkah-langkah KNN Manual:
1. Hitung jarak antara data uji dan semua data latih (pakai Euclidean distance)

2. Ambil k tetangga terdekat

3. Voting mayoritas dari label tetangga

4. Gunakan label hasil voting sebagai prediksi untuk data uji tersebut

In [None]:
from collections import Counter
import numpy as np

# Fungsi menghitung Euclidean distance
def euclidean_distance(x1, x2):
    """Return the Euclidean distance between the points ``x1`` and ``x2``."""
    delta = x1 - x2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

# KNN Manual
def knn_predict(X_train, y_train, X_test, k=5):
    """Classify every row of ``X_test`` by majority vote of its k nearest rows.

    Inputs are converted to NumPy arrays first, so DataFrames, Series, lists
    and arrays are all accepted. Without the conversion, iterating a DataFrame
    yields column names and indexing a Series with ``[i]`` looks up an index
    label rather than a position.

    Args:
        X_train: training features, one row per sample.
        y_train: training labels, aligned by position with ``X_train``.
        X_test: samples to classify, one row per sample.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        NumPy array with one predicted label per row of ``X_test``.

    Raises:
        ValueError: if the training set is empty.
    """
    train_features = np.asarray(X_train, dtype=float)
    train_labels = np.asarray(y_train)
    test_features = np.asarray(X_test, dtype=float)

    # Treat 1-D input as a column of single-feature samples.
    if train_features.ndim == 1:
        train_features = train_features.reshape(-1, 1)
    if test_features.ndim == 1:
        test_features = test_features.reshape(-1, 1)

    if len(train_features) == 0:
        raise ValueError("X_train must contain at least one sample")

    predictions = []
    for test_point in test_features:
        # Euclidean distance from this test point to every training point
        distances = np.sqrt(np.sum((train_features - test_point) ** 2, axis=1))

        # Stable sort: equally distant neighbours keep their training order
        nearest = np.argsort(distances, kind="stable")[:k]

        # Majority vote among the labels of the k nearest neighbours
        votes = Counter(train_labels[nearest].tolist())
        predictions.append(votes.most_common(1)[0][0])

    return np.array(predictions)


### Train the Model

### Test the Model

## -------------------------------------

## Training Data KNN

In [None]:
# Melatih algoritma dengan data train
import numpy as np

class KNN:
    """K-nearest-neighbours classifier using Euclidean distance.

    Labels may be of any type NumPy can sort (integers, strings, ...): the
    vote is counted with ``np.unique`` and therefore is not limited to
    non-negative integer labels.
    """

    def __init__(self, k=3):
        """Store ``k``, the number of nearest neighbours used for voting.

        Raises:
            ValueError: if ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k

    def fit(self, X, y):
        """Store the training data.

        Args:
            X: training features as a 2-D array-like, one row per sample.
            y: training labels, one per row of ``X``.
        """
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def predict(self, X):
        """Predict a label for each row of ``X``.

        Args:
            X: 2-D array-like holding one or more samples.

        Returns:
            NumPy array with one predicted label per sample.
        """
        predictions = []

        # np.asarray guarantees row-wise iteration (a DataFrame would
        # otherwise yield its column names).
        for x in np.asarray(X):
            # 1. Euclidean distance from x to every training point
            distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))

            # 2. Indices of the k nearest neighbours
            k_indices = np.argsort(distances)[:self.k]

            # 3. Labels of those neighbours
            k_labels = self.y_train[k_indices]

            # 4. Majority vote. np.unique returns the labels sorted, so a tie
            #    goes to the smallest label, as with argmax over bincount.
            labels, counts = np.unique(k_labels, return_counts=True)
            predictions.append(labels[np.argmax(counts)])

        return np.array(predictions)

In [None]:
# Fit the classifier on the training split and predict that same split
knn = KNN(k=5)
knn.fit(X_train.to_numpy(), y_train.to_numpy())
y_pred_train = knn.predict(X_train.to_numpy())
print(y_pred_train)

## Testing Data

In [None]:
# Menguji Algoritma dengan Data Test

In [None]:
# Predict the labels of the test split with the fitted classifier
y_pred_test = knn.predict(X_test.to_numpy())

### Evaluasi Model (Confusion Matrix + Metrik)

In [None]:
import numpy as np
import pandas as pd

# ================================================================
# Step 1: train the model and create the predictions
# Requires the KNN class and X_train, y_train, X_test to exist already
# ================================================================

# Create and fit the custom KNN model
# (k=5 can be replaced by whichever k turns out to be best)
knn = KNN(k=5)
knn.fit(X_train.to_numpy(), y_train.to_numpy())

# Predictions for the training data
y_train_pred = knn.predict(X_train.to_numpy())

# Predictions for the test data
y_test_pred = knn.predict(X_test.to_numpy())


# ================================================================
# Langkah 2: Fungsi evaluasi manual
# ================================================================
def evaluate_manual(y_true, y_pred, label=""):
    """Print overall accuracy and one-vs-rest metrics for every class.

    Both inputs are converted to NumPy arrays so that the comparisons are
    element-wise; with plain lists ``y_true == y_pred`` would be a single
    boolean and every metric would silently be wrong.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, aligned by position with ``y_true``.
        label: name of the evaluated split, shown upper-cased in the header.

    Raises:
        ValueError: if ``y_true`` and ``y_pred`` differ in shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    print(f"\n📊 HASIL EVALUASI MANUAL PADA DATA {label.upper()}")
    print("========================================")

    # Overall accuracy (valid for multi-class problems); reported as 0 for
    # empty input, matching the zero fallback of the per-class metrics.
    overall_accuracy = np.mean(y_true == y_pred) if y_true.size else 0.0
    print(f"Akurasi Keseluruhan: {overall_accuracy:.4f}\n")

    # Metrics for each class that occurs in y_true
    classes = np.unique(y_true)
    for cls in classes:
        print(f"--- Metrik untuk Kelas: '{cls}' ---")

        # Treat the current class as 'positive' and all others as 'negative'
        TP = np.sum((y_pred == cls) & (y_true == cls))
        TN = np.sum((y_pred != cls) & (y_true != cls))
        FP = np.sum((y_pred == cls) & (y_true != cls))
        FN = np.sum((y_pred != cls) & (y_true == cls))

        # Each ratio falls back to 0 when its denominator is 0
        sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0
        specificity = TN / (TN + FP) if (TN + FP) > 0 else 0
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        f_score = (2 * precision * sensitivity) / (precision + sensitivity) if (precision + sensitivity) > 0 else 0

        print(f"  - Presisi      : {precision:.4f}")
        print(f"  - Sensitivitas : {sensitivity:.4f} (Recall)")
        print(f"  - Spesifisitas : {specificity:.4f}")
        print(f"  - F1-Score     : {f_score:.4f}\n")

# ================================================================
# Step 3: run the evaluation for the train and test splits
# ================================================================
# Convert the label Series to NumPy arrays so they are easy to operate on
y_train_np, y_test_np = np.array(y_train), np.array(y_test)

for truth, predicted, split_name in (
    (y_train_np, y_train_pred, "TRAIN"),
    (y_test_np, y_test_pred, "TEST"),
):
    evaluate_manual(truth, predicted, label=split_name)

### Hyperparameter Tuning

In [None]:
import matplotlib.pyplot as plt

# NOTE(review): every k is scored on the test split, so the k chosen from this
# plot has already seen the test data - consider a separate validation split.
k_range = list(range(1, 50))
errors = []

for k in k_range:
    model = KNN(k)
    model.fit(X_train.values, y_train.values)
    # Error rate = share of test samples whose prediction differs from the label
    errors.append(np.mean(model.predict(X_test.values) != y_test.values))

# Plot the error rate against k to find the best value of k
plt.plot(k_range, errors)
plt.xlabel("k")
plt.ylabel("Error Rate")
plt.title("Tuning Nilai K (KNN)")
plt.grid()
plt.show()

## Membuat Prediksi (Single Sample Test)

In [None]:
# Demonstrate the model on a single input row
# (the "Demonstration Single Data Testing" item)
first_row = X_test.iloc[0]
sample = np.array(first_row)
result = knn.predict([sample])
print("Prediksi:", result[0], "| Real:", y_test.iloc[0])

## GitHub

In [None]:
import requests
from base64 import b64encode
import os
from google.colab import drive

# 1. Mount Google Drive
drive.mount('/content/drive')

# --- Configuration ---
from google.colab import userdata
token = userdata.get('GITHUB_TOKEN')
if not token:
    raise RuntimeError("GITHUB_TOKEN is empty; add it to the Colab secrets")
username = "rahmahaisyah"
repo = "student-grade-predictions"
path = "prediction_KNN.ipynb"
branch = "main"
# Seconds to wait for GitHub before giving up; without a timeout a stalled
# connection would block the cell indefinitely.
request_timeout = 30

# 2. Location of the notebook file inside the mounted Google Drive
local_file = '/content/drive/My Drive/Colab Notebooks/prediction_KNN.ipynb'

if not os.path.isfile(local_file):
    raise FileNotFoundError(f"File tidak ditemukan: {local_file}")

# Read the file and base64-encode it, as the contents API expects
with open(local_file, "rb") as f:
    content = b64encode(f.read()).decode("utf-8")

# Check whether the file already exists on the target branch
url = f"https://api.github.com/repos/{username}/{repo}/contents/{path}"
headers = {
    "Authorization": f"Bearer {token}",
    "Accept": "application/vnd.github.v3+json",
}

# The lookup is pinned to the branch that is written below, so the sha always
# belongs to the file version that will be replaced.
response = requests.get(
    url, headers=headers, params={"ref": branch}, timeout=request_timeout
)
if response.status_code == 200:
    sha = response.json()["sha"]
elif response.status_code == 404:
    # The file does not exist yet: create it without a sha
    sha = None
else:
    # Authentication, permission or server errors must not be mistaken for
    # "file missing"; surface them instead of attempting the upload.
    response.raise_for_status()
    sha = None

# Build the commit payload
data = {
    "message": "Upload from Colab",
    "content": content,
    "branch": branch
}
if sha:
    data["sha"] = sha

res = requests.put(url, headers=headers, json=data, timeout=request_timeout)

if res.status_code in [200, 201]:
    print("✅ Berhasil diupload ke GitHub!")
else:
    # An error body is not guaranteed to be JSON; fall back to the raw text
    try:
        error_detail = res.json()
    except ValueError:
        error_detail = res.text
    print("❌ Gagal upload:", error_detail)