In [1]:
# Attach the user's Google Drive to the Colab filesystem so that files
# stored there can be opened by path from the cells below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

print("✅ Google Drive mounted!")

Mounted at /content/drive
✅ Google Drive mounted!


In [2]:
# Import libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import warnings
# Install a blanket "ignore" filter: no category or module is specified, so
# every warning raised after this point is suppressed.
warnings.filterwarnings(action='ignore')

# Confirmation that the import cell ran to completion.
print("✅ Libraries imported!")

✅ Libraries imported!


In [3]:
# Read the semicolon-delimited dataset from the mounted Drive folder.
DATASET_PATH = '/content/drive/MyDrive/project_data_mining/dataset.csv'
df = pd.read_csv(DATASET_PATH, sep=';')

# Report (rows, columns); the trailing expression renders a preview of the
# first rows as the cell output.
print(f"Dataset shape: {df.shape}")
df.head()

Dataset shape: (3000, 11)


Unnamed: 0,Gender,Umur,Jurusan/Program Studi,Jam Belajar per Hari,Jam Tidur per Hari,IPK,Jumlah Tugas Besar per Minggu,Frekuensi Olahraga,Pemasukan Keluarga,Status Hubungan,Label
0,Laki-laki,20,Teknik Informatika,6,8,3.13,1,Jarang,Sedang,Dalam hubungan,Sehat
1,Perempuan,18,Hukum,5,6,188194444.0,1,Jarang,Rendah,Jomblo,Sehat
2,Perempuan,21,Desain Komunikasi Visual,6,4,127777778.0,1,Kadang,Sedang,Dalam hubungan,Risiko Stres
3,Laki-laki,24,Kedokteran,3,3,2.44,0,Sering,Sedang,Dalam hubungan,Sehat
4,Laki-laki,18,Hukum,4,4,3.35,5,Sering,Tinggi,Jomblo,Sehat


In [5]:
# Clean IPK function
def clean_ipk(value):
    """Convert one raw IPK entry to a float.

    Parameters
    ----------
    value : any scalar
        Raw cell content. Strings may use a decimal comma ("3,13") or a
        decimal point and may carry surrounding whitespace.

    Returns
    -------
    float
        * ``np.nan`` when ``value`` is missing or cannot be parsed as a number.
        * ``min(x * 20, 4.0)`` when the parsed number ``x`` is below 1.
        * The parsed number unchanged otherwise.

    Notes
    -----
    NOTE(review): the x20 rescale and the 4.0 cap presumably repair entries
    that were stored on a different scale than the rest of the column --
    confirm against the data source. Parsed values of 1 or more are returned
    as-is, so they are *not* capped at 4.0.
    """
    if pd.isna(value):
        return np.nan

    # Normalise the textual form: trim whitespace, accept decimal commas.
    text = str(value).strip().replace(',', '.')

    try:
        ipk = float(text)
    except ValueError:
        # float() raises ValueError for non-numeric text. Catching only that
        # (instead of a bare ``except``) lets KeyboardInterrupt and genuine
        # programming errors surface rather than silently becoming NaN.
        return np.nan

    if ipk < 1:
        # Rescale sub-1 values and cap the result at 4.0.
        ipk = min(ipk * 20, 4.0)
    return ipk

# Run the cleaner over every entry of the IPK column and write the result
# back into the frame.
ipk_cleaned = df['IPK'].apply(clean_ipk)
df['IPK'] = ipk_cleaned

# Summary statistics of the cleaned column as a quick sanity check.
print("✅ IPK cleaned!")
print(ipk_cleaned.describe())

✅ IPK cleaned!
count    3000.000000
mean        2.917874
std         0.545863
min         2.000000
25%         2.450000
50%         3.013889
75%         3.402778
max         4.000000
Name: IPK, dtype: float64


In [6]:
# Columns that must hold numbers before any arithmetic is applied to them.
numeric_features = [
    'Jam Belajar per Hari',
    'Jam Tidur per Hari',
    'IPK',
    'Jumlah Tugas Besar per Minggu',
]

# Coerce each listed column to a numeric dtype; entries that cannot be
# parsed become NaN (errors='coerce') instead of raising.
coerced = df[numeric_features].apply(lambda column: pd.to_numeric(column, errors='coerce'))
df[numeric_features] = coerced

print("✅ Numeric columns converted!")

✅ Numeric columns converted!


In [7]:
# Z-score standardisation of the numeric columns: subtract the column mean
# and divide by the column standard deviation, overwriting df in place.
# NOTE(review): the mean and std are taken over every row of df, i.e. before
# the train/test split performed in a later cell.
numeric_block = df[numeric_features]
column_means = numeric_block.mean()
column_stds = numeric_block.std()
df[numeric_features] = (numeric_block - column_means) / column_stds

print("✅ Features normalized!")
print("\nNormalized data sample:")
print(df[numeric_features].head())

✅ Features normalized!

Normalized data sample:
   Jam Belajar per Hari  Jam Tidur per Hari       IPK  \
0              1.020927            0.993878  0.388607   
1              0.528598           -0.014282  1.549867   
2              1.020927           -1.022442 -0.663753   
3             -0.456061           -1.526523 -0.875446   
4              0.036268           -1.022442  0.791638   

   Jumlah Tugas Besar per Minggu  
0                      -0.861370  
1                      -0.861370  
2                      -0.861370  
3                      -1.445746  
4                       1.476133  


In [8]:
# Column groups, by how the preprocessor should treat them.
numeric_features_all = [
    'Umur',
    'Jam Belajar per Hari',
    'Jam Tidur per Hari',
    'IPK',
    'Jumlah Tugas Besar per Minggu',
]
categorical_features = [
    'Gender',
    'Jurusan/Program Studi',
    'Frekuensi Olahraga',
    'Pemasukan Keluarga',
    'Status Hubungan',
]

# Separate the target column from the predictors.
y = df['Label']
X = df.drop(columns='Label')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Train set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

Train set: (2400, 10)
Test set: (600, 10)


In [9]:
# Create preprocessor
# Numeric columns are forwarded unchanged; categorical columns are one-hot
# encoded. handle_unknown='ignore' makes a category that was not seen during
# fit encode as all zeros instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features_all),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    # Only the columns declared above may reach the classifier. With
    # remainder='passthrough', any additional column present in X (an
    # exported index, an ID, a free-text field) would be handed to the model
    # unencoded. X currently contains exactly the declared columns, so this
    # does not change the fitted model.
    remainder='drop'
)

print("✅ Preprocessor created!")

✅ Preprocessor created!


In [10]:
# Classifier: a random forest of 200 trees, each limited to depth 4, with a
# fixed seed and n_jobs=-1 for parallel fitting.
forest = RandomForestClassifier(
    n_estimators=200,
    max_depth=4,
    random_state=42,
    n_jobs=-1,
)

# Chain preprocessing and classifier so both are fitted and applied together.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', forest),
])

# Fit on the training split.
print("Training model...")
model.fit(X_train, y_train)
print("✅ Model trained!")

Training model...
✅ Model trained!


In [11]:
# Predict on the held-out split and derive the evaluation metrics from it.
y_pred = model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)

# Both scores are fractions in [0, 1]; show them as percentages.
print(
    f"Accuracy: {accuracy*100:.2f}%",
    f"F1-Score: {f1*100:.2f}%",
    sep="\n",
)

Accuracy: 93.33%
F1-Score: 93.20%


In [12]:
# Per-class precision / recall / F1 table, under a title and an
# 80-character rule.
report_text = classification_report(y_test, y_pred)
print("CLASSIFICATION REPORT", "=" * 80, report_text, sep="\n")

CLASSIFICATION REPORT
              precision    recall  f1-score   support

Risiko Stres       0.99      0.83      0.91       230
       Sehat       0.91      0.99      0.95       370

    accuracy                           0.93       600
   macro avg       0.95      0.91      0.93       600
weighted avg       0.94      0.93      0.93       600



In [13]:
# Confusion matrix rendered as an annotated heatmap.
# NOTE(review): the tick labels are hard-coded; this assumes their order
# matches the row/column order of `cm` -- verify if the classes change.
labels = ['Risiko Stres', 'Sehat']

axis_titles = dict(x="Prediksi", y="Aktual", color="Jumlah")
fig = px.imshow(
    cm,
    labels=axis_titles,
    x=labels,
    y=labels,
    color_continuous_scale='Purples',
    text_auto=True,  # write each cell's count inside the cell
)

# Transparent paper and plot backgrounds, slightly larger text.
transparent = 'rgba(0,0,0,0)'
fig.update_layout(
    paper_bgcolor=transparent,
    plot_bgcolor=transparent,
    font=dict(size=14),
)
fig.show()

In [14]:
# Accuracy shown on a 0-100 gauge with red / yellow / green bands.
accuracy_pct = accuracy * 100

gauge_style = {
    'axis': {'range': [None, 100], 'tickwidth': 1},
    'bar': {'color': "#667eea"},
    'bgcolor': "white",
    'borderwidth': 2,
    'bordercolor': "gray",
    # Coloured background bands for the 0-50, 50-75 and 75-100 ranges.
    'steps': [
        {'range': [0, 50], 'color': '#f45c43'},
        {'range': [50, 75], 'color': '#ffd93d'},
        {'range': [75, 100], 'color': '#38ef7d'},
    ],
    # Black marker line drawn at the accuracy value itself.
    'threshold': {
        'line': {'color': "black", 'width': 4},
        'thickness': 0.75,
        'value': accuracy_pct,
    },
}

indicator = go.Indicator(
    mode="gauge+number",
    value=accuracy_pct,
    domain={'x': [0, 1], 'y': [0, 1]},
    title={'text': "Akurasi (%)", 'font': {'size': 24}},
    gauge=gauge_style,
)

fig = go.Figure(indicator)
# NOTE(review): white text on a transparent background -- presumably meant
# for a dark theme; confirm it is readable where this is displayed.
fig.update_layout(
    paper_bgcolor='rgba(0,0,0,0)',
    font={'color': "white", 'family': "Arial"},
)
fig.show()

print(f"F1-Score (Weighted): {f1*100:.1f}%")

F1-Score (Weighted): 93.2%


In [15]:
# Collect the per-column mean and standard deviation of the numeric features
# from a fresh copy of the dataset, since `df` was standardised in place by
# an earlier cell and no longer holds the original values.
df_raw = pd.read_csv('/content/drive/MyDrive/project_data_mining/dataset.csv', sep=';')  # load the raw data again

# Repeat the IPK cleaning and numeric coercion applied to `df` earlier.
df_raw['IPK'] = df_raw['IPK'].apply(clean_ipk)
numeric_cols = [
    'Jam Belajar per Hari',
    'Jam Tidur per Hari',
    'IPK',
    'Jumlah Tugas Besar per Minggu',
]
df_raw[numeric_cols] = df_raw[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Plain nested dict: {'mean': {column: value}, 'std': {column: value}}.
raw_numeric = df_raw[numeric_cols]
stats = dict(
    mean=raw_numeric.mean().to_dict(),
    std=raw_numeric.std().to_dict(),
)

print("Preprocessing Stats:")
print(stats)

Preprocessing Stats:
{'mean': {'Jam Belajar per Hari': 3.9263333333333335, 'Jam Tidur per Hari': 6.028333333333333, 'IPK': 2.917873703666667, 'Jumlah Tugas Besar per Minggu': 2.474}, 'std': {'Jam Belajar per Hari': 2.0311610211699995, 'Jam Tidur per Hari': 1.9838117497825065, 'IPK': 0.5458631745063073, 'Jumlah Tugas Besar per Minggu': 1.7112276587389927}}
