In [6]:
# Attach Google Drive to the Colab runtime so that files under
# /content/drive can be read by the cells below.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

# Confirmation message once the mount call has returned
print("✅ Google Drive mounted!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Google Drive mounted!


In [7]:
# Import libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
# Install a blanket "ignore" filter: from this point on Python warnings
# (including deprecation notices) are no longer shown in cell output.
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn's white-grid theme and a (12, 6) default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✅ Libraries imported!")

✅ Libraries imported!


In [8]:
# Read the semicolon-delimited dataset from the mounted Drive folder
df = pd.read_csv(
    '/content/drive/MyDrive/project_data_mining/dataset.csv',
    sep=';',
)
# NOTE(review): the `IPK` column is parsed as object dtype and the preview
# contains values such as 188194444.0, so it is left out of describe() and
# probably needs cleaning before any numeric use — confirm against the raw file.

# Report the frame's dimensions: (rows, columns) tuple, then each part
print(
    f"Dataset shape: {df.shape}",
    f"Total rows: {df.shape[0]}",
    f"Total columns: {df.shape[1]}",
    sep="\n",
)

# Preview the first rows (rich display as the cell's last expression)
df.head()

Dataset shape: (3000, 11)
Total rows: 3000
Total columns: 11


Unnamed: 0,Gender,Umur,Jurusan/Program Studi,Jam Belajar per Hari,Jam Tidur per Hari,IPK,Jumlah Tugas Besar per Minggu,Frekuensi Olahraga,Pemasukan Keluarga,Status Hubungan,Label
0,Laki-laki,20,Teknik Informatika,6,8,3.13,1,Jarang,Sedang,Dalam hubungan,Sehat
1,Perempuan,18,Hukum,5,6,188194444.0,1,Jarang,Rendah,Jomblo,Sehat
2,Perempuan,21,Desain Komunikasi Visual,6,4,127777778.0,1,Kadang,Sedang,Dalam hubungan,Risiko Stres
3,Laki-laki,24,Kedokteran,3,3,2.44,0,Sering,Sedang,Dalam hubungan,Sehat
4,Laki-laki,18,Hukum,4,4,3.35,5,Sering,Tinggi,Jomblo,Sehat


In [9]:
# Data-quality overview: schema, missing values per column, duplicate rows
divider = "=" * 50

# Column dtypes and non-null counts (df.info() writes straight to stdout)
print(divider, "DATA INFORMATION", divider, sep="\n")
df.info()

# Number of null entries in each column
print("\n" + divider, "MISSING VALUES", divider, sep="\n")
print(df.isnull().sum())

# Count of rows that exactly repeat an earlier row
print("\n" + divider, "DUPLICATES", divider, sep="\n")
print(f"Duplicates: {df.duplicated().sum()}")

DATA INFORMATION
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Gender                         3000 non-null   object
 1   Umur                           3000 non-null   int64 
 2   Jurusan/Program Studi          3000 non-null   object
 3   Jam Belajar per Hari           3000 non-null   int64 
 4   Jam Tidur per Hari             3000 non-null   int64 
 5   IPK                            3000 non-null   object
 6   Jumlah Tugas Besar per Minggu  3000 non-null   int64 
 7   Frekuensi Olahraga             3000 non-null   object
 8   Pemasukan Keluarga             3000 non-null   object
 9   Status Hubungan                3000 non-null   object
 10  Label                          3000 non-null   object
dtypes: int64(4), object(7)
memory usage: 257.9+ KB

MISSING VALUES
Gender                           0
Umur  

In [10]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for the
# columns describe() treats as numeric; shown via rich display.
print("STATISTICAL SUMMARY", "=" * 80, sep="\n")
df.describe()

STATISTICAL SUMMARY


Unnamed: 0,Umur,Jam Belajar per Hari,Jam Tidur per Hari,Jumlah Tugas Besar per Minggu
count,3000.0,3000.0,3000.0,3000.0
mean,21.546667,3.926333,6.028333,2.474
std,2.295702,2.031161,1.983812,1.711228
min,18.0,1.0,3.0,0.0
25%,20.0,2.0,4.0,1.0
50%,22.0,4.0,6.0,2.0
75%,24.0,6.0,8.0,4.0
max,25.0,7.0,9.0,5.0


In [11]:
# Target variable distribution
# Count the rows per class once and reuse the result for both the printed
# table and the chart.
label_counts = df['Label'].value_counts()

print("LABEL DISTRIBUTION")
print("=" * 50)
print(label_counts)

# Donut chart of the class balance.
# Colours are keyed by label name (the same label -> colour mapping the box,
# histogram and bar charts use) instead of by position, so "Sehat" stays green
# and "Risiko Stres" stays red even if the frequency order of the classes
# changes.
fig = px.pie(
    values=label_counts.values,
    names=label_counts.index,
    color=label_counts.index,
    color_discrete_map={'Sehat': '#38ef7d', 'Risiko Stres': '#f45c43'},
    hole=0.4
)
# Transparent background, larger font, and a title so the figure stands alone
fig.update_layout(
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(size=14),
    title='Label Distribution'
)
fig.show()

LABEL DISTRIBUTION
Label
Sehat           1803
Risiko Stres    1197
Name: count, dtype: int64


In [12]:
# Donut chart showing how many rows fall into each Gender category
gender_counts = df['Gender'].value_counts()

# Build the pie and apply the transparent-background layout in one chain;
# update_layout returns the figure it was called on.
fig = px.pie(
    names=gender_counts.index,
    values=gender_counts.values,
    hole=0.4,
    color_discrete_sequence=['#667eea', '#764ba2'],
).update_layout(
    font={'size': 14},
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)
fig.show()

In [13]:
# For every numeric feature, draw a per-label box plot followed by an
# overlaid per-label histogram.
numeric_cols = ['Umur', 'Jam Belajar per Hari', 'Jam Tidur per Hari', 'Jumlah Tugas Besar per Minggu']

# Styling shared by both chart types: fixed colour per label, transparent
# backgrounds and a 14pt font.
label_colors = {'Sehat': '#38ef7d', 'Risiko Stres': '#f45c43'}
base_layout = {
    'paper_bgcolor': 'rgba(0,0,0,0)',
    'plot_bgcolor': 'rgba(0,0,0,0)',
    'font': {'size': 14},
}

for col in numeric_cols:
    # Box plot: one box per label
    fig = px.box(df, x='Label', y=col, color='Label', color_discrete_map=label_colors)
    fig.update_layout(title=f'{col} by Label', **base_layout)
    fig.show()

    # Histogram: both labels drawn on top of each other, semi-transparent
    fig2 = px.histogram(
        df,
        x=col,
        color='Label',
        opacity=0.7,
        barmode='overlay',
        color_discrete_map=label_colors,
    )
    fig2.update_layout(title=f'Distribution of {col}', **base_layout)
    fig2.show()

In [14]:
# For every categorical feature, plot how many rows of each label fall into
# each category as a grouped bar chart.
categorical_cols = ['Jurusan/Program Studi', 'Frekuensi Olahraga', 'Pemasukan Keluarga', 'Status Hubungan', 'Gender']

for col in categorical_cols:
    # One row per (category, label) pair with its row count in 'Count'
    grouped = (
        df.groupby([col, 'Label'])
        .size()
        .reset_index(name='Count')
    )

    # Side-by-side bars per label; tick labels are rotated to -45 degrees
    fig = px.bar(
        grouped,
        x=col,
        y='Count',
        color='Label',
        barmode='group',
        color_discrete_map={'Sehat': '#38ef7d', 'Risiko Stres': '#f45c43'},
    ).update_layout(
        title=f'{col} vs Label',
        xaxis_tickangle=-45,
        font={'size': 14},
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
    )
    fig.show()

In [15]:
# Descriptive statistics of the numeric features: first for all rows, then
# separately for each label.
numeric_cols = ['Umur', 'Jam Belajar per Hari', 'Jam Tidur per Hari', 'Jumlah Tugas Besar per Minggu']

# (heading, rows to summarise) pairs, printed in this order; the leading
# newlines in the later headings separate the sections in the output.
sections = [
    ("STATISTIK SEMUA DATA:", df),
    ("\n\nSTATISTIK - SEHAT:", df[df['Label'] == 'Sehat']),
    ("\n\nSTATISTIK - RISIKO STRES:", df[df['Label'] == 'Risiko Stres']),
]

for heading, subset in sections:
    print(heading)
    print("=" * 80)
    print(subset[numeric_cols].describe())

STATISTIK SEMUA DATA:
              Umur  Jam Belajar per Hari  Jam Tidur per Hari  \
count  3000.000000           3000.000000         3000.000000   
mean     21.546667              3.926333            6.028333   
std       2.295702              2.031161            1.983812   
min      18.000000              1.000000            3.000000   
25%      20.000000              2.000000            4.000000   
50%      22.000000              4.000000            6.000000   
75%      24.000000              6.000000            8.000000   
max      25.000000              7.000000            9.000000   

       Jumlah Tugas Besar per Minggu  
count                    3000.000000  
mean                        2.474000  
std                         1.711228  
min                         0.000000  
25%                         1.000000  
50%                         2.000000  
75%                         4.000000  
max                         5.000000  


STATISTIK - SEHAT:
              Umur  Jam Belaj