<a href="https://colab.research.google.com/github/sayevvv/ML_Dibimbing/blob/main/Assignment_DSF_Abdullah_Shamil_Basayev.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Arahan Assignment
0. Gunakan dataset student score
1. Lakukan proses Exploratory Data Analysis (EDA)
2. Lakukan feature engineering:
- Check Duplicated Data
- Check Missing Value Handling
- Outlier Analysis
3. Lakukan modelling machine learning regression: gunakan minimal 2 model (linear regression, decision tree regressor, atau random forest regressor)
4. Lakukan evaluasi model
5. Berikan kesimpulan model mana yang terbaik performanya

In [None]:
# Import libraries and resources
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Load the student-score dataset into a DataFrame.
# NOTE(review): the path lives under /content/drive, so presumably Google Drive
# has to be mounted before this cell runs — confirm, no mount step is visible here.
DATA_PATH = "/content/drive/MyDrive/MachineLearning/student_scores.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Dataset overview: index range, column names, non-null counts, dtypes, memory usage
print("Informasi Dataset:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df.info()

Informasi Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Hours   25 non-null     float64
 1   Scores  25 non-null     int64  
dtypes: float64(1), int64(1)
memory usage: 532.0 bytes
None


In [None]:
# Descriptive statistics for every numeric column, printed under a header line
print("\nStatistik Deskriptif:", df.describe(), sep="\n")


Statistik Deskriptif:
           Hours     Scores
count  25.000000  25.000000
mean    5.012000  51.480000
std     2.525094  25.286887
min     1.100000  17.000000
25%     2.700000  30.000000
50%     4.800000  47.000000
75%     7.400000  75.000000
max     9.200000  95.000000


In [None]:
# ==============================
# 4. Feature Engineering
# ==============================

## 4.1 Duplicate check
# Boolean mask that is True for every row repeating an earlier row.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"\nJumlah duplikasi: {duplicates}")
# Keep only the first occurrence of each row.
df = df[~duplicate_mask]

## 4.2 Missing-value check
# Number of missing entries per column.
missing_values = df.isna().sum()
print("\nMissing Values:", missing_values, sep="\n")

# Rows containing any missing value are dropped (imputation would be an alternative).
df = df.dropna(axis=0, how="any")

## 4.3 Outlier Analysis (IQR Method)
def detect_outliers_iqr(data, column, multiplier=1.5):
    """Return the rows of ``data`` whose ``column`` value is an IQR outlier.

    A value is flagged when it lies strictly below ``Q1 - multiplier * IQR``
    or strictly above ``Q3 + multiplier * IQR``, where Q1 and Q3 are the 0.25
    and 0.75 quantiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect; it is only read, never modified.
    column : str
        Name of the numeric column to test.
    multiplier : float, default 1.5
        Width of the fences in IQR units. The default reproduces the
        conventional 1.5 * IQR rule; a larger value flags fewer rows.

    Returns
    -------
    pandas.DataFrame
        The flagged rows with their original index labels; empty when no
        value falls outside the fences.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - multiplier * iqr
    upper = q3 + multiplier * iqr
    # Strict comparisons: values sitting exactly on a fence are not outliers.
    return data[(values < lower) | (values > upper)]

# Collect the rows whose Scores value falls outside the IQR fences,
# then report how many such rows were found.
outliers = detect_outliers_iqr(data=df, column="Scores")
print(f"\nJumlah Outlier pada kolom Scores: {outliers.shape[0]}")


Jumlah duplikasi: 0

Missing Values:
Hours     0
Scores    0
dtype: int64

Jumlah Outlier pada kolom Scores: 0


In [None]:
# ==============================
# 5. Modeling
# ==============================

# Feature matrix (selected with a list so it stays 2-D) and target vector
X = df.loc[:, ['Hours']]
y = df.loc[:, 'Scores']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardization (used only by the linear regression model): the scaler is
# fitted on the training split and the same transform is applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# ==============================
# Model 1: Linear Regression
# ==============================
# fit() returns the fitted estimator, so construction and training are chained
lin_reg = LinearRegression().fit(X_train_scaled, y_train)
y_pred_lr = lin_reg.predict(X_test_scaled)

# Linear Regression evaluation on the held-out test split
r2_lr = r2_score(y_true=y_test, y_pred=y_pred_lr)
mae_lr = mean_absolute_error(y_true=y_test, y_pred=y_pred_lr)
mse_lr = mean_squared_error(y_true=y_test, y_pred=y_pred_lr)
rmse_lr = np.sqrt(mse_lr)  # RMSE is the square root of the MSE

In [None]:
# ==============================
# Model 2: Random Forest Regressor
# ==============================
# Trained on the unscaled feature; the fixed seed makes the forest repeatable
rf_reg = RandomForestRegressor(n_estimators=300, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_reg.predict(X_test)

# Random Forest evaluation on the held-out test split
r2_rf = r2_score(y_true=y_test, y_pred=y_pred_rf)
mae_rf = mean_absolute_error(y_true=y_test, y_pred=y_pred_rf)
mse_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_rf)
rmse_rf = np.sqrt(mse_rf)  # RMSE is the square root of the MSE

In [None]:
# ==============================
# 6. Evaluation Results
# ==============================
# One record per model; the key order of the first record fixes the column order
results = pd.DataFrame([
    {"Model": "Linear Regression", "MAE": mae_lr, "MSE": mse_lr, "RMSE": rmse_lr, "R2": r2_lr},
    {"Model": "Random Forest Regressor", "MAE": mae_rf, "MSE": mse_rf, "RMSE": rmse_rf, "R2": r2_rf},
])

# Show the comparison table with the highest R2 first
ranked_results = results.sort_values(by="R2", ascending=False)
print("\nHasil Evaluasi Model:", ranked_results, sep="\n")


Hasil Evaluasi Model:
                     Model       MAE        MSE      RMSE        R2
1  Random Forest Regressor  2.903333  11.931989  3.454271  0.979721
0        Linear Regression  3.920751  18.943212  4.352380  0.967806


In [None]:
# ==============================
# 7. Conclusion
# ==============================
# Random Forest is announced only when its test R2 is strictly higher;
# a tie (or any failed comparison) falls through to Linear Regression.
best_model_message = (
    f"\nModel terbaik adalah **Random Forest** dengan R² = {r2_rf:.4f}"
    if r2_rf > r2_lr
    else f"\nModel terbaik adalah **Linear Regression** dengan R² = {r2_lr:.4f}"
)
print(best_model_message)


Model terbaik adalah **Random Forest** dengan R² = 0.9797
