# ML_Project.ipynb
Proyek klasifikasi varietas beras Cammeo dan Osmancik (dataset Rice: Cammeo & Osmancik)

## 1. Import Library

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow import keras


## 2. Load Dataset (Manual Download)

In [None]:
# The file 'rice.arff' must already be present in the data/ folder
# (it is downloaded manually, not fetched by this notebook).
path = '../data/rice.arff'


def _to_text(value):
    """Return *value* decoded to str when it is bytes, otherwise unchanged."""
    if isinstance(value, bytes):
        return value.decode()
    return value


# loadarff yields the records together with the ARFF metadata
data, meta = arff.loadarff(path)
df = pd.DataFrame(data)

# Decode the byte-string labels of the target column into plain strings
df['Class'] = df['Class'].map(_to_text)

df.head()

## 3. EDA

In [None]:
# Display pandas' summary statistics for the loaded DataFrame
df.describe()

In [None]:
# Histogram of the 'Area' feature with a KDE curve overlaid
area_ax = sns.histplot(data=df, x='Area', kde=True)
area_ax.set_title('Distribusi Area')
plt.show()

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap
corr_matrix = df.corr(numeric_only=True)

_, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, ax=corr_ax)
corr_ax.set_title('Heatmap Korelasi')
plt.show()

In [None]:
# Major vs. minor axis length, with points coloured by rice class
scatter_ax = sns.scatterplot(
    data=df,
    x='Major_Axis_Length',
    y='Minor_Axis_Length',
    hue='Class',
)
scatter_ax.set_title('Scatter Major vs Minor Axis')
plt.show()

## 4. Preprocessing

In [None]:
# Separate the feature columns from the target column
X = df.drop('Class', axis=1)
y = df['Class']

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# test rows influence the mean/std used for standardisation (data leakage),
# which makes the test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any code that still refers to `X_scaled` keeps working; note that it
# is now standardised with the training-set statistics.
X_scaled = scaler.transform(X)


## 5. Baseline Model (Logistic Regression)

In [None]:
# Baseline classifier: logistic regression fitted on the training split
model_lr = LogisticRegression(max_iter=200).fit(X_train, y_train)
pred_lr = model_lr.predict(X_test)

# Print the per-class report first, then the confusion matrix
for summary in (
    classification_report(y_test, pred_lr),
    confusion_matrix(y_test, pred_lr),
):
    print(summary)

## 6. Advanced Model (Random Forest)

In [None]:
# Random forest hyper-parameters gathered in one place
rf_params = {'n_estimators': 200, 'max_depth': 10, 'random_state': 42}

model_rf = RandomForestClassifier(**rf_params)
model_rf.fit(X_train, y_train)
pred_rf = model_rf.predict(X_test)

# Evaluate on the held-out split: text report followed by confusion matrix
report_rf = classification_report(y_test, pred_rf)
matrix_rf = confusion_matrix(y_test, pred_rf)
print(report_rf)
print(matrix_rf)

## 7. Deep Learning Model (MLP)

In [None]:
import os

# `y_train` holds the class names as strings, which Keras cannot use as
# targets for binary_crossentropy. Encode them as 0/1 using the sorted label
# order: first label -> 0, second label -> 1.
class_names = sorted(y_train.unique())
if len(class_names) != 2:
    raise ValueError(
        f"Binary MLP expects exactly 2 classes, got {len(class_names)}: {class_names}"
    )
y_train_bin = (y_train == class_names[1]).astype('float32').to_numpy()

# Two hidden ReLU layers with dropout, and a single sigmoid unit that outputs
# the probability of the class encoded as 1.
model_dl = keras.Sequential([
    # Explicit Input layer declares the feature dimension (preferred over
    # passing `input_shape` to the first Dense layer).
    keras.Input(shape=(X_train.shape[1],)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(1, activation='sigmoid')
])

model_dl.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 20% of the training data is held back by Keras for per-epoch validation
history = model_dl.fit(
    X_train, y_train_bin,
    epochs=20,
    batch_size=32,
    validation_split=0.2
)

# Make sure the target folder exists before writing the model file
os.makedirs('../models', exist_ok=True)
model_dl.save('../models/model_mlp.h5')


## 8. Plot Training History

In [None]:
# One figure per tracked metric: training curve first, validation curve second
for metric, title in (('loss', 'Loss per Epoch'), ('accuracy', 'Accuracy per Epoch')):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title(title)
    plt.legend(['train', 'val'])
    plt.show()


## 9. Evaluasi MLP

In [None]:
# The network emits one sigmoid probability per row with shape (n, 1);
# flatten it and threshold at 0.5 to obtain 0/1 class indices.
prob_dl = model_dl.predict(X_test).ravel()
pred_idx = (prob_dl > 0.5).astype('int32')

# `y_test` holds class names, so the integer predictions must be translated
# back to names before they can be compared with it.
# NOTE(review): assumes the model's targets were encoded in sorted label order
# (index 0 = first label, index 1 = second label) - confirm against training.
class_names = sorted(y_test.unique())
pred_dl = [class_names[i] for i in pred_idx]

print(classification_report(y_test, pred_dl))
print(confusion_matrix(y_test, pred_dl))