# Import Package

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.feature_importance import mean_score_decrease

# Import Data

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("dataset/dataset.csv")
# Preview the first 20 rows.
df.head(20)

# Preprocessing

### Jumlah Baris dan Kolom

In [None]:
# (rows, columns) of the frame at this point, before missing rows are dropped.
df.shape

### Melihat Missing Value

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Drop every row that contains at least one missing value. This mutates `df`
# in place, so cells above will show the already-cleaned frame if re-run.
df.dropna(inplace=True)
# Re-check the per-column missing counts after the drop.
df.isnull().sum()

### Melihat Kembali Jumlah Baris dan Kolom

In [None]:
# Shape after the in-place dropna; compare with the earlier shape to see how many rows were removed.
df.shape

### Mengubah value dari kolom Cuaca

In [None]:
# Distinct raw weather descriptions, inspected before they are grouped into coarse labels.
df["cuaca"].unique()

### Menggunakan Regex untuk mengelompokkan jenis Cuaca

In [None]:
import re

def cuaca(x):
  """Map a raw weather description to a coarse weather label.

  The keywords are tried in a fixed priority order -- "Cerah", then "Hujan",
  then "Berawan" -- and the first one found anywhere in `x` is returned, so a
  text containing several keywords resolves to the highest-priority one.
  Matching is case-sensitive.

  Parameters
  ----------
  x : str
      Raw weather description.

  Returns
  -------
  str
      'Cerah', 'Hujan', 'Berawan', or 'Tidak Teridentifikasi' when no keyword
      occurs in `x`.
  """
  for label in ('Cerah', 'Hujan', 'Berawan'):
    if re.search(label, x):
      return label
  return 'Tidak Teridentifikasi'

# Overwrite the raw descriptions with their coarse label (the original text is not kept).
df["cuaca"] = df['cuaca'].apply(cuaca)
# Class distribution after grouping.
df["cuaca"].value_counts()

In [None]:
# Frequency of each time-of-day value as currently stored in the `waktu` column.
df.waktu.value_counts()

In [None]:
import re

def kelompok_waktu(x):
  """Normalise a time-of-day description to a canonical label.

  This helper has its own name instead of reusing `cuaca`: redefining `cuaca`
  here would shadow the weather classifier, and re-running the weather cell
  afterwards would silently apply these time-of-day rules to the weather
  column.

  The lowercase keywords are tried in order ("dini hari", "siang", "pagi",
  "malam"); the first one found anywhere in `x` decides the label. Matching
  is case-sensitive.

  Parameters
  ----------
  x : str
      Raw time-of-day description.

  Returns
  -------
  str
      'Dini Hari', 'Siang', 'Pagi' or 'Malam'; `x` unchanged when no keyword
      is found.
  """
  for kata_kunci, label in (
      ("dini hari", 'Dini Hari'),
      ("siang", 'Siang'),
      ("pagi", 'Pagi'),
      ("malam", 'Malam'),
  ):
    # re.search stops at the first hit; no need to collect every match.
    if re.search(kata_kunci, x):
      return label
  return x

# Replace the raw values with their canonical label, then show the distribution.
df["waktu"] = df['waktu'].apply(kelompok_waktu)
df["waktu"].value_counts()

### Drop Data

In [None]:
# Index labels of the rows whose weather label is 'Tidak Teridentifikasi'.
df[df["cuaca"] == "Tidak Teridentifikasi"].index

In [None]:
# Remove the rows labelled 'Tidak Teridentifikasi' in place.
df.drop(df[df["cuaca"] == "Tidak Teridentifikasi"].index, inplace=True)
# List the weather labels that remain after the drop.
df["cuaca"].unique()

Split Data Kelembapan menjadi Kelembapan Minimal dan Maksimal

In [None]:
# Each "kelembaban_persen" value is split on "-": the piece before the first
# dash becomes the minimum, the piece right after it the maximum. Surrounding
# spaces are stripped; both new columns still hold strings.
df["kelembapan_min"] = [x.split("-")[0].strip(' ') for x in df['kelembaban_persen']]
df["kelembapan_max"] = [x.split("-")[1].strip(' ') for x in df['kelembaban_persen']]

df.head()

In [None]:
# Same treatment for "suhu_derajat_celcius": split on "-", take the piece
# before the first dash as the minimum and the next piece as the maximum,
# stripping surrounding spaces. The results are still strings.
df["suhu_min"] = [x.split("-")[0].strip(' ') for x in df["suhu_derajat_celcius"]]
df["suhu_max"] = [x.split("-")[1].strip(' ') for x in df["suhu_derajat_celcius"]]

df.head()

Melihat Masing-Masing Data dari Suhu

In [None]:
# Print the frequency table of each extracted temperature bound, min first.
for judul, kolom in (('Suhu Min\n', 'suhu_min'), ('Suhu Max\n', 'suhu_max')):
  print(judul, df[kolom].value_counts(), '\n')

Melihat Masing-Masing Data dari Kelembapan

In [None]:
# Print the frequency table of each extracted humidity bound.
# The second heading is 'Kelembapan Max' so that it matches the column it
# prints (it previously repeated 'Kelembapan Min').
print('Kelembapan Min\n', df['kelembapan_min'].value_counts(), '\n\n')
print('Kelembapan Max\n', df['kelembapan_max'].value_counts(), '\n')

# Tahap Preprocessing 2

In [None]:
# Preview the first 50 rows, including the newly derived min/max columns.
df.head(50)

### Drop Kolom

In [None]:
# Drop the two raw range columns (already split into min/max columns) and the
# date column; done in place, so this cell raises KeyError if run a second time.
df.drop(columns=["kelembaban_persen", "suhu_derajat_celcius", "tanggal"], inplace=True)

In [None]:
# Number of rows per region.
df.wilayah.value_counts()

### Mengubah Tipe Data

In [None]:
# Column dtypes before the min/max columns are cast to integers.
df.dtypes

In [None]:
# Cast the four extracted bound columns to integers, one column at a time.
for kolom in ("kelembapan_min", "kelembapan_max", "suhu_min", "suhu_max"):
  df[kolom] = df[kolom].astype(int)
# Show the resulting dtypes to confirm the cast.
df.dtypes

# Visualisasi Data

In [None]:
# Pairwise scatter plots coloured by weather label; corner=True draws only the
# lower triangle of the grid.
sns.pairplot(df, hue="cuaca", corner=True)

# Target Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the string weather labels with integer codes. The fitted encoder is
# kept in `le` so the codes can be mapped back to names (le.classes_ /
# le.inverse_transform).
le = LabelEncoder()
df["cuaca"] = le.fit_transform(df['cuaca'])
# Class distribution, now keyed by integer code.
df['cuaca'].value_counts()

# Membuat Kolom Rata-Rata untuk Suhu dan Kelembapan

In [None]:
# Midpoint of each min/max pair: (min + max) / 2.
df['kelembapan_mean'] = df['kelembapan_min'].add(df['kelembapan_max']).div(2)
df['suhu_mean'] = df['suhu_min'].add(df['suhu_max']).div(2)

# The bounds are no longer needed once the midpoints exist; removed in place.
df.drop(
    columns=["kelembapan_min", "kelembapan_max", "suhu_min", 'suhu_max'],
    inplace=True,
)

# Splitting Data

In [None]:
# Target is the encoded weather label; features are every other column.
y = df["cuaca"]
X = df.drop(columns=["cuaca"])

# 80/20 split, stratified on the target, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Tahap Preprocessing 3

In [None]:
# Preview the training features before encoding.
X_train.head()

### Dummies Encoding

In [None]:
# One-hot encode the categorical columns of each split.
X_train = pd.get_dummies(X_train, columns=["waktu", "wilayah"])
X_test = pd.get_dummies(X_test, columns=["waktu", "wilayah"])

# Encoding the splits independently can produce different column sets (or a
# different column order) when a category is missing from one split. Align the
# test frame to the training columns: dummies absent from the test split are
# added as all-zero columns and dummies never seen in training are dropped, so
# every model receives exactly the layout it was fitted on.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Preview the training features after one-hot encoding.
X_train.head()

In [None]:
# Preview the test features after one-hot encoding.
X_test.head()

### Normalisasi Data menggunakan Min Max Scaler

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Only the two continuous features are scaled; the dummy columns are left as-is.
numeric = ["kelembapan_mean", "suhu_mean"]

scaler = MinMaxScaler(feature_range=(0, 1))

# Learn the min/max from the training split only, then apply that same mapping
# to the test split. Refitting on the test data would leak test statistics and
# scale the two splits differently. Test values outside the training range may
# therefore fall slightly outside [0, 1], which is expected.
X_train[numeric] = scaler.fit_transform(X_train[numeric])
X_test[numeric] = scaler.transform(X_test[numeric])

In [None]:
# Preview the test features after scaling.
X_test.head()

In [None]:
# Preview the training features after scaling.
X_train.head()

# Fitting Data

### Random Forest

In [None]:
# Random Forest grid: 3 values for each of 4 hyper-parameters (81 combinations).
paramater = dict(
    n_estimators=[100, 150, 200],
    max_depth=[20, 50, 80],
    max_features=[0.3, 0.6, 0.8],
    min_samples_leaf=[1, 5, 10],
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

# Base estimator with a fixed seed.
RF = RandomForestClassifier(n_jobs=-1, random_state=42)

# Exhaustive grid search with 3-fold cross-validation on the training split.
# `paramater` is whatever grid the most recently executed parameter cell
# defined, so the Random Forest grid cell must be run immediately before this.
model = GridSearchCV(RF, paramater, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print(model.best_params_)

# Predict on both splits so training and test performance can be compared.
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)

# Printed in order: test confusion matrix, training report, test report.
print(confusion_matrix(y_test, y_pred), "\n")
print(classification_report(y_train, y_pred_train), "\n")
print(classification_report(y_test, y_pred))


Akurasi Training 67% <br>
Akurasi Testing 63% <br>

### Support Vector Machine

In [None]:
# SVM grid: gamma and C each sweep the powers of ten from 1e-3 to 1e3
# (7 x 7 = 49 combinations).
paramater = dict(
    gamma=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
    C=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
)

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

# Base estimator; max_iter=1000 caps the solver iterations of every fit.
SVM = SVC(max_iter=1000, random_state=42)

# Exhaustive grid search with 3-fold cross-validation on the training split.
# `paramater` is whatever grid the most recently executed parameter cell
# defined, so the SVM grid cell must be run immediately before this one.
model = GridSearchCV(SVM, paramater, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print(model.best_params_)

# Predict on both splits so training and test performance can be compared.
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)

# Printed in order: test confusion matrix, training report, test report.
print(confusion_matrix(y_test, y_pred), "\n")
print(classification_report(y_train, y_pred_train), "\n")
print(classification_report(y_test, y_pred))

Akurasi Training 57% <br>
Akurasi Testing 56% <br>

### MLP Classifier

In [None]:
# MLP grid: 2 solvers x 5 alpha values x 6 two-layer architectures
# (60 combinations).
paramater = dict(
    solver=['adam', 'lbfgs'],
    alpha=[0.0001, 0.001, 0.01, 0.1, 1.0],
    hidden_layer_sizes=[(9, 8), (9, 9), (7, 9), (6, 9), (7, 8), (8, 9)],
)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

# Base estimator with a fixed seed and an iteration cap of 5000.
MLPC = MLPClassifier(max_iter=5000, random_state=42)

# Exhaustive grid search with 3-fold cross-validation on the training split.
# `paramater` is whatever grid the most recently executed parameter cell
# defined, so the MLP grid cell must be run immediately before this one.
model = GridSearchCV(MLPC, paramater, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print(model.best_params_)

# Predict on both splits so training and test performance can be compared.
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)

# Printed in order: test confusion matrix, training report, test report.
print(confusion_matrix(y_test, y_pred), "\n")
print(classification_report(y_train, y_pred_train), "\n")
print(classification_report(y_test, y_pred))

Akurasi Training 63% <br>
Akurasi Testing 62% <br>

### KNN Classifier

In [None]:
# KNN grid: odd neighbour counts from 1 to 29, both weighting schemes and
# three Minkowski powers (15 x 2 x 3 = 90 combinations).
paramater = dict(
    n_neighbors=list(range(1, 30, 2)),
    weights=['uniform', 'distance'],
    p=[1, 1.5, 2],
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

# Base estimator; every tuned setting comes from the grid.
KNN = KNeighborsClassifier()

# Exhaustive grid search with 3-fold cross-validation on the training split.
# `paramater` is whatever grid the most recently executed parameter cell
# defined, so the KNN grid cell must be run immediately before this one.
model = GridSearchCV(KNN, paramater, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print(model.best_params_)

# Predict on both splits so training and test performance can be compared.
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)

# Printed in order: test confusion matrix, training report, test report.
print(confusion_matrix(y_test, y_pred), "\n")
print(classification_report(y_train, y_pred_train), "\n")
print(classification_report(y_test, y_pred))

Akurasi Training 65% <br>
Akurasi Testing 61% <br>

### Decision Tree Classifier

In [None]:
# Decision Tree grid: 2 criteria x 2 splitters x 16 depths (unlimited plus the
# even depths 2..30) x 9 split sizes x 10 leaf sizes = 5760 combinations.
paramater = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[None, *range(2, 31, 2)],
    min_samples_split=list(range(2, 11)),
    min_samples_leaf=list(range(1, 11)),
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

# Base estimator with a fixed seed.
DTC = DecisionTreeClassifier(random_state=42)

# Exhaustive grid search with 3-fold cross-validation on the training split.
# `paramater` is whatever grid the most recently executed parameter cell
# defined, so the Decision Tree grid cell must be run immediately before this.
model = GridSearchCV(DTC, paramater, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print(model.best_params_)

# Predict on both splits so training and test performance can be compared.
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)

# Printed in order: test confusion matrix, training report, test report.
print(confusion_matrix(y_test, y_pred), "\n")
print(classification_report(y_train, y_pred_train), "\n")
print(classification_report(y_test, y_pred))

Akurasi Training 65% <br>
Akurasi Testing 62% <br>

### XGBoost Classifier

In [None]:
from jcopml.tuning.space import Integer, Real

# Search space for the randomized XGBoost search: ranges to sample from rather
# than an explicit grid. Unlike the other model sections, this dict is spelled
# `parameter` (not `paramater`).
# NOTE(review): for prior='log-uniform' the negative bounds look like base-10
# exponents (e.g. -2..0 meaning 0.01..1) -- confirm against jcopml's docs.
parameter = {
    'max_depth': Integer(low=1, high=10),
    'learning_rate': Real(low=-2, high=0, prior='log-uniform'),
    'n_estimators': Integer(low=100, high=200),
    'subsample': Real(low=0.3, high=0.8, prior='uniform'),
    'gamma': Integer(low=1, high=10),
    'colsample_bytree': Real(low=0.1, high=1, prior='uniform'),
    'reg_alpha': Real(low=-3, high=1, prior='log-uniform'),
    'reg_lambda': Real(low=-3, high=1, prior='log-uniform')
}

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report

# Base estimator with a fixed seed.
XGB = XGBClassifier(n_jobs=-1, random_state=42, use_label_encoder=False)

# Randomized search: 500 sampled configurations x 3 folds = 1500 fits, seeded
# so the same configurations are drawn on every run.
model = RandomizedSearchCV(XGB, parameter, cv=3, n_iter=500, n_jobs=-1, verbose=1, random_state=42)
model.fit(X_train, y_train)

print(model.best_params_)

# Predict on both splits so training and test performance can be compared.
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)

# Printed in order: test confusion matrix, training report, test report.
print(confusion_matrix(y_test, y_pred), "\n")
print(classification_report(y_train, y_pred_train), "\n")
print(classification_report(y_test, y_pred))

Akurasi Training 66% <br>
Akurasi Testing 64% <br>