In [None]:
# 1. Business Understanding

In [None]:
# 2. Data Understanding

In [None]:
# Import library untuk manipulasi data
import pandas as pd  # Untuk pengolahan data tabular
import numpy as np   # Untuk komputasi numerik
import matplotlib.pyplot as plt  # Untuk visualisasi data
import seaborn as sns  # Untuk visualisasi data statistik
from skimpy import skim # Import library skimpy untuk mendapatkan ringkasan statistik data yang lebih detail dan mudah dibaca
from summarytools import dfSummary # Import library summarytools untuk membuat ringkasan data yang komprehensif termasuk statistik, distribusi, dan missing values

# Import library untuk preprocessing data
from sklearn.model_selection import train_test_split  # Untuk membagi data train dan test
from sklearn.preprocessing import LabelEncoder  # Untuk encoding label kategorikal
from imblearn.over_sampling import SMOTE  # Untuk menangani data tidak seimbang

# Import library untuk model machine learning
from sklearn.tree import DecisionTreeClassifier  # Model pohon keputusan
from sklearn.ensemble import RandomForestClassifier  # Model random forest
from sklearn.naive_bayes import GaussianNB  # Model naive bayes

# Import library untuk evaluasi model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # Metrik evaluasi
from sklearn.metrics import roc_curve, auc  # Untuk kurva ROC dan nilai AUC

# Mengatur matplotlib untuk menampilkan plot di notebook
%matplotlib inline

In [None]:
# Load the dataset from a CSV file located relative to the notebook's working directory.
df = pd.read_csv('data_apartment.csv')

In [None]:
# Render the summarytools overview of the freshly loaded data as this cell's output.
dfSummary(df)

In [None]:
# Print the skimpy summary of the DataFrame (this does not list the rows themselves).
skim(df)

In [None]:
# Display the DataFrame as the cell output.
# NOTE(review): for a large table prefer df.head()/df.sample(5) so the saved notebook stays small.
df

In [None]:
# Report the data dimensions as a (rows, columns) tuple.
jumlah_baris_kolom = df.shape
print("Dimensi data:", jumlah_baris_kolom)

In [None]:
# Preview the first rows of the data (5 is the pandas default, spelled out here).
print("Baris pertama data:")
df.head(n=5)


In [None]:
# Preview the last rows of the data (5 is the pandas default, spelled out here).
print("Baris terakhir data:")
df.tail(n=5)

In [None]:
# Inspect the dtype pandas assigned to every column.
print("Tipe data setiap kolom:")
df.dtypes

In [None]:
# Descriptive statistics; with default arguments describe() reports on the numeric columns.
print("Statistik deskriptif untuk kolom numerik:")
df.describe()

In [None]:
# 3. Data Preparation

In [None]:
# Count the missing values per column (isna is the canonical name of the isnull alias).
print("Nilai yang hilang di setiap kolom:")
df.isna().sum()

In [None]:
# # Langkah 1: Menangani Missing Values
# # Mengisi nilai yang hilang dengan mean untuk kolom numerik dan mode untuk kolom kategorikal
# for column in df.select_dtypes(include=[np.number]).columns:
#     df[column].fillna(df[column].mean(), inplace=True)

# for column in df.select_dtypes(include=[object]).columns:
#     df[column].fillna(df[column].mode()[0], inplace=True)

In [None]:
# Step 2.1: feature selection - discard the first batch of columns that are not used further.
drop_column = [
    'No',
    'Project',
    'Category',
    'Detail',
    'Facing',
    'Cancel Type',
    'Deal Closer Team',
    'Akad KPA/R Date',
    'PostCode (ID)',
    'Address (Cor)',
    'PostCode (Cor)',
    'Customer Job Status',
    'Customer Education',
    'Customer Income',
    'Customer Salary Method',
    'Customer Nation',
]
# labels=..., axis=1 is the long-hand spelling of columns=...
df = df.drop(labels=drop_column, axis=1)

In [None]:
# Step 2.2: feature selection - discard the second batch of columns that are not used further.
drop_column1 = [
    'Book Date',
    'Month',
    'Country',
    'Province',
    'City',
    'Unique ID',
    'Block/Floor',
    'Akad Month',
    'PS Code',
    'Net Area',
    'Disc',
    'Add Disc',
    'Booking Disc',
    'Adjustment Price',
    'Status Checklist Document',
    'Country (Cor)',
]
# labels=..., axis=1 is the long-hand spelling of columns=...
df = df.drop(labels=drop_column1, axis=1)

In [None]:
# Step 2.3: feature selection - discard the third batch of columns that are not used further.
drop_column2 = [
    'Deal Closer',
    'Sales Referral',
    'Booking Term',
    'Booking Term Price Scheme',
    'Unit Name',
    'Term Payment',
    'Semi Gross Area',
    'Area',
]
# labels=..., axis=1 is the long-hand spelling of columns=...
df = df.drop(labels=drop_column2, axis=1)

In [None]:
# Re-render the summarytools overview to review the columns that remain in df at this point.
dfSummary(df)

In [None]:
# Count plot for every column whose number of distinct values is small enough
# (at most max_unique) for a bar chart to stay readable.
max_unique = 10  # upper bound on distinct values per plotted column

kolom_sedikit_kategori = [nama for nama in df.columns if df[nama].nunique() <= max_unique]

for column in kolom_sedikit_kategori:
    fig, ax = plt.subplots(figsize=(8, 6))
    # hue=column with legend=False colours each bar separately without drawing a legend.
    sns.countplot(data=df, x=column, hue=column, palette="colorblind", legend=False, ax=ax)
    ax.set_title(f'Distribusi {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Jumlah')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Bar charts that compare each attribute against the purchase purpose ('Sales Purpose').
# max_unique caps how many distinct values a column may have and still be plotted.
max_unique = 10

def plot_stacked_bar_chart(data, column):
    """Draw a count plot of ``column`` split by the 'Sales Purpose' column.

    Despite the function name, seaborn's ``countplot`` with ``hue`` draws the
    bars side by side per category rather than stacking them.

    Args:
        data: DataFrame containing ``column`` and 'Sales Purpose'.
        column: Name of the column placed on the x axis.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=data, x=column, hue='Sales Purpose', ax=ax)
    ax.set_title(f'Distribusi {column} berdasarkan Tujuan Pembelian')
    ax.set_xlabel(column)
    ax.set_ylabel('Jumlah')
    plt.xticks(rotation=45)
    ax.legend(title='Sales Purpose', loc='upper right')
    plt.tight_layout()
    plt.show()

# One chart per attribute, skipping the target itself and any column with
# more than max_unique distinct values.
for column in df.columns:
    if column == 'Sales Purpose' or df[column].nunique() > max_unique:
        continue
    plot_stacked_bar_chart(df, column)

In [None]:
# Histogram of customer age, derived from 'Customer Birth Date'.
from datetime import datetime

# Parse the birth dates; values that cannot be parsed become NaT (errors='coerce').
df['Customer Birth Date'] = pd.to_datetime(df['Customer Birth Date'], errors='coerce')

# Use a single reference date for every row. Age is the calendar-year difference
# minus one when this year's birthday has not happened yet. Dividing elapsed days
# by 365 ignores leap days and rolls the age over several days before the birthday.
today = datetime.now()
birth = df['Customer Birth Date']
birthday_pending = (birth.dt.month > today.month) | (
    (birth.dt.month == today.month) & (birth.dt.day > today.day)
)
age = (today.year - birth.dt.year - birthday_pending.astype(int)).where(birth.notna())
# Rows without a valid birth date stay NaN (float64); otherwise keep whole-number int64 ages.
df['Age'] = age.astype('float64' if age.isna().any() else 'int64')

plt.figure(figsize=(10, 6))
sns.histplot(df['Age'].dropna(), kde=True, color='skyblue')
plt.title('Distribusi Usia Pelanggan', fontsize=16)
plt.xlabel('Usia', fontsize=12)
plt.ylabel('Jumlah Pelanggan', fontsize=12)
plt.show()


In [None]:
# # Visualisasi heatmap korelasi antar fitur numerik
# plt.figure(figsize=(16, 10))
# corr = df.corr()
# sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', linewidths=0.5)
# plt.title('Heatmap Korelasi Antar Fitur Numerik', fontsize=16)
# plt.show()


In [None]:
# Distribution of a few key features, each split by 'Sales Purpose', on a 2x2 grid.
fitur_kunci = ['Cluster', 'Customer Grade', 'Sales Event', 'Year']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
# axes.flat walks the grid row by row, matching the order of fitur_kunci.
for ax, fitur in zip(axes.flat, fitur_kunci):
    sns.countplot(data=df, x=fitur, hue='Sales Purpose', ax=ax)
    ax.set_title(f'Distribusi {fitur}')
plt.tight_layout()
plt.show()

In [None]:
# Pie chart of the 'Sales Purpose' class shares.
jumlah_per_kelas = df['Sales Purpose'].value_counts()

# Pull out only the first (largest) wedge. The tuple is sized from the data because
# matplotlib requires explode to have exactly one entry per wedge.
explode = tuple(0.1 if posisi == 0 else 0 for posisi in range(len(jumlah_per_kelas)))

plt.figure(figsize=(8, 6))
jumlah_per_kelas.plot(
    kind='pie',
    autopct='%1.1f%%',
    startangle=90,
    colors=['#007BFF', '#FFA500'],
    shadow=True,
    explode=explode,
)
plt.title('Distribusi Sales Purpose')
plt.ylabel('')  # hide the default y label (the Series name)
plt.show()

In [None]:
# Label-encode every categorical column listed below (the target 'Sales Purpose' included).
# Each column gets its own fitted LabelEncoder, stored in label_encoders, so the
# integer codes can later be mapped back with
# label_encoders[<column>].inverse_transform(...). A single shared encoder would only
# remember the classes of the last column it was fitted on.

kolom_kategorikal = ['Booking Status', 'Sales Event', 'Cluster', 'Zoning', 'View', 'Customer Type', 'Customer Marital', 'Customer Sex', 'Customer Grade', 'Sales Purpose', 'Status KPR']
label_encoders = {}
for kolom in kolom_kategorikal:
    # Columns that are absent from df are skipped silently.
    if kolom in df.columns:
        label_encoder = LabelEncoder()
        df[kolom] = label_encoder.fit_transform(df[kolom])
        label_encoders[kolom] = label_encoder


In [None]:
# Give the remaining address columns shorter names.
nama_kolom_baru = {
    'City (Cor)': 'City',
    'Address (ID)': 'Address',
    'Province (Cor)': 'Province',
}
df = df.rename(columns=nama_kolom_baru)

In [None]:
# Reorder the columns so that 'Age' sits immediately after 'Customer Birth Date'.
kolom = df.columns.tolist()
if all(nama in kolom for nama in ('Age', 'Customer Birth Date')):
    # Take 'Age' out first so the position computed below refers to the final layout.
    kolom.pop(kolom.index('Age'))
    idx = 1 + kolom.index('Customer Birth Date')
    kolom[idx:idx] = ['Age']
    df = df[kolom]


In [None]:
# Re-render the summarytools overview to review df in its current state.
dfSummary(df)

In [None]:
# Correlation heat map restricted to the int64 / float64 columns.
kolom_numerik = df.select_dtypes(include=['int64', 'float64']).columns
matriks_korelasi = df[kolom_numerik].corr()

fig, ax = plt.subplots(figsize=(18, 10))
sns.heatmap(matriks_korelasi, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Heatmap Correlation')
plt.show()

In [None]:
# Separate the model inputs (features) from the output (target).
# Target: the 'Sales Purpose' column.
# Features: the ten columns listed in `fitur` below. Each name appears exactly once,
# because a repeated name would put the same column into X twice and give that
# feature double weight in the models.
# X is the DataFrame holding the feature columns; y is the target Series.

fitur = ['Sales Event', 'Cluster', 'Zoning', 'View', 'Status KPR', 'Customer Type', 'Customer Sex', 'Customer Marital', 'Customer Grade', 'Year']  # features used for modelling
X = df[fitur]  # feature matrix
y = df['Sales Purpose']  # target vector


In [None]:
# Step 6: split into training and test sets (80% / 20%) BEFORE any resampling.
# Stratifying on y keeps the original class ratio in both partitions, so the test
# set contains only real observations.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 7: balance the classes with SMOTE on the training partition only.
# Resampling before the split would leak information: synthetic points interpolated
# from rows that end up in the test set would be used for training, and the test set
# would itself contain synthetic samples, inflating every evaluation metric.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train_raw, y_train_raw)

# Kept for backward compatibility: the balanced data is now the resampled training set.
X_balanced, y_balanced = X_train, y_train

In [None]:
# Summarise the train/test partitions: shapes, training class balance, feature names.
for keterangan, bagian in (
    ("Dimensi X_train:", X_train),
    ("Dimensi y_train:", y_train),
    ("Dimensi X_test:", X_test),
    ("Dimensi y_test:", y_test),
):
    print(keterangan, bagian.shape)

print("Distribusi kelas pada y_train:")
distribusi_kelas = y_train.value_counts()
print(distribusi_kelas)

print("Fitur yang digunakan:", X_train.columns)

In [None]:
# 4. Modelling

In [None]:
# Decision Tree: fit on the training set (fit returns the estimator itself),
# then predict the held-out test set.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

In [None]:
# Random Forest with 100 trees: fit on the training set (fit returns the estimator
# itself), then predict the held-out test set.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

In [None]:
# Gaussian Naive Bayes: fit on the training set (fit returns the estimator itself),
# then predict the held-out test set.
nb_model = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

In [None]:
# 5. Evaluation

In [None]:
# Helper that draws a confusion matrix as an annotated heat map.
def plot_confusion_matrix(y_true, y_pred, model_name):
    """Plot the confusion matrix of ``y_pred`` against ``y_true``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.
        model_name: Name appended to the figure title.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    matriks = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(8, 6))
    # fmt='d' prints the cell counts as plain integers.
    sns.heatmap(matriks, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name}')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    plt.show()

In [None]:
# Helper that draws a ROC curve together with its AUC.
def plot_roc_curve(y_true, y_pred_proba, model_name):
    """Plot the ROC curve for one model and report the area under it in the legend.

    Args:
        y_true: Ground-truth labels.
        y_pred_proba: Scores passed to ``roc_curve``; the caller in this notebook
            supplies column 1 of ``predict_proba``.
        model_name: Name appended to the figure title.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    # Dashed diagonal drawn as a visual reference line.
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'Receiver Operating Characteristic - {model_name}')
    ax.legend(loc="lower right")
    plt.show()


In [None]:
# Registry of the fitted models, keyed by display name, used by the evaluation loop.
models = dict([
    ('Decision Tree', dt_model),
    ('Random Forest', rf_model),
    ('Naive Bayes', nb_model),
])

In [None]:
# Evaluate every registered model on the test set: text metrics first, then plots.
for model_name in models:
    model = models[model_name]
    y_pred = model.predict(X_test)

    print(f"\nModel: {model_name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    plot_confusion_matrix(y_test, y_pred, model_name)

    # The ROC curve needs scores, so models without predict_proba are skipped here.
    # (A model exposing only decision_function, e.g. a linear SVM, would need those
    # scores passed to plot_roc_curve instead.)
    if not hasattr(model, "predict_proba"):
        continue
    # Column 1 of predict_proba is used as the score - presumably the positive
    # class of a binary target; verify against model.classes_.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    plot_roc_curve(y_test, y_pred_proba, model_name)

In [None]:
# Compute the metrics of all three models first, then print them model by model.

# Accuracy
accuracy_dt = accuracy_score(y_test, y_pred_dt)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_nb = accuracy_score(y_test, y_pred_nb)

# Per-class precision / recall / F1 reports
classification_report_dt = classification_report(y_test, y_pred_dt)
classification_report_rf = classification_report(y_test, y_pred_rf)
classification_report_nb = classification_report(y_test, y_pred_nb)

# Confusion matrices
confusion_matrix_dt = confusion_matrix(y_test, y_pred_dt)
confusion_matrix_rf = confusion_matrix(y_test, y_pred_rf)
confusion_matrix_nb = confusion_matrix(y_test, y_pred_nb)

# Decision Tree
print("Decision Tree Accuracy:", accuracy_dt)
print("Decision Tree Classification Report:\n", classification_report_dt)
print("Decision Tree Confusion Matrix:\n", confusion_matrix_dt)

# Random Forest
print("Random Forest Accuracy:", accuracy_rf)
print("Random Forest Classification Report:\n", classification_report_rf)
print("Random Forest Confusion Matrix:\n", confusion_matrix_rf)

# Naive Bayes
print("Naive Bayes Accuracy:", accuracy_nb)
print("Naive Bayes Classification Report:\n", classification_report_nb)
print("Naive Bayes Confusion Matrix:\n", confusion_matrix_nb)

In [None]:
# Collect every model's metrics in one dictionary so they can be compared side by side.
_metric_names = ('Accuracy', 'Classification Report', 'Confusion Matrix')
model_performance = {
    'Decision Tree': dict(zip(
        _metric_names,
        (accuracy_dt, classification_report_dt, confusion_matrix_dt),
    )),
    'Random Forest': dict(zip(
        _metric_names,
        (accuracy_rf, classification_report_rf, confusion_matrix_rf),
    )),
    'Naive Bayes': dict(zip(
        _metric_names,
        (accuracy_nb, classification_report_nb, confusion_matrix_nb),
    )),
}


# Print the collected metrics, one model after another.
for model_name in model_performance:
    performance = model_performance[model_name]
    print(f"\nModel: {model_name}")
    print(f"Accuracy: {performance['Accuracy']}")
    print("Classification Report:\n", performance['Classification Report'])
    print("Confusion Matrix:\n", performance['Confusion Matrix'])

In [None]:
# 6. Deployment

In [None]:
# Export the prepared data to Excel. to_excel() writes the file and returns None,
# so its result must not be assigned back to df (that would discard the DataFrame).
df.to_excel('simpan_data.xlsx')

In [None]:
import joblib

# Persist each fitted model to its own file in the current working directory.
joblib.dump(dt_model, 'decision_tree_model.pkl')
joblib.dump(rf_model, 'random_forest_model.pkl')
joblib.dump(nb_model, 'naive_bayes_model.pkl')

In [None]:
# Load the persisted models back from disk, in the same order they were saved.
# NOTE(review): joblib files are pickle-based - only load files from a trusted source.
loaded_dt_model, loaded_rf_model, loaded_nb_model = (
    joblib.load(nama_berkas)
    for nama_berkas in (
        'decision_tree_model.pkl',
        'random_forest_model.pkl',
        'naive_bayes_model.pkl',
    )
)

In [None]:
# New data to predict on (replace with real input); here the first test row is used.
# Selecting with a list (iloc[[0]]) returns a one-row DataFrame that keeps every
# column's original dtype. iloc[0].to_frame().T goes through a Series and forces
# all columns into one common dtype.
new_data = X_test.iloc[[0]]

# Predict with the models that were loaded back from disk.
prediction_dt = loaded_dt_model.predict(new_data)
prediction_rf = loaded_rf_model.predict(new_data)
prediction_nb = loaded_nb_model.predict(new_data)

# Show the predictions.
# NOTE(review): the target is the label-encoded 'Sales Purpose' column, so code 1 is
# one of its categories rather than a generic "positive" outcome - confirm which
# category the codes stand for and adjust the wording if needed.
print(f"Decision Tree Prediction: {'Positif' if prediction_dt[0] == 1 else 'Negatif'}")
print(f"Random Forest Prediction: {'Positif' if prediction_rf[0] == 1 else 'Negatif'}")
print(f"Naive Bayes Prediction: {'Positif' if prediction_nb[0] == 1 else 'Negatif'}")