In [1]:
import os
print("File ada?", os.path.exists('lead_data.csv'))  # Harus True

File ada? True


In [2]:
# Import libraries (install via pip jika perlu: scikit-learn, pandas, matplotlib)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import RFE  # Kayak di paper
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import matplotlib.pyplot as plt

# Step 1: Load Dataset KAGGLE (ADJUST KOLOM)
df = pd.read_csv('lead_data.csv')
print("Shape:", df.shape)
print("\nKolom:", df.columns.tolist())
print("\nFirst 5 rows:\n", df.head())

# Cek target column (biasanya 'Converted' atau 'Purchase')
print("\nTarget distribution:\n", df['Converted'].value_counts())  # Adjust nama kolom

# Step 2: Preprocessing
# Encode kategorikal
le = LabelEncoder()
df['lokasi_preferensi'] = le.fit_transform(df['lokasi_preferensi'])

# Pisah fitur dan target
X = df[['budget', 'lokasi_preferensi', 'kunjungan_situs', 'inquiry_count']]
y = df['converted']

# Scale numerik (opsional, RF robust tapi bagus buat consistency)
scaler = StandardScaler()
X[['budget', 'kunjungan_situs', 'inquiry_count']] = scaler.fit_transform(X[['budget', 'kunjungan_situs', 'inquiry_count']])

# Feature Selection pake RFE (seperti paper, pilih top 3 fitur)
estimator = RandomForestClassifier(n_estimators=100, random_state=42)
rfe = RFE(estimator, n_features_to_select=3)
X_rfe = rfe.fit_transform(X, y)
selected_features = X.columns[rfe.support_]
print(f"Fitur terpilih: {selected_features}")

# Step 3: Split Data (80/20)
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.2, random_state=42, stratify=y)

# Step 4: Train Model Random Forest
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)  # Hyperparam sederhana
rf_model.fit(X_train, y_train)

# Cross-validation (10-fold kayak paper)
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=10, scoring='accuracy')
print(f"CV Accuracy: {cv_scores.mean():.2f} (+/- {cv_scores.std() * 2:.2f})")

# Step 5: Prediksi dan Evaluasi
y_pred = rf_model.predict(X_test)
y_prob = rf_model.predict_proba(X_test)[:, 1]  # Probabilitas untuk scoring

print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print(f"ROC-AUC: {roc_auc_score(y_test, y_prob):.2f}")

# Feature Importance (visualisasi kayak tutorial Hex)
importances = rf_model.feature_importances_
plt.bar(selected_features, importances)
plt.title('Feature Importance')
plt.xticks(rotation=45)
plt.show()

# Contoh Scoring: Lead baru [budget=500000000, lokasi=1 (Jakarta), kunjungan=5, inquiry=3]
new_lead = np.array([[500000000, 1, 5, 3]])  # Scale jika perlu
new_lead_scaled = scaler.transform(new_lead)  # Adjust sesuai RFE
score = rf_model.predict_proba(new_lead_scaled)[:, 1][0]
print(f"Lead Score: {score:.2f} (Threshold >0.2 = Hot Lead)")

         Lead Number    Converted  TotalVisits  Total Time Spent on Website  \
count    9240.000000  9240.000000  9103.000000                  9240.000000   
mean   617188.435606     0.385390     3.445238                   487.698268   
std     23405.995698     0.486714     4.854853                   548.021466   
min    579533.000000     0.000000     0.000000                     0.000000   
25%    596484.500000     0.000000     1.000000                    12.000000   
50%    615479.000000     0.000000     3.000000                   248.000000   
75%    637387.250000     1.000000     5.000000                   936.000000   
max    660737.000000     1.000000   251.000000                  2272.000000   

       Page Views Per Visit  Asymmetrique Activity Score  \
count           9103.000000                  5022.000000   
mean               2.362820                    14.306252   
std                2.161418                     1.386694   
min                0.000000                     

KeyError: 'converted'