In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import joblib

print("Semua library berhasil di-import!")

Semua library berhasil di-import!


In [2]:
# Ganti nama file ini
nama_file = 'spotify_churn_dataset.csv' # ATAU APAPUN NAMA FILENYA

try:
    df = pd.read_csv(nama_file)
    print("Dataset berhasil di-load!")
    print("\n5 baris pertama data:")
    print(df.head())
    
    print("\nInfo kolom:")
    df.info()
    
except FileNotFoundError:
    print(f"Error: File '{nama_file}' tidak ditemukan. Cek lagi nama filenya, bro.")

Dataset berhasil di-load!

5 baris pertama data:
  user_id subscription_type country  avg_daily_minutes  number_of_playlists  \
0  user_1           Premium      US              134.9                    4   
1  user_2           Premium      PK              165.7                    5   
2  user_3              Free      DE               45.9                    3   
3  user_4           Premium      PK              106.0                    0   
4  user_5           Premium      US               89.6                    5   

    top_genre  skips_per_day  support_tickets  days_since_last_login  churned  
0  Electronic              6                0                      1        0  
1         Pop              8                0                     12        0  
2   Classical              3                0                      3        0  
3        Jazz              7                0                      3        0  
4     Country              2                1                      6        

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   user_id                1000 non-null   object 
 1   subscription_type      1000 non-null   object 
 2   country                1000 non-null   object 
 3   avg_daily_minutes      1000 non-null   float64
 4   number_of_playlists    1000 non-null   int64  
 5   top_genre              1000 non-null   object 
 6   skips_per_day          1000 non-null   int64  
 7   support_tickets        1000 non-null   int64  
 8   days_since_last_login  1000 non-null   int64  
 9   churned                1000 non-null   int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 78.2+ KB


In [5]:
if 'user_id' in df.columns:
    df = df.drop('user_id', axis=1)

In [6]:
# Tentukan target kita
target = 'churned'

# (PENTING) Sesuaikan nama-nama kolom ini berdasarkan hasil df.info() kamu
# Kolom angka
kolom_numerik = ['number_of_playlists', 'skips_per_day', 'support_tickets','days_since_last_login', 'avg_daily_minutes']

# Kolom teks/kategori
kolom_kategorikal = ['country', 'subscription_type', 'top_genre']

print(f"Kolom Angka: {kolom_numerik}")
print(f"Kolom Kategori: {kolom_kategorikal}")
print(f"Target: {target}")

Kolom Angka: ['number_of_playlists', 'skips_per_day', 'support_tickets', 'days_since_last_login', 'avg_daily_minutes']
Kolom Kategori: ['country', 'subscription_type', 'top_genre']
Target: churned


In [7]:
# Bikin 'pipa' buat data angka: Isi data kosong (median), lalu standarisasi (scaling)
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Bikin 'pipa' buat data kategori: Ubah jadi angka (One-Hot Encoding)
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Gabungin kedua 'pipa' itu
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, kolom_numerik),
        ('cat', categorical_transformer, kolom_kategorikal)
    ])

print("Pabrik 'preprocessor' siap!")

Pabrik 'preprocessor' siap!


In [8]:
# Ini model kita
model_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Ini alur kerja lengkapnya:
# 1. Data masuk -> 2. Diproses di 'preprocessor' -> 3. Hasilnya diprediksi oleh 'model_xgb'
full_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', model_xgb)
                               ])

print("Pipeline lengkap (Pabrik + Model) siap!")

Pipeline lengkap (Pabrik + Model) siap!


In [9]:
# Pisahin data (X) dan target (y)
X = df.drop(target, axis=1)
y = df[target]

# Bagi data: 80% buat latihan, 20% buat tes
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Total data: {len(df)}")
print(f"Data latih (train): {len(X_train)}")
print(f"Data tes: {len(X_test)}")

print("\n--- MULAI TRAINING MODEL ---")
# Latih 'otak'-nya pakai data latih
full_pipeline.fit(X_train, y_train)
print("--- TRAINING SELESAI ---")

Total data: 1000
Data latih (train): 800
Data tes: 200

--- MULAI TRAINING MODEL ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- TRAINING SELESAI ---


In [10]:
# Tes pakai data tes (data yg belum pernah dia liat)
akurasi = full_pipeline.score(X_test, y_test)

print(f"Akurasi model di data tes: {akurasi * 100:.2f}%")

Akurasi model di data tes: 81.00%


In [11]:
from sklearn.metrics import classification_report

#prediksi di data tes
y_pred = full_pipeline.predict(X_test)

# Tampilkan laporan lengkap
print("\n--- LAPORAN KLASIFIKASI ---")
# 0 = Aman, 1 = Churn
print(classification_report(y_test, y_pred, target_names=['Aman (0)', 'Churn (1)']))


--- LAPORAN KLASIFIKASI ---
              precision    recall  f1-score   support

    Aman (0)       0.88      0.89      0.88       164
   Churn (1)       0.47      0.44      0.46        36

    accuracy                           0.81       200
   macro avg       0.68      0.67      0.67       200
weighted avg       0.81      0.81      0.81       200



In [14]:
# Simpan seluruh pipeline (Pabrik + Model)
nama_file_model = 'model_spotify_churn2.pkl'
joblib.dump(full_pipeline, nama_file_model)

print(f"\nModel lengkap (termasuk preprocessor) berhasil disimpan sebagai: {nama_file_model}")


Model lengkap (termasuk preprocessor) berhasil disimpan sebagai: model_spotify_churn2.pkl
