# Analisis Turnamen MLBB (2018–2025)

## CRISP-DM — Business Understanding

**Tujuan bisnis:** Mengelompokkan hero berdasarkan statistik turnamen (pick, win, ban) untuk mengidentifikasi kategori meta yang bermanfaat bagi analis esports dan tim: *Meta*, *Non-meta*, dan *Situational*.

**Kriteria keberhasilan:** Hasil clustering (k=3) harus dapat diinterpretasikan dengan jelas (melalui centroid & contoh hero), dievaluasi dengan metrik Inertia dan Silhouette Score, serta disimpan sebagai berkas output (CSV).

**Pemangku kepentingan:** Peneliti ML, analis esports, pelatih tim.

---

Notebook ini memuat tahapan:
1. Pemuatan data
2. Pemeriksaan & pembersihan
3. EDA (2 line plot, 2 box plot, 2 pie chart, scatter plot, correlation matrix)
4. Modelling: K-Means (k=3) dan visualisasi PCA
5. Evaluasi: Inertia, Silhouette Score, interpretasi cluster dalam Bahasa Indonesia
6. Menyimpan hasil (CSV)

Catatan: pastikan file CSV Anda bernama `mlbb_dataset_normalized.csv` dan berada di folder yang sama dengan notebook ini.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Apply seaborn's 'whitegrid' style to the plots drawn below.
sns.set(style='whitegrid')
# Default figure size for matplotlib figures that do not pass their own figsize.
plt.rcParams['figure.figsize'] = (10,6)


In [None]:
# Change the file name here if your dataset is stored under a different name.
FILENAME = 'mlbb_dataset_normalized.csv'

# Load the dataset. Only the read itself is guarded, so that a failure in the
# reporting code below is never misreported as a loading problem.
try:
    df = pd.read_csv(FILENAME)
except Exception as e:
    # Stop the run with a readable message; keep the original error chained
    # so the full traceback is still available for debugging.
    raise SystemExit(f"Gagal memuat {FILENAME}: {e}") from e
else:
    print('Berhasil memuat:', FILENAME)
    print('Jumlah baris:', len(df))

# Show a preview of the first rows (rendered as the cell output)
df.head()

In [None]:
# Basic inspection of the loaded dataframe
print('\nInformasi dataframe:')
df.info()

# Abort early when any numeric column required by the analysis is absent
numeric_cols = ['pick_total','pick_wins','pick_losses','ban_count','win_rate']
missing = [name for name in numeric_cols if name not in df.columns]
if missing:
    raise SystemExit(f"Kolom numerik yang diperlukan tidak ditemukan: {missing}")

# Count columns: coerce unparseable entries to NaN, replace them with 0 and
# store the result as integers
count_columns = ('pick_total', 'pick_wins', 'pick_losses', 'ban_count')
for column in count_columns:
    as_number = pd.to_numeric(df[column], errors='coerce')
    df[column] = as_number.fillna(0).astype(int)

# Win rate stays a float; unparseable or missing entries become 0.0
win_rate_number = pd.to_numeric(df['win_rate'], errors='coerce')
df['win_rate'] = win_rate_number.fillna(0.0)

# Use the first role column name that exists in the dataframe (None if none does)
role_candidates = ['Role_Normalized','Role_normalized','Role','role']
role_col = next((name for name in role_candidates if name in df.columns), None)

print('\nMenggunakan kolom role:', role_col)

# Fill empty role / lane cells with placeholder categories
if role_col is not None:
    df[role_col] = df[role_col].fillna('Other')
if 'Lane' in df.columns:
    df['Lane'] = df['Lane'].fillna('Unknown')

# Summary statistics of the numeric columns
print('\nRingkasan numerik:')
numeric_summary = df[numeric_cols].describe()
print(numeric_summary)


In [None]:
# Yearly totals of picks and bans, ordered by tournament year
year_pick = df.groupby('tournament_year')['pick_total'].sum().sort_index()
year_ban = df.groupby('tournament_year')['ban_count'].sum().sort_index()

# One entry per line plot: (series, extra line style, title, y label, printed note)
yearly_plots = [
    (
        year_pick,
        {},
        'Total pick_total per tahun',
        'Jumlah pick (total)',
        'Deskripsi: Grafik ini menunjukkan total pick yang tercatat tiap tahun. Lonjakan dapat menandakan lebih banyak pertandingan/turnamen pada tahun tersebut.',
    ),
    (
        year_ban,
        {'color': 'red'},
        'Total ban_count per tahun',
        'Jumlah ban (total)',
        'Deskripsi: Grafik ini menunjukkan aktivitas ban antar tahun.',
    ),
]

for series, line_style, title, y_label, note in yearly_plots:
    plt.figure(figsize=(10,4))
    plt.plot(series.index, series.values, marker='o', **line_style)
    plt.title(title)
    plt.xlabel('Tahun turnamen')
    plt.ylabel(y_label)
    plt.grid(True)
    plt.show()

    print(note)

In [None]:
# Box plots of pick_total and win_rate grouped by role.
# One entry per plot: (value column, title, y label, printed note, skip message)
role_boxplots = [
    (
        'pick_total',
        'Distribusi pick_total berdasarkan role (normalisasi)',
        'pick_total',
        'Deskripsi: Menunjukkan sebaran pick_total per role. Perhatikan outlier untuk role tertentu.',
        'Kolom role tidak tersedia, melewati boxplot role.',
    ),
    (
        'win_rate',
        'Distribusi win_rate berdasarkan role (normalisasi)',
        'win_rate (%)',
        'Deskripsi: Periksa apakah role tertentu memiliki win_rate lebih tinggi secara konsisten.',
        'Kolom role tidak tersedia, melewati boxplot win_rate.',
    ),
]

for value_col, title, y_label, note, skip_note in role_boxplots:
    # Without a role column there is nothing to group by
    if not role_col:
        print(skip_note)
        continue

    plt.figure(figsize=(12,5))
    # Boxes are ordered from the most to the least frequent role
    order = df[role_col].value_counts().index
    sns.boxplot(x=role_col, y=value_col, data=df, order=order)
    plt.title(title)
    plt.xlabel('Role')
    plt.ylabel(y_label)
    plt.xticks(rotation=45)
    plt.show()
    print(note)

In [None]:
# Pie chart 1: share of each role among the rows
if not role_col:
    print('Role column not found; skipping role pie chart.')
else:
    role_counts = df[role_col].value_counts()
    plt.figure(figsize=(6,6))
    role_counts.plot(kind='pie', autopct='%1.1f%%')
    plt.title('Distribusi Role')
    # Hide the automatic y label
    plt.ylabel('')
    plt.show()
    print('Deskripsi: Menampilkan proporsi tiap role pada dataset.')

# Pie chart 2: share of each lane among the rows
if 'Lane' not in df.columns:
    print('Kolom Lane tidak ditemukan; melewati pie chart lane.')
else:
    lane_counts = df['Lane'].value_counts()
    plt.figure(figsize=(6,6))
    lane_counts.plot(kind='pie', autopct='%1.1f%%')
    plt.title('Distribusi Lane')
    # Hide the automatic y label
    plt.ylabel('')
    plt.show()
    print('Deskripsi: Menampilkan proporsi lane (Gold/Exp/Mid/Roam dll).')

In [None]:
# Scatter plot of pick_total against win_rate; marker size and colour both
# follow ban_count
fig, ax = plt.subplots(figsize=(10,6))
# The +1 keeps rows with zero bans visible as small markers
sizes = 10 * (df['ban_count'].values + 1)
scatter = ax.scatter(
    df['pick_total'],
    df['win_rate'],
    s=sizes,
    alpha=0.7,
    c=df['ban_count'],
    cmap='viridis',
)
ax.set_xlabel('pick_total')
ax.set_ylabel('win_rate')
ax.set_title('pick_total vs win_rate (ukuran ~ ban_count)')
cbar = fig.colorbar(scatter, ax=ax)
cbar.set_label('ban_count')
ax.grid(True)
plt.show()

print('Deskripsi: Mark besar = hero yang sering diban. Perhatikan hero dengan pick tinggi dan win tinggi yang juga sering diban (meta-dominant).')

# Pairwise correlation between the numeric features, drawn as an annotated heatmap
numeric_cols = ['pick_total','pick_wins','pick_losses','ban_count','win_rate']
correlations = df[numeric_cols].corr()
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation matrix')
plt.show()

print('Deskripsi: Matriks korelasi membantu melihat hubungan antar fitur, mis. pick_total vs pick_wins sangat berkorelasi.')

In [None]:
# Modelling: standardise the numeric features, then cluster with K-Means
features = df[numeric_cols].copy()
scaler = StandardScaler().fit(features)
X_scaled = scaler.transform(features)

# Elbow curve for k = 2..7, shown for reference only; the final model below
# always uses k=3
K_range = range(2,8)
inertia = [
    KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(X_scaled).inertia_
    for n_clusters in K_range
]

fig, ax = plt.subplots(figsize=(8,4))
ax.plot(list(K_range), inertia, marker='o')
ax.set_xlabel('k')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method (lihat, tetapi kita paksa k=3)')
ax.grid(True)
plt.show()

# Final model: K-Means with k=3; each row receives its cluster index
k = 3
kmeans = KMeans(n_clusters=k, random_state=42, n_init=20).fit(X_scaled)
labels = kmeans.labels_
df['cluster'] = labels
print('Clustering selesai. Label cluster ditambahkan pada dataframe.')


In [None]:
# Project the scaled features onto two principal components for plotting
pca = PCA(n_components=2, random_state=42)
pca_vals = pca.fit_transform(X_scaled)
df['pca1'], df['pca2'] = pca_vals[:, 0], pca_vals[:, 1]

# Scatter of the two components, coloured by the K-Means cluster index
fig, ax = plt.subplots(figsize=(10,6))
sns.scatterplot(
    data=df,
    x='pca1',
    y='pca2',
    hue='cluster',
    palette='tab10',
    s=60,
    alpha=0.8,
    ax=ax,
)
ax.set_title('PCA: Visualisasi cluster (k=3)')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.legend(title='cluster')
plt.show()

# Placeholder; the readable cluster names are filled in after the centroid analysis
cluster_name_map = dict()


In [None]:
# Cluster centroids (k=3), mapped back from the scaled space to the original
# feature units
centroids_scaled = kmeans.cluster_centers_
centroids_orig = scaler.inverse_transform(centroids_scaled)
centroid_df = pd.DataFrame(centroids_orig, columns=numeric_cols)
centroid_df['cluster'] = range(centroid_df.shape[0])

# Thresholds for the heuristic below: the 33rd / 66th percentiles of the
# per-row data. They do not depend on the centroid being inspected, so they
# are computed once here rather than on every loop iteration.
pick_low = df['pick_total'].quantile(0.33)
pick_high = df['pick_total'].quantile(0.66)
ban_high = df['ban_count'].quantile(0.66)
win_low = df['win_rate'].quantile(0.33)
win_high = df['win_rate'].quantile(0.66)

# Build a heuristic description for every centroid. A centroid may match
# several rules; all matching fragments are joined with '; '.
descriptions = []
for pick, ban, win in zip(
    centroid_df['pick_total'], centroid_df['ban_count'], centroid_df['win_rate']
):
    desc = []
    if pick >= pick_high and ban >= ban_high:
        desc.append('Meta: pick tinggi & ban tinggi')
    if pick >= pick_high and win >= win_high:
        desc.append('Meta: pick tinggi & win tinggi')
    if pick <= pick_low and win >= win_high:
        desc.append('Situational: pick rendah tetapi win tinggi (sample kecil mungkin)')
    if pick <= pick_low and win <= win_low:
        desc.append('Non-meta: pick rendah & win rendah')
    if not desc:
        # No rule matched: the centroid sits in the middle of the distribution
        desc.append('Perilaku campuran / rata-rata')
    descriptions.append('; '.join(desc))
centroid_df['description'] = descriptions

# Column order: cluster id, numeric features, description
centroid_df = centroid_df[['cluster'] + numeric_cols + ['description']]
centroid_df


In [None]:
# Map each cluster index to a cluster name (Meta / Non-meta / Situational).
# The mapping is derived from keywords in the centroid descriptions computed above.

def map_cluster_name(centroid_df):
    """Map each cluster id to 'Meta', 'Situational' or 'Non-meta'.

    The label is picked from keywords found in the cluster's ``description``:

    * contains 'Situational' -> 'Situational' (wins over 'Meta' when both occur)
    * otherwise contains 'Meta' -> 'Meta' (case-sensitive, so the text
      'Non-meta' does not match)
    * anything else -> 'Non-meta'; this includes a missing or non-string
      description, which is treated as empty text instead of raising

    Args:
        centroid_df: DataFrame with a ``cluster`` column (integer-like ids)
            and a ``description`` column.

    Returns:
        dict mapping each ``int`` cluster id to its label, in row order.
        Several clusters may receive the same label.
    """
    mapping = {}
    for cluster_id, description in zip(centroid_df['cluster'], centroid_df['description']):
        # NaN / None descriptions (e.g. from a reloaded CSV) carry no keywords
        text = description if isinstance(description, str) else ''
        if 'Situational' in text:
            label = 'Situational'
        elif 'Meta' in text:
            label = 'Meta'
        else:
            label = 'Non-meta'
        mapping[int(cluster_id)] = label
    return mapping

# Build the index -> name mapping from the centroid table and report it
cluster_name_map = map_cluster_name(centroid_df)
print('Mapping cluster -> label:', cluster_name_map)

# Attach the readable label to every row, then preview the key columns
df['cluster_label'] = df['cluster'].map(cluster_name_map)
preview_cols = ['hero','pick_total','ban_count','win_rate','cluster','cluster_label']
df[preview_cols].head(10)


In [None]:
# Clustering evaluation: inertia of the fitted model, silhouette score on the
# scaled features, and the number of rows in each cluster
inertia_value = kmeans.inertia_
sil_score = silhouette_score(X_scaled, df['cluster'])
cluster_sizes = df['cluster'].value_counts().sort_index()

print('Inertia (SSE):', round(inertia_value,2))
print('Silhouette Score:', round(sil_score,4))
print('\nUkuran tiap cluster:')
print(cluster_sizes)

# Reading notes for the metrics, printed one per line
evaluation_notes = (
    '\nCatatan evaluasi:',
    '- Inertia lebih kecil berarti cluster lebih kompak; nilai absolut bergantung pada skala data.',
    '- Silhouette mendekati 1 berarti cluster terpisah baik; nilai di sekitar 0 menunjukkan overlap.',
    '- Periksa distribusi hero per cluster untuk memastikan cluster bermakna.',
)
for note in evaluation_notes:
    print(note)


In [None]:
# Show the centroid table together with its heuristic descriptions
print('\nCentroid cluster (nilai rata-rata fitur pada tiap cluster):')
display(centroid_df)

# For every cluster: its label, its centroid description and the ten rows
# with the highest pick_total
hero_columns = ['hero','tournament_year','pick_total','pick_wins','pick_losses','ban_count','win_rate']
for c in sorted(df['cluster'].unique()):
    label = cluster_name_map.get(c, f'Cluster {c}')
    print('\n---')
    print(f'Cluster {c} -> {label}')

    is_this_centroid = centroid_df['cluster'] == c
    centroid_description = centroid_df[is_this_centroid]['description'].values[0]
    print('Deskripsi centroid:', centroid_description)

    members = df[df['cluster'] == c]
    subset = members.sort_values('pick_total', ascending=False).head(10)
    print('Contoh hero teratas di cluster ini:')
    display(subset[hero_columns])


In [None]:
# Persist the results: the labelled dataframe and the centroid table,
# both written without the index column
outputs = (
    (df, 'mlbb_with_clusters.csv'),
    (centroid_df, 'cluster_centroids.csv'),
)
for frame, path in outputs:
    frame.to_csv(path, index=False)
print('Disimpan: mlbb_with_clusters.csv dan cluster_centroids.csv')

# Done
print('\nNotebook selesai. Ubah FILENAME jika perlu dan jalankan ulang sel jika dataset berbeda.')