In [None]:
!pip install optuna
!apt-get update -qq
!apt-get install fonts-nanum -qq
!fc-cache -fv
!rm ~/.cache/matplotlib -rf
!pip install minisom
!pip install catboost
!pip install xgboost lightgbm catboost minisom tensorflow
!pip install optuna minisom catboost

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.font_manager as fm


# Point matplotlib at the NanumGothic font installed by the setup commands so
# Korean titles/labels render with real glyphs.
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
fontprop = fm.FontProperties(fname=font_path, size=10)
try:
    # Removing ~/.cache/matplotlib does not refresh the font manager that is
    # already loaded in this process; registering the file explicitly makes
    # the family name resolvable without restarting the runtime.
    fm.fontManager.addfont(font_path)
except (OSError, RuntimeError) as err:
    # Font file missing or unreadable: keep going, matplotlib will fall back
    # to its default font (Korean glyphs may then be missing).
    print(f"Could not register font {font_path}: {err}")
plt.rcParams['font.family'] = 'NanumGothic'
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
/usr/share/fonts: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 3 dirs
/usr/share/fonts/truetype/humor-sans: caching, new cache contents: 1 fonts, 0 dirs
/usr/share/fonts/truetype/liberation: caching, new cache contents: 16 fonts, 0 dirs
/usr/share/fonts/truetype/nanum: caching, new cache contents: 12 fonts, 0 dirs
/usr/local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs
/root/.local/share/fonts: skipping, no such directory
/root/.fonts: skipping, no such directory
/usr/share/fonts/truetype: skipping, looped directory detected
/usr/share/fonts/truetype/humor-sans: skipping, looped directory detected
/usr/share/fonts/truetype/liberation: skipping, looped directory detected
/usr/share/fonts/truetype/nanum: skipping, looped dire

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import optuna
import warnings
warnings.filterwarnings("ignore")

# Load the churn dataset and drop the MemberID column.
df = pd.read_csv('/content/Political_Party_Churn_Data.csv')
df = df.drop(['MemberID'], axis=1)
# Integer-encode every object-typed column with its own LabelEncoder.
for col in df.select_dtypes(include='object'):
    df[col] = LabelEncoder().fit_transform(df[col])
X = df.drop('Churn', axis=1)
y = df['Churn']

# Split BEFORE scaling: fitting MinMaxScaler on all rows lets the test set's
# min/max leak into the training features. The same random_state/stratify
# arguments select the same rows as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so any later cell that still reads X_scaled keeps working.
X_scaled = scaler.transform(X)
X_train_arr, X_test_arr = X_train, X_test
# Model name -> best accuracy found by its Optuna study.
results = {}

from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from scipy.stats import mode
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, SimpleRNN, Reshape
from minisom import MiniSom

def cluster_to_label(y_true, cluster_pred):
    """Assign every sample the majority true label of its cluster.

    Args:
        y_true: 1-D array-like of ground-truth labels.
        cluster_pred: 1-D array-like of cluster ids aligned with ``y_true``.

    Returns:
        Array with the shape and dtype of ``cluster_pred`` in which each
        sample holds the most frequent ``y_true`` value of its cluster
        (ties resolve to the smallest label).
    """
    y_true = np.asarray(y_true)
    cluster_pred = np.asarray(cluster_pred)
    labels = np.zeros_like(cluster_pred)
    # Iterate over the ids that actually occur. Counting them and looping over
    # range(count) skipped any id >= count whenever the ids were not exactly
    # 0..k-1 (e.g. a cluster that receives no sample), leaving label 0 there.
    for cluster_id in np.unique(cluster_pred):
        mask = cluster_pred == cluster_id
        values, counts = np.unique(y_true[mask], return_counts=True)
        # np.unique returns sorted values, so argmax picks the smallest mode.
        labels[mask] = values[np.argmax(counts)]
    return labels

# ---- 1. Linear Models ----
def objective_logreg(trial):
    """Optuna objective: accuracy on the test split of one LogisticRegression trial.

    Samples ``C`` (log-uniform, 0.001..10) and ``solver`` (lbfgs or liblinear).
    NOTE(review): the score is computed on ``y_test``, so the study's best
    value is selected on the same data it reports.
    """
    search = {
        'C': trial.suggest_float('C', 0.001, 10, log=True),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear']),
    }
    clf = LogisticRegression(max_iter=500, **search)
    clf.fit(X_train_arr, y_train)
    predictions = clf.predict(X_test_arr)
    return accuracy_score(y_test, predictions)


# Run a short 3-trial study and record the best accuracy.
study_logreg = optuna.create_study(direction='maximize')
study_logreg.optimize(objective_logreg, n_trials=3)
results['LogisticRegression'] = study_logreg.best_value

def objective_ridge(trial):
    """Optuna objective: test-split accuracy of a RidgeClassifier for a sampled ``alpha``."""
    # alpha is drawn log-uniformly from 0.01..10.
    strength = trial.suggest_float('alpha', 0.01, 10, log=True)
    clf = RidgeClassifier(alpha=strength)
    clf.fit(X_train_arr, y_train)
    predictions = clf.predict(X_test_arr)
    return accuracy_score(y_test, predictions)


# 3-trial study; keep the best accuracy for the summary chart.
study_ridge = optuna.create_study(direction='maximize')
study_ridge.optimize(objective_ridge, n_trials=3)
results['RidgeClassifier'] = study_ridge.best_value

def objective_sgd(trial):
    """Optuna objective: test-split accuracy of one SGDClassifier trial.

    Samples ``alpha`` (log-uniform, 1e-5..1e-1) and ``loss``
    (``hinge`` or ``log_loss``).
    """
    alpha = trial.suggest_float('alpha', 1e-5, 1e-1, log=True)
    loss = trial.suggest_categorical('loss', ['hinge', 'log_loss'])
    # Fixed seed: SGD shuffles the training data, so an unseeded model can
    # score differently for identical hyper-parameters and the study would
    # end up ranking seed noise. 42 matches the seed used by the tree models.
    model = SGDClassifier(max_iter=1000, tol=1e-3, alpha=alpha, loss=loss,
                          random_state=42)
    model.fit(X_train_arr, y_train)
    return accuracy_score(y_test, model.predict(X_test_arr))


# 3-trial study; keep the best accuracy for the summary chart.
study_sgd = optuna.create_study(direction='maximize')
study_sgd.optimize(objective_sgd, n_trials=3)
results['SGDClassifier'] = study_sgd.best_value

def objective_perceptron(trial):
    """Optuna objective: test-split accuracy of a Perceptron for a sampled ``eta0``."""
    # eta0 is drawn log-uniformly from 1e-4..1.0.
    step = trial.suggest_float('eta0', 1e-4, 1.0, log=True)
    clf = Perceptron(max_iter=1000, eta0=step)
    clf.fit(X_train_arr, y_train)
    predictions = clf.predict(X_test_arr)
    return accuracy_score(y_test, predictions)


# 3-trial study; keep the best accuracy for the summary chart.
study_perceptron = optuna.create_study(direction='maximize')
study_perceptron.optimize(objective_perceptron, n_trials=3)
results['Perceptron'] = study_perceptron.best_value

# ---- 2. Decision Tree ----
def objective_tree(trial):
    """Optuna objective: test-split accuracy of one DecisionTreeClassifier trial.

    Samples max_depth (2..16), min_samples_split (2..10),
    min_samples_leaf (1..8) and the split criterion (gini or entropy).
    """
    depth = trial.suggest_int('max_depth', 2, 16)
    split_min = trial.suggest_int('min_samples_split', 2, 10)
    leaf_min = trial.suggest_int('min_samples_leaf', 1, 8)
    split_rule = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        criterion=split_rule,
        random_state=42,
    )
    tree.fit(X_train_arr, y_train)
    predictions = tree.predict(X_test_arr)
    return accuracy_score(y_test, predictions)


# 3-trial study; keep the best accuracy for the summary chart.
study_tree = optuna.create_study(direction="maximize")
study_tree.optimize(objective_tree, n_trials=3)
results['DecisionTree'] = study_tree.best_value

# ---- 3. Ensemble ----
def objective_rf(trial):
    """Optuna objective: test-split accuracy of one RandomForestClassifier trial.

    Samples n_estimators (50..100), max_depth (2..8),
    min_samples_split (2..6) and min_samples_leaf (1..4).
    """
    n_trees = trial.suggest_int('n_estimators', 50, 100)
    depth = trial.suggest_int('max_depth', 2, 8)
    split_min = trial.suggest_int('min_samples_split', 2, 6)
    leaf_min = trial.suggest_int('min_samples_leaf', 1, 4)
    forest = RandomForestClassifier(
        n_estimators=n_trees,
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=42,
    )
    forest.fit(X_train_arr, y_train)
    return accuracy_score(y_test, forest.predict(X_test_arr))


# 3-trial study; keep the best accuracy for the summary chart.
study_rf = optuna.create_study(direction="maximize")
study_rf.optimize(objective_rf, n_trials=3)
results['RandomForest'] = study_rf.best_value

def objective_gb(trial):
    """Optuna objective: test-split accuracy of one GradientBoostingClassifier trial.

    Samples n_estimators (50..100), max_depth (2..8) and
    learning_rate (0.01..0.2, uniform).
    """
    n_stages = trial.suggest_int('n_estimators', 50, 100)
    depth = trial.suggest_int('max_depth', 2, 8)
    shrinkage = trial.suggest_float('learning_rate', 0.01, 0.2)
    booster = GradientBoostingClassifier(
        n_estimators=n_stages, max_depth=depth, learning_rate=shrinkage)
    booster.fit(X_train_arr, y_train)
    predictions = booster.predict(X_test_arr)
    return accuracy_score(y_test, predictions)


# 3-trial study; keep the best accuracy for the summary chart.
study_gb = optuna.create_study(direction='maximize')
study_gb.optimize(objective_gb, n_trials=3)
results['GradientBoosting'] = study_gb.best_value

def objective_ada(trial):
    """Optuna objective: test-split accuracy of one AdaBoostClassifier trial.

    Samples n_estimators (50..100) and learning_rate (0.01..1.0, uniform).
    """
    n_rounds = trial.suggest_int('n_estimators', 50, 100)
    shrinkage = trial.suggest_float('learning_rate', 0.01, 1.0)
    booster = AdaBoostClassifier(n_estimators=n_rounds, learning_rate=shrinkage)
    booster.fit(X_train_arr, y_train)
    predictions = booster.predict(X_test_arr)
    return accuracy_score(y_test, predictions)


# 3-trial study; keep the best accuracy for the summary chart.
study_ada = optuna.create_study(direction='maximize')
study_ada.optimize(objective_ada, n_trials=3)
results['AdaBoost'] = study_ada.best_value

# ---- 4. SVC ----
def objective_svc(trial):
    """Optuna objective: test-split accuracy of one SVC trial.

    Samples ``C`` (log-uniform, 0.01..10) and ``kernel``
    (linear, rbf or poly).
    """
    c = trial.suggest_float('C', 0.01, 10, log=True)
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly'])
    # probability=True is deliberately not set here: it adds an internal
    # cross-validated calibration fit on every trial, while this objective
    # only calls predict(), which does not use the calibrated probabilities.
    model = SVC(C=c, kernel=kernel)
    model.fit(X_train_arr, y_train)
    return accuracy_score(y_test, model.predict(X_test_arr))


# 3-trial study; keep the best accuracy for the summary chart.
study_svc = optuna.create_study(direction='maximize')
study_svc.optimize(objective_svc, n_trials=3)
results['SVC'] = study_svc.best_value

# ---- 5. KMeans ----
def objective_kmeans(trial):
    """Optuna objective: accuracy of KMeans clusters mapped to majority labels.

    Samples ``n_clusters`` (2..4), fits KMeans on the training features and
    assigns test samples to clusters.
    NOTE(review): the cluster -> label mapping is derived from ``y_test``
    itself via ``cluster_to_label``, so this score behaves like a cluster
    purity measure rather than a held-out accuracy.
    """
    k = trial.suggest_int('n_clusters', 2, 4)
    clusterer = KMeans(n_clusters=k, random_state=42)
    clusterer.fit(X_train_arr)
    test_clusters = clusterer.predict(X_test_arr)
    mapped = cluster_to_label(y_test.values, test_clusters)
    return accuracy_score(y_test, mapped)


# 3-trial study; keep the best accuracy for the summary chart.
study_kmeans = optuna.create_study(direction='maximize')
study_kmeans.optimize(objective_kmeans, n_trials=3)
results['KMeans'] = study_kmeans.best_value

# ---- 6. Boosting ----
def objective_xgb(trial):
    """Optuna objective: test-split accuracy of one XGBClassifier trial.

    Samples n_estimators (50..100), max_depth (2..8), learning_rate
    (0.01..0.2), subsample (0.6..1.0) and colsample_bytree (0.6..1.0).
    """
    tuned = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 100),
        max_depth=trial.suggest_int('max_depth', 2, 8),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.2),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
    )
    # Fixed (non-tuned) settings are passed separately from the sampled ones.
    booster = XGBClassifier(
        use_label_encoder=False, eval_metric='logloss', verbosity=0, **tuned)
    booster.fit(X_train_arr, y_train)
    return accuracy_score(y_test, booster.predict(X_test_arr))


# 3-trial study; keep the best accuracy for the summary chart.
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=3)
results['XGBoost'] = study_xgb.best_value

def objective_lgbm(trial):
    """Optuna objective: test-split accuracy of one LGBMClassifier trial.

    Samples n_estimators (50..100), max_depth (2..8), learning_rate
    (0.01..0.2) and num_leaves (10..40).
    """
    n_trees = trial.suggest_int('n_estimators', 50, 100)
    depth = trial.suggest_int('max_depth', 2, 8)
    shrinkage = trial.suggest_float('learning_rate', 0.01, 0.2)
    leaves = trial.suggest_int('num_leaves', 10, 40)
    booster = LGBMClassifier(
        n_estimators=n_trees,
        max_depth=depth,
        learning_rate=shrinkage,
        num_leaves=leaves,
    )
    booster.fit(X_train_arr, y_train)
    return accuracy_score(y_test, booster.predict(X_test_arr))


# 3-trial study; keep the best accuracy for the summary chart.
study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lgbm, n_trials=3)
results['LightGBM'] = study_lgbm.best_value

def objective_cat(trial):
    """Optuna objective: test-split accuracy of one CatBoostClassifier trial.

    Samples iterations (50..100), depth (2..8) and learning_rate (0.01..0.2).
    """
    n_iter = trial.suggest_int('iterations', 50, 100)
    tree_depth = trial.suggest_int('depth', 2, 8)
    shrinkage = trial.suggest_float('learning_rate', 0.01, 0.2)
    booster = CatBoostClassifier(
        iterations=n_iter, depth=tree_depth, learning_rate=shrinkage, verbose=0)
    booster.fit(X_train_arr, y_train)
    predictions = booster.predict(X_test_arr)
    return accuracy_score(y_test, predictions)


# 3-trial study; keep the best accuracy for the summary chart.
study_cat = optuna.create_study(direction='maximize')
study_cat.optimize(objective_cat, n_trials=3)
results['CatBoost'] = study_cat.best_value

# ---- 7. sklearn MLP ----
def objective_mlp(trial):
    """Optuna objective: test-split accuracy of a two-hidden-layer MLPClassifier.

    Samples the width of each hidden layer (``n_units_0``, ``n_units_1``,
    32..64 units each).
    """
    hidden_layer_sizes = tuple(
        trial.suggest_int(f'n_units_{i}', 32, 64) for i in range(2))
    # Fixed seed: the weights are randomly initialised, so without it the
    # study would compare initialisation noise rather than layer sizes.
    mlp = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, max_iter=200,
                        random_state=42)
    mlp.fit(X_train_arr, y_train)
    pred = mlp.predict(X_test_arr)
    return accuracy_score(y_test, pred)


# 3-trial study; keep the best accuracy for the summary chart.
study_mlp = optuna.create_study(direction='maximize')
study_mlp.optimize(objective_mlp, n_trials=3)
results['MLP_sklearn'] = study_mlp.best_value

# ---- 8. Keras MLP, LSTM, RNN ----
def objective_keras(trial, model_type):
    """Optuna objective: test-split accuracy of a small Keras binary classifier.

    Args:
        trial: Optuna trial used to sample ``n1`` (32..64), ``n2`` (16..32)
            and ``lr`` (log-uniform, 1e-4..1e-2).
        model_type: ``'MLP'`` (Dense n1 -> Dense n2 -> sigmoid), ``'LSTM'``
            or ``'RNN'`` (features reshaped to (n_features, 1), one recurrent
            layer of n1 units -> sigmoid).

    Returns:
        Accuracy of the 0.5-thresholded predictions on the test split.

    Raises:
        ValueError: if ``model_type`` is not one of the three supported names.
    """
    # Fail fast with a clear message; previously an unknown name built a
    # model with no layers and failed later inside Keras.
    if model_type not in ('MLP', 'LSTM', 'RNN'):
        raise ValueError(f"Unknown model_type: {model_type!r}")
    tf.keras.backend.clear_session()
    model = Sequential()
    n1 = trial.suggest_int('n1', 32, 64)
    n2 = trial.suggest_int('n2', 16, 32)
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    n_features = X_train_arr.shape[1]
    if model_type == 'MLP':
        model.add(Dense(n1, activation='relu', input_shape=(n_features,)))
        model.add(Dense(n2, activation='relu'))
    else:
        # Recurrent variants treat each feature as one time step of size 1.
        recurrent_layer = LSTM if model_type == 'LSTM' else SimpleRNN
        model.add(Reshape((n_features, 1), input_shape=(n_features,)))
        model.add(recurrent_layer(n1))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr), loss='binary_crossentropy')
    model.fit(X_train_arr, y_train, epochs=3, batch_size=32, verbose=0)
    # verbose=0 keeps the per-trial progress bars out of the notebook output.
    pred = model.predict(X_test_arr, verbose=0)
    pred_label = (pred.flatten() > 0.5).astype(int)
    return accuracy_score(y_test, pred_label)


# One 3-trial study per architecture; keep each best accuracy.
study_keras_mlp = optuna.create_study(direction='maximize')
study_keras_mlp.optimize(lambda trial: objective_keras(trial, 'MLP'), n_trials=3)
results['MLP_Keras'] = study_keras_mlp.best_value

study_keras_lstm = optuna.create_study(direction='maximize')
study_keras_lstm.optimize(lambda trial: objective_keras(trial, 'LSTM'), n_trials=3)
results['LSTM_Keras'] = study_keras_lstm.best_value

study_keras_rnn = optuna.create_study(direction='maximize')
study_keras_rnn.optimize(lambda trial: objective_keras(trial, 'RNN'), n_trials=3)
results['RNN_Keras'] = study_keras_rnn.best_value

# ---- 9. MiniSom ----
def objective_som(trial):
    """Optuna objective: accuracy of MiniSom neurons mapped to majority labels.

    Samples the grid size ``x`` (1..2) by ``y`` (2..3), ``sigma`` (0.1..1.0)
    and ``learning_rate`` (0.01..1.0), trains the map for 200 iterations on
    the training features and treats each winning neuron as a cluster.
    NOTE(review): the neuron -> label mapping is derived from ``y_test``
    itself via ``cluster_to_label``.
    """
    x = trial.suggest_int('x', 1, 2)
    y = trial.suggest_int('y', 2, 3)
    sigma = trial.suggest_float('sigma', 0.1, 1.0)
    lr = trial.suggest_float('learning_rate', 0.01, 1.0)
    som = MiniSom(x=x, y=y, input_len=X_train_arr.shape[1], sigma=sigma, learning_rate=lr, random_seed=42)
    som.train(X_train_arr, 200)

    def som_predict(som, X):
        """Return one flat neuron id per sample."""
        # Combine both grid coordinates. Using only win[1] merged neurons
        # from different rows into one cluster whenever x == 2.
        winners = (som.winner(sample) for sample in X)
        return np.array([int(win[0]) * y + int(win[1]) for win in winners], dtype=int)

    raw_ids = som_predict(som, X_test_arr)
    # Re-index to 0..k-1 so the label-mapping helper sees contiguous ids even
    # when some neurons never win a test sample.
    _, som_pred = np.unique(raw_ids, return_inverse=True)
    som_pred_labels = cluster_to_label(y_test.values, som_pred)
    return accuracy_score(y_test, som_pred_labels)


# 3-trial study; keep the best accuracy for the summary chart.
study_som = optuna.create_study(direction='maximize')
study_som.optimize(objective_som, n_trials=3)
results['MiniSom'] = study_som.best_value

# --- Accuracy visualization ---
# Horizontal bar chart of each model's best accuracy, sorted ascending so the
# strongest model ends up at the top.
sorted_results = dict(sorted(results.items(), key=lambda item: item[1]))
model_names = list(sorted_results)
accuracies = list(sorted_results.values())

plt.figure(figsize=(11, 9))
plt.barh(model_names, accuracies, color='skyblue')
plt.title('Optuna 튜닝 모델별 Churn 예측 정확도', fontsize=16)
plt.xlabel('accuracy_score', fontsize=14)
plt.xlim(0, 1)
# Print the numeric accuracy just to the right of each bar.
for i, (name, acc) in enumerate(zip(model_names, accuracies)):
    plt.text(acc + 0.01, i, f'{acc:.3f}', va='center', fontsize=12)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Model name -> ROC AUC, filled in as each curve is drawn below.
roc_auc_dict = {}
# Single shared figure onto which every model's ROC curve is plotted.
plt.figure(figsize=(12, 12))

def get_proba_or_score(model, X):
    """Return the best available continuous score of ``model`` for ``X``.

    Preference order: column 1 of ``predict_proba``, then
    ``decision_function``, and finally the plain ``predict`` output when the
    model exposes neither.
    """
    if hasattr(model, "predict_proba"):
        class_probabilities = model.predict_proba(X)
        return class_probabilities[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(X)
    return model.predict(X)

# Rebuild one estimator per study from its best trial's hyper-parameters.
# Fixed (non-tuned) arguments repeat the values used while tuning.
model_dict = {
    "LogisticRegression_Optuna": LogisticRegression(**study_logreg.best_trial.params, max_iter=500),
    "RidgeClassifier_Optuna": RidgeClassifier(**study_ridge.best_trial.params),
    # max_iter/tol are spelled out to mirror the tuning objective, and the
    # model is seeded because SGD shuffles the data: unseeded, the ROC curve
    # changes from run to run.
    "SGDClassifier_Optuna": SGDClassifier(**study_sgd.best_trial.params, max_iter=1000, tol=1e-3,
                                          random_state=42),
    "Perceptron_Optuna": Perceptron(**study_perceptron.best_trial.params, max_iter=1000),
    "DecisionTree_Optuna": DecisionTreeClassifier(**study_tree.best_trial.params, random_state=42),
    "RandomForest_Optuna": RandomForestClassifier(**study_rf.best_trial.params, random_state=42),
    "GradientBoosting_Optuna": GradientBoostingClassifier(**study_gb.best_trial.params),
    "AdaBoost_Optuna": AdaBoostClassifier(**study_ada.best_trial.params),
    # probability=True is needed here so predict_proba is available for ROC.
    "SVC_Optuna": SVC(**study_svc.best_trial.params, probability=True),
    "XGBoost_Optuna": XGBClassifier(**study_xgb.best_trial.params, use_label_encoder=False, eval_metric='logloss', verbosity=0),
    "LightGBM_Optuna": LGBMClassifier(**study_lgbm.best_trial.params),
    "CatBoost_Optuna": CatBoostClassifier(**study_cat.best_trial.params, verbose=0),
}

# Refit every tuned estimator and draw its ROC curve on the shared figure.
for name, model in model_dict.items():
    # Fitting happens outside the try block, so a training failure still
    # propagates; only scoring/plotting errors are reported and skipped.
    model.fit(X_train_arr, y_train)
    try:
        y_score = get_proba_or_score(model, X_test_arr)
        auc = roc_auc_score(y_test, y_score)
        fpr, tpr, _ = roc_curve(y_test, y_score)
        # Legend text: strip the "_Optuna" marker and turn "_" into spaces.
        clean_label = name.replace("_Optuna", "").replace("_", " ")
        legend_text = f"{clean_label} (AUC={auc:.3f})"
        plt.plot(fpr, tpr, lw=2, label=legend_text)
        roc_auc_dict[name] = auc
    except Exception as e:
        print(f"ROC Error in {name}: {e}")

# KMeans: refit with the tuned cluster count and plot the ROC of the
# majority-label predictions (hard 0/1 labels, not continuous scores).
try:
    n_clusters = study_kmeans.best_trial.params['n_clusters']
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X_train_arr)
    cluster_pred = kmeans.predict(X_test_arr)
    # Cluster ids are translated to class labels using the test labels.
    kmeans_label = cluster_to_label(y_test.values, cluster_pred)
    fpr, tpr, _ = roc_curve(y_test, kmeans_label)
    auc = roc_auc_score(y_test, kmeans_label)
    roc_auc_dict["KMeans_Optuna"] = auc
    plt.plot(fpr, tpr, lw=2, label=f"KMeans (AUC={auc:.3f})")
except Exception as e:
    print("KMeans ROC Error:", e)

# MiniSom: retrain with the tuned grid and plot the ROC of the
# majority-label predictions (hard labels, not continuous scores).
try:
    som_params = study_som.best_trial.params
    som = MiniSom(x=som_params['x'], y=som_params['y'], input_len=X_train_arr.shape[1],
                  sigma=som_params['sigma'], learning_rate=som_params['learning_rate'], random_seed=42)
    som.train(X_train_arr, 200)

    def som_predict(som, X):
        """Return one flat neuron id per sample."""
        # Combine both grid coordinates. Using only win[1] merged neurons
        # from different rows into one cluster whenever the grid has x == 2.
        n_cols = som_params['y']
        winners = (som.winner(sample) for sample in X)
        return np.array([int(win[0]) * n_cols + int(win[1]) for win in winners], dtype=int)

    # Re-index to 0..k-1 so the label-mapping helper sees contiguous ids even
    # when some neurons never win a test sample.
    _, som_pred = np.unique(som_predict(som, X_test_arr), return_inverse=True)
    som_label = cluster_to_label(y_test.values, som_pred)
    auc = roc_auc_score(y_test, som_label)
    fpr, tpr, _ = roc_curve(y_test, som_label)
    plt.plot(fpr, tpr, lw=2, label=f"MiniSom (AUC={auc:.3f})")
    roc_auc_dict["MiniSom_Optuna"] = auc
except Exception as e:
    print("MiniSom ROC Error:", e)

# Keras deep-learning models (MLP, LSTM, RNN)
def fit_and_roc_keras(model_type, params, label):
    """Train a tuned Keras model and add its ROC curve to the current figure.

    Args:
        model_type: ``'MLP'``, ``'LSTM'`` or ``'RNN'``.
        params: tuned hyper-parameters; missing keys fall back to
            ``n1=64``, ``n2=32``, ``lr=1e-3``.
        label: key stored in ``roc_auc_dict``; underscores become spaces in
            the legend entry.

    Raises:
        ValueError: if ``model_type`` is not one of the three supported names.
    """
    # Fail fast with a clear message; previously an unknown name built a
    # model with no layers and failed later inside Keras.
    if model_type not in ('MLP', 'LSTM', 'RNN'):
        raise ValueError(f"Unknown model_type: {model_type!r}")
    tf.keras.backend.clear_session()
    model = Sequential()
    n1 = params.get('n1', 64)
    n2 = params.get('n2', 32)
    lr = params.get('lr', 1e-3)
    n_features = X_train_arr.shape[1]
    if model_type == 'MLP':
        model.add(Dense(n1, activation='relu', input_shape=(n_features,)))
        model.add(Dense(n2, activation='relu'))
    else:
        # Recurrent variants treat each feature as one time step of size 1.
        recurrent_layer = LSTM if model_type == 'LSTM' else SimpleRNN
        model.add(Reshape((n_features, 1), input_shape=(n_features,)))
        model.add(recurrent_layer(n1))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr), loss='binary_crossentropy')
    model.fit(X_train_arr, y_train, epochs=3, batch_size=32, verbose=0)
    # The model outputs shape (n_samples, 1); flatten it to the 1-D score
    # vector the metric functions expect, and keep the progress bar quiet.
    y_score = model.predict(X_test_arr, verbose=0).ravel()
    auc = roc_auc_score(y_test, y_score)
    fpr, tpr, _ = roc_curve(y_test, y_score)
    label_clean = label.replace("_", " ")
    plt.plot(fpr, tpr, lw=2, label=f"{label_clean} (AUC={auc:.3f})")
    roc_auc_dict[label] = auc

# Add the three tuned Keras models to the ROC figure, in MLP/LSTM/RNN order.
for keras_kind, tuned_study, legend_key in (
    ('MLP', study_keras_mlp, "MLP Keras"),
    ('LSTM', study_keras_lstm, "LSTM Keras"),
    ('RNN', study_keras_rnn, "RNN Keras"),
):
    fit_and_roc_keras(keras_kind, tuned_study.best_trial.params, legend_key)

# Chance-level diagonal for reference, then decorate and show the figure.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.title('모든 Optuna 튜닝 모델별 ROC Curve', fontsize=16)
plt.xlabel('False Positive Rate', fontsize=13)
plt.ylabel('True Positive Rate', fontsize=13)
plt.grid(alpha=0.2)
plt.legend(fontsize=9)
plt.tight_layout()
plt.show()
