In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# ÿ™ŸàŸÑ€åÿØ ÿØÿßÿØŸá‚ÄåŸáÿß€å ŸÖÿµŸÜŸàÿπ€å
np.random.seed(42)
num_samples = 20000  # ÿßŸÅÿ≤ÿß€åÿ¥ ÿ™ÿπÿØÿßÿØ ÿØÿßÿØŸá‚ÄåŸáÿß ÿ®ÿ±ÿß€å ÿ®Ÿáÿ®ŸàÿØ ŸÖÿØŸÑ

customer_id = np.random.randint(1000, 9999, num_samples)
call_duration = np.random.randint(30, 900, num_samples)
call_reason = np.random.choice(["ŸÖÿ¥⁄©ŸÑ ÿß€åŸÜÿ™ÿ±ŸÜÿ™", "Ÿæÿ¥ÿ™€åÿ®ÿßŸÜ€å", "ŸÖÿßŸÑ€å", "ÿ¥⁄©ÿß€åÿ™"], num_samples)
agent_id = np.random.randint(1, 50, num_samples)
response_time = np.random.randint(5, 180, num_samples)

def generate_satisfaction(call_duration, response_time, call_reason):
    score = 3
    if call_duration < 60:
        score = np.random.choice([1, 2], p=[0.6, 0.4])
    elif response_time > 120:
        score = np.random.choice([1, 2, 3], p=[0.5, 0.3, 0.2])
    elif call_reason == "ÿ¥⁄©ÿß€åÿ™":
        score = np.random.choice([1, 2, 3], p=[0.5, 0.3, 0.2])
    elif call_reason in ["Ÿæÿ¥ÿ™€åÿ®ÿßŸÜ€å", "ŸÖÿßŸÑ€å"]:
        score = np.random.choice([3, 4, 5], p=[0.2, 0.3, 0.5])
    return score

satisfaction_score = [generate_satisfaction(call_duration[i], response_time[i], call_reason[i]) for i in range(num_samples)]

df = pd.DataFrame({
    "customer_id": customer_id,
    "call_duration": call_duration,
    "call_reason": call_reason,
    "agent_id": agent_id,
    "response_time": response_time,
    "satisfaction_score": satisfaction_score
})

# ÿßÿ∂ÿßŸÅŸá ⁄©ÿ±ÿØŸÜ Ÿà€å⁄ò⁄Ø€å‚ÄåŸáÿß€å ÿ¨ÿØ€åÿØ
df["day_of_week"] = np.random.randint(0, 7, len(df))
df["call_hour"] = np.random.randint(8, 22, len(df))
agent_avg_response_time = {agent: np.random.randint(5, 150) for agent in df["agent_id"].unique()}
df["agent_avg_response_time"] = df["agent_id"].map(agent_avg_response_time)
df["response_to_duration_ratio"] = df["response_time"] / df["call_duration"]
df["agent_avg_satisfaction"] = df.groupby("agent_id")["satisfaction_score"].transform("mean")

# ÿ™ÿ®ÿØ€åŸÑ ÿØÿßÿØŸá‚ÄåŸáÿß€å ÿØÿ≥ÿ™Ÿá‚Äåÿß€å ÿ®Ÿá ÿπÿØÿØ€å
df["call_reason"] = LabelEncoder().fit_transform(df["call_reason"])

# ŸÜÿ±ŸÖÿßŸÑ‚Äåÿ≥ÿßÿ≤€å ÿØÿßÿØŸá‚ÄåŸáÿß
scaler = StandardScaler()
numeric_features = ["call_duration", "response_time", "agent_avg_response_time", "response_to_duration_ratio", "agent_avg_satisfaction"]
df[numeric_features] = scaler.fit_transform(df[numeric_features])

# ÿ™ŸÜÿ∏€åŸÖ ÿÆÿ±Ÿàÿ¨€å ŸÖÿØŸÑ
y = df["satisfaction_score"] - 1  # ÿ™ÿ®ÿØ€åŸÑ ÿ®ÿßÿ≤Ÿá 1-5 ÿ®Ÿá 0-4
X = df.drop(columns=["satisfaction_score", "customer_id"])

# ÿ™ŸÇÿ≥€åŸÖ ÿØÿßÿØŸá‚ÄåŸáÿß
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ÿ™ŸÜÿ∏€åŸÖÿßÿ™ ÿ®Ÿá€åŸÜŸá RandomForest
param_grid_rf = {
    'n_estimators': [100, 300],
    'max_depth': [8, 12],
    'min_samples_split': [2, 5]
}
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42, class_weight='balanced'), param_grid_rf, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)
best_rf_params = grid_search_rf.best_params_

# ÿ™ŸÜÿ∏€åŸÖÿßÿ™ ÿ®Ÿá€åŸÜŸá XGBoost
param_grid_xgb = {
    'n_estimators': [100, 300, 500],
    'max_depth': [6, 10, 14],
    'learning_rate': [0.05, 0.1, 0.2],
    'scale_pos_weight': [1, 2]
}
grid_search_xgb = GridSearchCV(XGBClassifier(random_state=42), param_grid_xgb, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_xgb.fit(X_train, y_train)
best_xgb_params = grid_search_xgb.best_params_

# ŸÖÿØŸÑ LightGBM
lgb_model = LGBMClassifier(n_estimators=300, max_depth=12, learning_rate=0.1, random_state=42)

# ŸÖÿØŸÑ‚ÄåŸáÿß ÿ®ÿß ÿ™ŸÜÿ∏€åŸÖÿßÿ™ ÿ®Ÿá€åŸÜŸá
rf_model = RandomForestClassifier(**best_rf_params, class_weight='balanced', random_state=42)
xgb_model = XGBClassifier(**best_xgb_params, random_state=42)

# ÿßÿ¨ÿ±ÿß€å Ensemble Learning
ensemble_model = VotingClassifier(estimators=[
    ('rf', rf_model),
    ('xgb', xgb_model),
    ('lgb', lgb_model)
], voting='soft', weights=[1, 1, 3])

ensemble_model.fit(X_train, y_train)
y_pred_ensemble = ensemble_model.predict(X_test)

# ŸÜŸÖÿß€åÿ¥ ÿØŸÇÿ™ ŸÜŸáÿß€å€å
accuracy = accuracy_score(y_test, y_pred_ensemble)
print(f"üîπ Accuracy ÿ®ÿß Ensemble Learning: {accuracy:.2f}")


Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000203 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 853
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 9
[LightGBM] [Info] Start training from score -1.336790
[LightGBM] [Info] Start training from score -1.815924
[LightGBM] [Info] Start training from score -1.152211
[LightGBM] [Info] Start training from score -2.322150
[LightGBM] [Info] Start training from score -1.828683
üîπ Accuracy ÿ®ÿß Ensemble Learning: 0.54
