In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
import eli5
import shap
from alibi.explainers import CounterFactual
import lime
from lime import lime_tabular

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.inspection import plot_partial_dependence


In [None]:
# Load emotions.csv (path is relative to the notebook's working directory)
# into brainwave_df and preview the first rows.
brainwave_df = pd.read_csv('emotions.csv')
brainwave_df.head()

In [None]:
# (n_rows, n_columns) of the loaded frame, including the 'label' column.
brainwave_df.shape

In [None]:
# Bar chart of how many rows carry each sentiment label.
plt.figure(figsize=(12, 5))

# countplot draws on the figure created above and hands back its Axes,
# so the title and axis labels are set on that Axes directly.
label_axes = sns.countplot(x=brainwave_df['label'], color='mediumseagreen')
label_axes.set_title('Emotional sentiment class distribution', fontsize=16)
label_axes.set_ylabel('Class Counts', fontsize=16)
label_axes.set_xlabel('Class Label', fontsize=16)

# Trailing semicolon suppresses the tick-object repr in the cell output.
plt.xticks(rotation='vertical');

In [None]:
# Integer codes for the three sentiment labels; preprocess_inputs applies
# this mapping to the 'label' column before splitting.
label_mapping = {'NEGATIVE': 0, 'NEUTRAL': 1, 'POSITIVE': 2}

In [None]:
def preprocess_inputs(df, train_size=0.7, random_state=123):
    """Encode the sentiment labels and split the frame into train/test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'label' column; every other column is used as a
        feature. The frame passed in is not modified.
    train_size : float or int, default 0.7
        Forwarded to ``train_test_split``.
    random_state : int, default 123
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature frames and integer-coded label series for each split.

    Raises
    ------
    ValueError
        If 'label' contains a value that is neither a key of
        ``label_mapping`` nor one of its integer codes.
    """
    known_codes = set(label_mapping.values())

    def _encode(value):
        # Text labels are translated; values that are already codes pass
        # through unchanged, which keeps the function safe to call on a
        # frame that was encoded earlier.
        if value in label_mapping:
            return label_mapping[value]
        if value in known_codes:
            return value
        raise ValueError(
            f"Unexpected label {value!r}; expected one of "
            f"{sorted(label_mapping)} or codes {sorted(known_codes)}"
        )

    # Explicit encoding + cast avoids relying on Series.replace silently
    # downcasting an object column to integers.
    y = df['label'].map(_encode).astype('int64')
    X = df.drop('label', axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [None]:
# Encode labels and create the train/test split used by every later cell.
X_train, X_test, y_train, y_test = preprocess_inputs(brainwave_df)

In [None]:
# Peek at the held-out features (the 'label' column has been dropped).
X_test.head()

In [None]:
# pipelines = {
#     "SVM": Pipeline([
#         ('scaler', StandardScaler()),
#         ('classifier', LinearSVC(max_iter=5000))
#     ]),
#     "Random Forest": Pipeline([
#         ('scaler', StandardScaler()),
#         ('classifier', RandomForestClassifier())
#     ]),
#     "PCA + Logistic Regression": Pipeline([
#         ('scaler', StandardScaler()),
#         ('PCA', PCA(n_components=2)),
#         ('classifier', LogisticRegression())
#     ]),
#     "Gradient Boosting": Pipeline([
#         ('scaler', StandardScaler()),
#         ('classifier', GradientBoostingClassifier())
#     ]),
# }

# # Fit each pipeline on the training split.
# for name, pipeline in pipelines.items():
#     pipeline.fit(X_train, y_train)

# # Collect each pipeline's test-set score.
# scores = {}
# for name, pipeline in pipelines.items():
#     scores[name] = pipeline.score(X_test, y_test)

# # Print the scores.
# for name, score in scores.items():
#     print(f"{name}: {score}")

In [None]:
# Scaling
# The scaler is fitted on the training split only and the same fitted
# transform is then applied to the test split, so test-set statistics never
# influence the scaling parameters.
# NOTE(review): StandardScaler returns NumPy arrays by default, so column
# names must be taken from X_train.columns downstream -- confirm no
# set_output configuration is active.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Feature Importance with RandomForestClassifier
# The forest is seeded (same value as the train/test split) so that the
# fitted model, its test score and every explanation computed from `rf`
# come out the same on each Restart & Run All.
rf = RandomForestClassifier(random_state=123)
rf.fit(X_train_scaled, y_train)
# Accuracy on the held-out split.
rf_score = rf.score(X_test_scaled, y_test)

In [None]:
# Global feature weights of the fitted forest as a DataFrame, as reported
# by ELI5. The forest was trained on an array, so the original column names
# are supplied explicitly.
print("ELI5 Feature Importance")
rf_fi = eli5.explain_weights_df(rf, feature_names=X_train.columns.tolist())
print(rf_fi)

In [None]:
# Partial dependence of the forest's output for class code 0 (target=0)
# on the features at column positions 0 and 1 of the scaled training matrix.
# NOTE(review): plot_partial_dependence was deprecated in scikit-learn 1.0
# and removed in 1.2; on newer versions this needs
# PartialDependenceDisplay.from_estimator (and the matching import) instead.
print("Partial Dependence Plot (PDP)")
plot_partial_dependence(rf, X_train_scaled, features=[0, 1], target=0)

In [None]:
# SHAP: local explanation of the forest's prediction for the first test row.
shap.initjs()
print("SHAP Values")
# The scaled training data serves as the background distribution.
explainer = shap.Explainer(rf, X_train_scaled)
shap_values = explainer(X_test_scaled)

# A waterfall plot takes exactly one explanation vector. For a multiclass
# model each row carries one column of SHAP values per class, so pick the
# column of the class the forest actually predicts for this row.
first_explanation = shap_values[0]
if first_explanation.values.ndim == 2:
    predicted_idx = int(np.argmax(rf.predict_proba(X_test_scaled[:1])[0]))
    first_explanation = shap_values[0, :, predicted_idx]

# Fixed: the original called `ap.plots.waterfall`, which raises NameError.
shap.plots.waterfall(first_explanation)

In [None]:
# LIME
# Builds a tabular explainer from the scaled training matrix (column names
# taken from the unscaled frame) and explains the forest's predicted class
# probabilities for the first test row.
# NOTE(review): no class_names or random_state are passed, so classes show
# up as integer codes and the explanation may differ between runs -- confirm
# whether that is intended.
print("LIME Explanations")
lime_explainer = lime_tabular.LimeTabularExplainer(X_train_scaled, feature_names=X_train.columns, mode='classification')
explanation = lime_explainer.explain_instance(X_test_scaled[0], rf.predict_proba)
explanation.show_in_notebook()

In [None]:
# Counterfactual Explanations
print("Counterfactual Explanations")
# alibi's CounterFactual optimises over class probabilities, so it must be
# given predict_proba (not the hard-label predict), and `shape` must be the
# full input shape including the leading batch dimension of 1.
counterfactual = CounterFactual(
    rf.predict_proba,
    shape=(1, X_train_scaled.shape[1]),
    lam_init=1e6,
    max_iter=100,
    tol=1e-5,
)
# The instance is passed with its batch dimension, shape (1, n_features),
# to match the `shape` declared above.
# NOTE(review): a random forest's probabilities are piecewise constant, so
# the search may fail to find a counterfactual -- check `.cf` for None.
counterfactual_instance = counterfactual.explain(X_test_scaled[:1])
print("Original instance: ")
print("Target label: ", y_test.iloc[0])
print("Predicted label (rf): ", rf.predict([X_test_scaled[0]])[0])

In [None]:
# Print counterfactual instance
print("Counterfactual instance: ")
# `.cf` is None when the search did not find a counterfactual; subscripting
# it unguarded would raise TypeError, so report that case explicitly.
if counterfactual_instance.cf is None:
    print("No counterfactual was found for this instance.")
else:
    print("Counterfactual label (rf): ", counterfactual_instance.cf['class'])

In [None]:
# PCA
# Project the scaled features onto two principal components. The projection
# is fitted on the training split only and then applied to the test split.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [None]:
# LinearSVC
# Seeded because the liblinear solver shuffles data internally, which
# otherwise makes the fitted coefficients and score vary between runs.
svc = LinearSVC(max_iter=5000, random_state=123)
svc.fit(X_train_scaled, y_train)
# Accuracy on the held-out split.
svc_score = svc.score(X_test_scaled, y_test)

# LinearSVC coefficients
# coef_ holds one row per class; only row 0 (the first entry of
# svc.classes_) is tabulated here.
svc_coef = svc.coef_
svc_coef_df = pd.DataFrame({'feature': X_train.columns, 'coefficient': svc_coef[0]})
print("LinearSVC coefficients:")
print(svc_coef_df)

In [None]:
# RandomForestClassifier
# This cell refits and rebinds `rf`. The fixed seed makes the reported score
# and importances reproducible between runs instead of depending on a fresh
# random forest each time.
rf = RandomForestClassifier(random_state=123)
rf.fit(X_train_scaled, y_train)
# Accuracy on the held-out split.
rf_score = rf.score(X_test_scaled, y_test)

# RandomForest feature importance
# Impurity-based importances, paired with the original column names and
# listed from most to least important.
rf_importances = rf.feature_importances_
rf_importance_df = pd.DataFrame({'feature': X_train.columns, 'importance': rf_importances})
print("RandomForest feature importance:")
print(rf_importance_df.sort_values('importance', ascending=False))

In [None]:
# PCA + LogisticRegression
# Logistic regression on the two principal components.
logreg = LogisticRegression()
logreg.fit(X_train_pca, y_train)
# Accuracy on the held-out split.
logreg_score = logreg.score(X_test_pca, y_test)

# LogisticRegression coefficients
# Fixed: `logreg_coef` was used without ever being assigned (NameError).
# coef_ holds one row per class; only row 0 (the first entry of
# logreg.classes_) is tabulated, mirroring the LinearSVC cell.
logreg_coef = logreg.coef_
logreg_coef_df = pd.DataFrame({'feature': ['PC1', 'PC2'], 'coefficient': logreg_coef[0]})
print("LogisticRegression coefficients:")
print(logreg_coef_df)

In [None]:
# GradientBoostingClassifier
# Seeded so the fitted ensemble, its score and its importances are
# reproducible between runs.
gb = GradientBoostingClassifier(random_state=123)
gb.fit(X_train_scaled, y_train)
# Accuracy on the held-out split.
gb_score = gb.score(X_test_scaled, y_test)

# GradientBoosting feature importance
# Impurity-based importances, paired with the original column names and
# listed from most to least important.
gb_importances = gb.feature_importances_
gb_importance_df = pd.DataFrame({'feature': X_train.columns, 'importance': gb_importances})
print("GradientBoosting feature importance:")
print(gb_importance_df.sort_values('importance', ascending=False))

In [None]:
# Output: held-out accuracy of each fitted model, side by side.
print(f"SVM: {svc_score}")
print(f"Random Forest: {rf_score}")
print(f"PCA + Logistic Regression: {logreg_score}")
print(f"Gradient Boosting: {gb_score}")

In [None]:
# SHAP values for RandomForest
# Global importance: mean |SHAP| per feature over the training set.
explainer = shap.TreeExplainer(rf)
# Fixed: `rf` was trained on the standardized matrix, so SHAP values must be
# computed on X_train_scaled. The original passed the unscaled X_train,
# which explains predictions for inputs on a scale the model never saw.
shap_values = explainer.shap_values(X_train_scaled)
# The scaled matrix is a plain array, so column names are passed explicitly.
shap.summary_plot(
    shap_values,
    X_train_scaled,
    feature_names=X_train.columns.tolist(),
    plot_type="bar",
)