# **Linear shap**

In [None]:
import shap
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression


# Read the red-wine quality table from a local CSV file.
# NOTE: despite its name, `url` holds a filesystem path, not a web address.
url = "/content/redwine.csv"
data = pd.read_csv(filepath_or_buffer=url)

# Show the first five rows as the cell output.
data.head(n=5)

In [None]:
# Target is the "quality" column; every other column is a predictor.
y = data["quality"]
X = data.drop("quality", axis=1)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both splits so no test information leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit a linear regression on the standardised predictors.
model = LinearRegression()
model.fit(X_train, y_train)



In [None]:
# Explain the linear model, using the (scaled) training matrix as background.
# The "correlation_dependent" option is passed through to LinearExplainer
# unchanged; see the SHAP documentation for its exact semantics.
explainer = shap.LinearExplainer(
    model,
    X_train,
    feature_perturbation="correlation_dependent",
)
shap_values = explainer.shap_values(X_test)

# X_train / X_test were overwritten by the scaler output above, so the column
# labels are taken from the unscaled frame `X`.
column_names = X.columns

# Attributions as a table with one column per feature.
shap_df = pd.DataFrame(data=shap_values, columns=column_names)

# Overview of all features across the test rows.
shap.summary_plot(shap_values, X_test, feature_names=column_names)

# Attribution of "alcohol" plotted against its (scaled) value.
shap.dependence_plot("alcohol", shap_values, X_test, feature_names=column_names)

# Interactive force plot for the first test row.
shap.initjs()
shap.force_plot(
    explainer.expected_value,
    shap_values[0],
    X_test[0],
    feature_names=column_names,
)

# **Regression shap**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import shap


# Read the red-wine quality table from a local CSV file.
# NOTE: despite its name, `url` holds a filesystem path, not a web address.
url = "/content/redwine.csv"
data = pd.read_csv(filepath_or_buffer=url)

# Show the first five rows as the cell output.
data.head(n=5)

In [None]:
# Reshape the boolean "is missing" mask to long format: one row per cell, with
# the source column name in "variable" and the flag in "missing".
missing_long = data.isna().melt(value_name="missing")

# One bar per column, filled proportionally by missing / not missing.
sns.displot(
    data=missing_long,
    y="variable",
    hue="missing",
    multiple="fill",
    aspect=1.5,
)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# "quality" is the regression target; all remaining columns are predictors.
labels = data['quality']
features = data.drop('quality', axis=1)

# 80/20 split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=123,
)

# Forest hyper-parameters, gathered in one place.
forest_params = {
    'n_estimators': 2000,
    'max_depth': 30,
    'random_state': 123,
}
model = RandomForestRegressor(**forest_params)
model.fit(x_train, y_train)


In [None]:
# Score the fitted forest on the held-out rows (for scikit-learn regressors,
# score() reports the coefficient of determination, R^2).
model.score(x_test, y_test)

In [None]:
# Build an explainer for the fitted forest and compute attributions for the
# held-out rows.
explainer = shap.Explainer(model)
shap_values = explainer(x_test)

# The title is set on the current Matplotlib axes before SHAP draws the bars.
plt.title('Feature Importance using SHAP')
shap.plots.bar(shap_values, max_display=12, show=True)

In [None]:
# Decision plot for the first held-out observation.
expected_value = explainer.expected_value
# For a regressor, shap_values() yields one row of attributions per sample, so
# index 0 selects the first test row (it is not a class index).
shap_values = explainer.shap_values(x_test)[0]
# The features argument must describe the same observation as the SHAP values;
# passing the whole x_test frame here would not match the single row above.
shap.decision_plot(expected_value, shap_values, x_test.iloc[0])

# **Tree Shap - German dataset**

In [None]:
!pip install --upgrade numpy pandas matplotlib seaborn sklearn lightgbm shap

In [None]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import shap

# Record which SHAP release is installed; the shape of explainer outputs
# differs between releases, so this helps when reproducing the plots below.
print(f"Shap version used: {shap.__version__}")

# Load SHAP's JavaScript support so interactive plots can render in the notebook.
shap.initjs()

In [None]:
# Load the German credit table, using the first CSV column as the row index.
data = pd.read_csv('/content/german_credit_data.csv', index_col=0)
# Preview the first rows as the cell output.
data.head()

In [None]:
# (number of rows, number of columns) of the loaded table.
data.shape

In [None]:
# Column labels of the loaded table.
data.columns

In [None]:
# Per-column dtype and non-null count summary.
data.info()

In [None]:
# Columns treated as numeric vs. categorical in this section.
num_features = ['Age', 'Credit amount', 'Duration']
cat_features = ['Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']

# Summary statistics for the numeric columns only.
data[num_features].describe()

In [None]:
# Reshape the boolean "is missing" mask to long format: one row per cell, with
# the source column name in "variable" and the flag in "missing".
missing_long = data.isna().melt(value_name="missing")

# One bar per column, filled proportionally by missing / not missing.
sns.displot(
    data=missing_long,
    y="variable",
    hue="missing",
    multiple="fill",
    aspect=1.5,
    palette='seismic',
)
plt.show()

In [None]:
# Columns inspected for missing entries.
missing_features = ['Saving accounts','Checking account']
# Percentage of missing values per column. Taking the mean of the boolean mask
# divides by the actual number of rows instead of a hard-coded 1000.
data[missing_features].isna().mean() * 100

In [None]:
# Replace every missing entry in the frame with the literal label 'Unknown'.
data.fillna('Unknown', inplace=True)

# Check the result: percentage still missing per column (computed against the
# actual row count rather than a hard-coded 1000) ...
print(data[missing_features].isna().mean() * 100)
# ... and the category counts of each inspected column after filling.
for feat in missing_features:
    print(data[feat].value_counts())

In [None]:
# True if at least one row is flagged as a duplicate of another row.
data.duplicated().any()

In [None]:
# Integer-encode the listed columns in place (this includes the 'Risk' target).
# A separate encoder is kept per column so each integer code can be mapped back
# to its original label later; refitting one shared encoder would keep only the
# mapping of the last column processed.
encoders = {}
for feat in ['Sex','Saving accounts','Checking account','Purpose','Risk','Housing']:
    le = LabelEncoder()
    data[feat] = le.fit_transform(data[feat])
    encoders[feat] = le
# Label names of the target column; position i is the label encoded as i.
classes = list(encoders['Risk'].classes_)
print(classes)
data.head()

In [None]:
# 'Risk' is the classification target; all remaining columns are predictors.
labels = data['Risk']
features = data.drop('Risk', axis=1)

# 80/20 split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=123,
)

# Show which columns the model will be trained on.
x_train.columns


In [None]:
# Columns LightGBM is told to treat as categorical.
categorical_cols = ['Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']

# Wrap both splits as LightGBM datasets (each gets its own copy of the list).
data_train = lgb.Dataset(x_train, label=y_train, categorical_feature=list(categorical_cols))
data_test = lgb.Dataset(x_test, label=y_test, categorical_feature=list(categorical_cols))

# Booster configuration: binary objective evaluated with AUC.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    num_leaves=20,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    lambda_l1=1,
    lambda_l2=1,
    seed=123,
)

# Train for 100 boosting rounds, evaluating on the test and training sets.
model = lgb.train(
    params,
    data_train,
    num_boost_round=100,
    valid_sets=[data_test, data_train],
)

In [None]:
# Raw model outputs for the held-out rows, thresholded at 0.5 into 0/1 labels.
scores = model.predict(x_test)
y_pred = [int(score > 0.5) for score in scores]

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy for the baseline model is: {accuracy}')


In [None]:
# Explain the trained LightGBM booster over the full feature table
# (training and test rows together).
explainer = shap.TreeExplainer(model)
# NOTE(review): the return shape of shap_values() for binary LightGBM models
# differs between SHAP releases (single array vs. per-class list) — confirm
# against the installed version before indexing it below.
shap_values = explainer.shap_values(features)

In [None]:
# Summary plot of the attributions for every feature across all rows.
shap.summary_plot(shap_values, features)

In [None]:
# Force plot for the first row of `features`.
# NOTE(review): assumes shap_values is a single 2-D array (one row per sample),
# so shap_values[0] pairs with features.iloc[0]; with a per-class list this
# index would select a class instead — confirm against the SHAP version in use.
shap.force_plot(explainer.expected_value, shap_values[0], features.iloc[0,:])

In [None]:
# Decision plot for the first row of `features`.
# NOTE(review): same indexing assumption as the force plot — shap_values[0] is
# taken to be the attribution row for features.iloc[0]; confirm.
shap.decision_plot(explainer.expected_value, shap_values[0], features.iloc[0,:])

In [None]:
# Features for which a SHAP dependence plot is drawn, in display order.
dependence_columns = (
    'Sex',
    'Housing',
    'Checking account',
    'Saving accounts',
    'Purpose',
    'Credit amount',
    'Age',
)

for col in dependence_columns:
    print(f'Feature Dependence plot for:{col}')
    shap.dependence_plot(col, shap_values, features, display_features=features)

# **Deep shap**

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten

# Load the MNIST train/test splits bundled with Keras.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Rescale pixel values by dividing by 255.
x_train = x_train / 255.0
x_test = x_test / 255.0

# Small fully connected classifier: flatten the 28x28 input, one hidden ReLU
# layer, and a 10-way softmax output.
model = Sequential()
model.add(Flatten(input_shape=(28, 28)))
model.add(Dense(128, activation='relu'))
model.add(Dense(10, activation='softmax'))

In [None]:
# Imported here so this section also runs from a fresh kernel; unlike the other
# sections, its import cell does not bring in shap.
import shap

# Train the classifier for three epochs, validating on the test split.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=3, batch_size=128, validation_data=(x_test, y_test))

# Background set for the explainer: 100 distinct, randomly chosen training images.
background = x_train[np.random.choice(x_train.shape[0], 100, replace=False)]

explainer = shap.DeepExplainer(model, background)
# Explain only the first ten test images.
X_test_sample = x_test[:10]
shap_values = explainer.shap_values(X_test_sample)

# image_plot builds its own figure, so no plt.figure() call is made beforehand
# (one would only leave an extra empty figure in the output).
shap.image_plot(shap_values, X_test_sample)

# **Kernel shap**

In [None]:
!pip install shap scikit-learn matplotlib

In [None]:
import shap
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [None]:
# Load the iris dataset and pull out the feature matrix, the target vector and
# the feature labels.
data = load_iris()
x, y = data.data, data.target
feature_names = data.feature_names

# 80/20 split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random-forest classifier with a fixed seed, fitted on the training split.
model = RandomForestClassifier(random_state=42)
model.fit(x_train,y_train)

In [None]:
# Model-agnostic explainer: only the prediction function is passed in, with the
# training rows as background data.
# NOTE(review): model.predict returns class labels, so the attributions explain
# the predicted label value rather than class probabilities — confirm this is
# intended (model.predict_proba is the usual alternative for classifiers).
explainer = shap.KernelExplainer(model.predict, x_train)

# Explain only the first five test rows.
shap_values = explainer.shap_values(x_test[:5])

# Summary plot for those five rows, labelled with the iris feature names.
shap.summary_plot(shap_values, x_test[:5], feature_names=feature_names)

In [None]:
# Load SHAP's JavaScript support so interactive plots can render in the notebook.
shap.initjs()

In [None]:
#shap.force_plot(explainer.expected_value, shap_values[0][0], x_test[0], feature_names=feature_names)
