### Loading the modules

In [None]:
!pip install --upgrade scipy
!pip install seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")
np.random.seed(5)
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as RFC
import tensorflow as tf
tf.get_logger().setLevel(40) # suppress deprecation messages
from tensorflow.keras.layers import Dense, Input, Embedding, Concatenate, Reshape, Dropout, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

### Loading the data

In [None]:
# Read the Titanic passenger CSV from its hard-coded /content path into `data`,
# then preview the first rows as the cell output.
data  = pd.read_csv('/content/Titanic-Dataset.csv')
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Draw the per-column histograms in red on a 2x5 grid, then label the output.
data.hist(
    color='r',
    figsize=(15, 8),
    layout=(2, 5),
)
print('Data Distribution')

In [None]:
import seaborn as sns

# Bar chart with one bar per value of the Survived label.
print('This looks like a fairly imbalanced dataset')
sns.countplot(
    data=data,
    x="Survived",
    palette="bwr",
)
plt.show()

In [None]:
# Number of rows for each value of the Survived label.
data['Survived'].value_counts()

In [None]:
# Derive the class shares from the loaded data instead of hard-coding row counts,
# so the printed percentages always describe the dataset that was actually read.
# value_counts(normalize=True) yields fractions; .get() falls back to 0 for an absent class.
class_share = data['Survived'].value_counts(normalize=True)
print('Percentage of data belonging to class 1 is', int(class_share.get(1, 0.0) * 100))
print('Percentage of data belonging to class 0 is', int(class_share.get(0, 0.0) * 100))

### Null Check

In [None]:
# Count of missing values in each column.
data.isna().sum()

### Duplication Check

In [None]:
# True when at least one row is flagged as a duplicate of another row.
data.duplicated().any()

### Data Description

In [None]:
# Summary statistics table for the frame.
data.describe()

### Data Correlation

In [None]:
# Correlation is only defined for numbers, so restrict the frame to numeric columns first.
numerical_data = data.select_dtypes(include=np.number)
correlation_matrix = numerical_data.corr()

# Annotated heatmap of the pairwise correlation coefficients.
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=True,
)
plt.show()

### Outlier Check

In [None]:
# Rows in which PassengerId, Survived and Pclass are all equal to zero.
data.loc[(data[['PassengerId', 'Survived', 'Pclass']] == 0).all(axis=1)]

In [None]:
# Rows whose Survived label equals zero.
data.loc[data['Survived'].eq(0)]

### Noise removal

In [None]:
# Treat records with missing values in the numeric modelling columns as noise.
# The label must not be used as the filter: keeping only Survived != 0 leaves a
# single class, which makes both the classifier and the later mean/std outlier
# filter (zero variance in Survived) degenerate.
noise_check_columns = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
cleaned_data = data.dropna(subset=noise_check_columns)
cleaned_data.shape

### Feature Engineering

In [None]:
# Split cleaned_data into inliers (feature_engg_data) and outliers (outlier_data)
# using a mean +/- factor * std rule applied column by column.
feature_engg_data = cleaned_data.copy()
factor = 3

columns_to_include = ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
for column in columns_to_include:
    column_mean = feature_engg_data[column].mean()
    column_spread = feature_engg_data[column].std() * factor
    # Inclusive bounds: with strict inequalities a zero-variance column (every value
    # equal to the mean) would discard all rows. Missing values still compare as
    # False and are therefore routed to the outlier partition.
    within_limits = feature_engg_data[column].between(column_mean - column_spread,
                                                      column_mean + column_spread)
    feature_engg_data = feature_engg_data[within_limits]

# Outliers are exactly the rows of cleaned_data that did not survive the filter.
# Selecting by index label is exact, whereas concat + drop_duplicates(keep=False)
# compares row contents and would also drop genuinely repeated records.
outlier_data = cleaned_data.loc[~cleaned_data.index.isin(feature_engg_data.index)].copy()

print(feature_engg_data.shape)
print(outlier_data.shape)

### Normalization

In [None]:


factor = 2  # Reduced factor (not read by normalize_data below)
def normalize_data(df):
    '''
    Min-max scale the numeric columns of a DataFrame into the range [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale. Non-numeric columns are left out because MinMaxScaler
        only accepts numeric input.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the scaled numeric columns (same column names,
        fresh default index). If there is nothing to scale, `df` itself is
        returned unchanged.
    '''
    numeric_df = df.select_dtypes(include=np.number)

    # The guard has to run before fitting: MinMaxScaler raises on zero samples.
    if numeric_df.empty:
        print("DataFrame is empty. Skipping normalization.")
        return df

    min_max_normalizer = preprocessing.MinMaxScaler()
    norm_val = min_max_normalizer.fit_transform(numeric_df.values)
    return pd.DataFrame(norm_val, columns=numeric_df.columns)

# Scale the inlier and outlier partitions separately; each normalize_data call
# fits its own MinMaxScaler on the frame it is given.
norm_feature_engg_data = normalize_data(feature_engg_data)
norm_outlier_data = normalize_data(outlier_data)

### Train-Test split

In [None]:
# The label stays a one-column frame; every other normalized column is a feature.
targets = norm_feature_engg_data.filter(['Survived'], axis='columns')
input_data = norm_feature_engg_data.drop(columns=['Survived'])

# First hold out 10% for testing, then split the remaining 90% into 78% train / 22% validation.
x, x_test, y, y_test = train_test_split(
    input_data, targets, train_size=0.9, test_size=0.1, random_state=5)
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, train_size=0.78, test_size=0.22, random_state=5)

In [None]:
def apply_RFC(X,y,columns,cumulative_threshold=0.95):
    '''
    Rank features with a random forest and keep the most important ones.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training inputs.
    y : array-like
        Class labels; a single-column frame/array is flattened to 1-D.
    columns : sequence
        Feature names, in the same order as the columns of X.
    cumulative_threshold : float, default 0.95
        Features are kept, in descending order of importance, while their
        running importance sum stays at or below this value.

    Returns
    -------
    (pandas.DataFrame, list)
        The retained rows with 'Feature Importance' and 'Moving Sum' columns
        (indexed by feature name), and the retained feature names as a list.
    '''
    # 1% of a small sample rounds to 0, which is not a valid integer leaf size.
    leaf_size = max(1, round(len(X)*.01))
    rfc = RFC(n_estimators=500,min_samples_leaf=leaf_size,random_state=5,n_jobs=-1)

    # Flatten an (n, 1) target so the forest is fitted on a plain 1-D label vector.
    labels = np.asarray(y)
    if labels.ndim == 2 and labels.shape[1] == 1:
        labels = labels.ravel()

    rfc.fit(X,labels)
    imp_features = pd.DataFrame(rfc.feature_importances_,columns=['Feature Importance'],index=columns)
    imp_features = imp_features.sort_values(by=['Feature Importance'],ascending=False)
    imp_features['Moving Sum'] = imp_features['Feature Importance'].cumsum()
    imp_features = imp_features[imp_features['Moving Sum']<=cumulative_threshold]
    top_features = imp_features.index.tolist()
    return imp_features, top_features

In [None]:
# Label the importances with the columns the forest is actually fitted on (x.columns);
# the raw `data` frame has a different set of columns than the normalized features.
important_features, top_features = apply_RFC(x, y, x.columns)
# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.barplot(x=important_features['Feature Importance'], y=important_features.index, palette = 'tab10')
plt.title('Random Forest Feature Importance for: '+"Titanic Dataset")
plt.show()

In [None]:
# For this we need a trained model, so let's first train one with a small neural network architecture.

def build_model(n_features):
    '''
    Simple 3 layered Neural Network model for binary classification

    Parameters
    ----------
    n_features : int
        Number of input features (width of the input layer).

    Returns
    -------
    A compiled Keras Model with two softmax outputs (one per class).
    '''
    inp = Input(shape=(n_features,))
    hidden = Dense(40, activation='relu')(inp)
    hidden = Dense(40, activation='relu')(hidden)
    op = Dense(2, activation='softmax')(hidden)
    network = Model(inputs=inp, outputs=op)
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

# The factory has its own name so that `model` only ever refers to the trained network
# and this cell can be re-run without the factory having been overwritten.
model = build_model(x_train.shape[1])
# num_classes=2 keeps the one-hot width equal to the two output units even if a split
# happens to contain a single class.
model.fit(x_train, to_categorical(y_train, num_classes=2), batch_size=64, epochs=300, verbose=0)

In [None]:
# Evaluate the trained model on the held-out test split; index 1 of the returned
# scores is the accuracy metric the model was compiled with.
test_scores = model.evaluate(x_test, to_categorical(y_test))
test_scores[1]

Although we are not concerned with the final model accuracy, we do have a decent model to try sensitivity analysis on. Next, we will take a query instance and apply the 6-σ (six sigma) variation rule to perform sensitivity analysis on it.

In [None]:
# Take the sixth test row and give it a leading batch axis so it has shape (1, n_features).
query_row = x_test.iloc[5]
query_instance = query_row.values[np.newaxis, :]
print("Let's take a look at the normalized query data instance in which all the features are in the range of (0.0 - 1.0):" )
df_query = pd.DataFrame(data=query_instance, columns=input_data.columns)
df_query

In [None]:
# Class index with the highest predicted probability for the query instance.
predicted_outcome = np.argmax(model.predict(query_instance))
# Read the label by explicit position (row 5, first column). Indexing the row Series
# with [0] relies on the integer-key positional fallback, which pandas has deprecated.
true_label = int(y_test.iloc[5, 0])
print(f" The true label is : {true_label}")
print(f" The predicted outcome is : {predicted_outcome}")

By comparing the predicted outcome with the true label, we can check whether the model predicts this passenger's survival correctly. Now, let's see if the prediction changes when we perform sensitivity analysis, one feature at a time.

The standard deviation (σ) can be calculated on the normalized training data, as we will be using the normalized data for the prediction part.

In [None]:
# Standard deviation (as computed by np.std) of every normalized feature column in x,
# keyed by feature name. Deriving the names from x.columns keeps this cell valid for
# whichever feature columns the loaded dataset actually has.
feature_sigmas = {feature: np.std(x[feature]) for feature in x.columns}
feature_sigmas

In [None]:
# Let's see the sensitivity analysis plots now
def sensitivity_analysis_plot(measure_tuple):
    '''
    Sensitivity Analysis plot using the 6-σ variation method

    Parameters
    ----------
    measure_tuple : tuple
        (feature name, sigma) where the name is a column of df_query and sigma
        is the step size used to shift that feature.

    The feature is shifted by -3σ ... +3σ (clipped to [0.0, 1.0]), the model's
    predicted class is recorded for each shift, and the result is plotted
    against the original prediction. Nothing is returned.
    '''
    (measure, sigma) = measure_tuple

    sensitivity_output = []  # predicted class for each shifted value
    # Perturb a private copy: writing into the shared df_query would leave this
    # feature at its +3σ value and distort the baseline for every later feature.
    perturbed_query = df_query.copy()
    original_value = df_query[measure].copy()
    for k in [-3, -2, -1, 1, 2, 3]:
        perturbed_query[measure] = np.clip(original_value + k * sigma, 0.0, 1.0)
        # argmax over the class probabilities gives the most likely class
        sensitivity_output.append(np.argmax(model.predict(perturbed_query.values)))
    plt.plot(['-3σ', '-2σ', '-σ', 'σ', '2σ', '3σ'], sensitivity_output, 'r.-', label = 'Sensitivity output')
    plt.axhline(y = predicted_outcome, color = 'b', linestyle = '--', label = 'Original Prediction')
    plt.title(f'6-σ variation sensitity plot for the feature: {measure}')
    plt.legend()
    plt.show()

# One (feature name, sigma) pair per column of the query instance. Sigma is the
# standard deviation of that normalized feature in x, computed here so the list
# always matches the features the model was trained on.
measure_tuple_list = [(feature, np.std(x[feature])) for feature in df_query.columns]

for measure_tuple in measure_tuple_list:
    sensitivity_analysis_plot(measure_tuple)

From the above plots, we can observe how sensitive each feature is to positive or negative changes and how each feature contributes towards influencing the model outcome. The features Insulin, Diabetes Pedigree Function and Number of Pregnancies don't seem to be sensitive to any changes. The features Glucose, BMI and Blood Pressure seem to have a positive influence on the outcome. That means that if the values of these features are increased, the model may predict the presence of diabetes. Surprisingly, the feature Age shows a negative influence, which means that if the age is increased, the model is less likely to predict the outcome as diabetes. This contradicts our prior knowledge and hence is quite an interesting observation that needs to be inspected further. (Note: these observations describe the features of the Pima Indians Diabetes dataset listed in the Reference section; they should be re-derived from the plots produced for the Titanic features.)

### Final Thoughts

We have seen how influence-based methods like feature importance and sensitivity analysis can be applied to explain the influence of features on the model's decision-making process. However, I have only shown examples related to a classification problem. I would strongly recommend that you try out these methods for explaining models used for regression problems as well.

### Reference

1. Kaggle | Pima Indians Diabetes Database - https://www.kaggle.com/uciml/pima-indians-diabetes-database?select=diabetes.csv
2.  How to Calculate Feature Importance With Python | Machine Learning Mastery - https://machinelearningmastery.com/calculate-feature-importance-with-python/
3. Some of the utility functions and code are taken from the GitHub Repository of the author - Aditya Bhattacharya https://github.com/adib0073