In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import shap
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import roc_auc_score
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

from itertools import repeat, chain
def revert_dict(d):
    """Invert a ``{key: iterable_of_members}`` mapping into ``{member: key}``.

    When the same member is listed under several keys, the key that comes
    last in ``d``'s iteration order wins.
    """
    return {member: key for key, members in d.items() for member in members}
        
%matplotlib inline
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def grouped_shap(shap_vals, features, groups):
    """Sum per-feature SHAP values into per-group SHAP values.

    Parameters
    ----------
    shap_vals : 2-D array-like with one column per entry of ``features``.
    features : sequence of feature names labelling the columns of ``shap_vals``.
    groups : dict mapping a group name to the iterable of feature names in it.

    Returns
    -------
    pandas.DataFrame with one row per row of ``shap_vals`` and one column per
    group. Features that belong to no group get a missing group label and are
    therefore left out of the sums by ``groupby``.
    """
    feature_to_group = revert_dict(groups)
    # Work feature-major (one row per feature) so features can be grouped by label.
    by_feature = pd.DataFrame(shap_vals, columns=pd.Index(features, name='features')).T
    group_labels = by_feature.index.to_series().map(feature_to_group)
    by_feature['group'] = group_labels.values
    per_group = by_feature.groupby('group').sum()
    # Back to sample-major orientation.
    return per_group.T

# Rain in Australia

## Subpopulations

In [None]:
# Load the weather data and plot the cumulative share of rows over the sorted
# Date values, to see how the observations are spread across time.
data = pd.read_csv('/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')
plt.figure(figsize=(10, 4))
date_share = data.Date.value_counts(normalize=True).sort_index()
date_share.cumsum().plot();

In [None]:
# Feature set: every raw column except the date and the label. `features` is
# built before `target` is added below, so `target` itself is never a feature.
# NOTE(review): all remaining columns are fed to the model; confirm none of
# them is derived from next-day rainfall (that would leak the label).
features = data.drop(['Date', 'RainTomorrow'], axis=1).columns.tolist()
cat_features = data[features].select_dtypes('object').columns.tolist()
data['target'] = (data['RainTomorrow']=='Yes').astype(int)

# Rows without a label are discarded; otherwise they would count as target == 0.
labelled = data.dropna(subset=['RainTomorrow'])

# Out-of-time split. Date is compared as text (assumes YYYY-MM-DD strings).
# The test side uses >= so that rows dated exactly 2015-01-01 are no longer
# silently excluded from both train and test.
train = labelled.query("Date < '2015-01-01'")
test  = labelled.query("Date >= '2015-01-01'")

# Missing values in every feature column are replaced by the sentinel -99.
# The imputed matrices are built once and reused for fitting and scoring.
train_filled = train[features].fillna(-99)
test_filled = test[features].fillna(-99)

clf = CatBoostClassifier(iterations=30)
clf.fit(train_filled, train['target'], cat_features=cat_features, verbose=False)

train_auc = roc_auc_score(train['target'], clf.predict_proba(train_filled)[:,1])
test_auc  = roc_auc_score(test['target'],  clf.predict_proba(test_filled)[:,1])
print("Train AUC: ", train_auc)
print("Out-of-time AUC: ", test_auc)

In [None]:
from shap import TreeExplainer
# Explain the fitted classifier on the out-of-time rows, using the same -99
# imputation the model was scored with.
exp = TreeExplainer(clf)

explained_rows = test[features].fillna(-99)
shap_vals = exp.shap_values(explained_rows)
# shap_df gets a fresh 0..n-1 index, so it lines up with `test` by position only.
feature_index = pd.Index(features, name='features')
shap_df = pd.DataFrame(shap_vals, columns=feature_index)
# The plot receives the raw (un-imputed) feature values.
shap.summary_plot(shap_vals, test[features])

In [None]:
# Split the test predictions into four equal-frequency bins and draw one SHAP
# summary panel per bin.
preds = pd.Series(clf.predict_proba(test[features].fillna(-99))[:, 1])
# Five edges -> four bins (quartiles), even though the variable is called `quintiles`.
quintiles = pd.qcut(preds, np.linspace(0, 1, 5), labels=np.arange(4))

fig, ax = plt.subplots(1, 4, figsize=(36, 6))
for q, panel in enumerate(ax):
    in_bin = (quintiles == q).values  # positional mask shared by SHAP rows and features
    plt.sca(panel)
    shap.summary_plot(
        shap_vals[in_bin],
        test.loc[in_bin, features],
        show=False,
        plot_size=None,
        color_bar=False,
        max_display=10,
    )
    plt.title(f"Quartile {q+1} of predictions")

In [None]:
# One SHAP summary panel per calendar year of the test period.
years = [2015, 2016, 2017]
# The year is the piece of the Date string before the first '-'.
year = test.Date.str.split('-').str[0].astype(int)
fig, ax = plt.subplots(1, 3, figsize=(28, 6))
for panel, b in zip(ax, years):
    idx = (year == b).values
    plt.sca(panel)
    shap.summary_plot(
        shap_vals[idx],
        test.loc[idx, features],
        show=False,
        plot_size=None,
        color_bar=False,
        max_display=10,
    )
    plt.title(f"Year {b}")

In [None]:
# SHAP summary restricted to test rows whose MinTemp exceeds 25.
hot_mask = test.MinTemp.gt(25).to_numpy()
hot_shap = shap_df[hot_mask]
hot_feats = test.loc[hot_mask, features]

shap.summary_plot(hot_shap.values, hot_feats, show=False, max_display=10)
plt.title("Shap for hot days (minTemp > 25 Celsius) ")

In [None]:
# SHAP summary restricted to test rows whose MaxTemp is below 10.
cold_mask = test.MaxTemp.lt(10).to_numpy()
cold_shap = shap_df[cold_mask]
cold_feats = test.loc[cold_mask, features]

shap.summary_plot(cold_shap.values, cold_feats, show=False, max_display=10)
plt.title("Shap for cold days (maxTemp < 10 Celsius)")

In [None]:
# Rows where Pressure3pm's SHAP contribution is below -0.5.
flag = (shap_df.Pressure3pm < -0.5).values
shap_selected = shap_df.loc[flag]
feats_selected = test.loc[flag, features]

shap.summary_plot(shap_selected.values, feats_selected, show=False, max_display=10)

In [None]:
# Same Pressure3pm filter as before, further restricted to rows whose target is 1.
low_pressure_shap = (shap_df.Pressure3pm < -0.5).values
is_positive = (test.target == 1).values
flag = low_pressure_shap & is_positive
shap_selected = shap_df.loc[flag]
feats_selected = test.loc[flag, features]

shap.summary_plot(shap_selected.values, feats_selected, show=False, max_display=10)

In [None]:
# Rows where Humidity3pm's SHAP contribution lies strictly between -0.1 and 0.1.
humidity_shap = shap_df.Humidity3pm
flag = ((humidity_shap > -0.1) & (humidity_shap < 0.1)).values
shap_selected = shap_df.loc[flag]
feats_selected = test.loc[flag, features]

shap.summary_plot(shap_selected.values, feats_selected, show=False, max_display=10)

# UK Accidents

In [None]:
def _read_dft_table(filename):
    """Load one DfT accident CSV and index it by a string ``Accident_Index``.

    Malformed lines are skipped without a warning. ``on_bad_lines='skip'``
    (pandas >= 1.3) replaces ``error_bad_lines=False, warn_bad_lines=False``,
    which were removed in pandas 2.0 and make ``read_csv`` raise there.
    """
    table = pd.read_csv(f"/kaggle/input/dft-accident-data/{filename}",
                        delimiter=',', on_bad_lines='skip')
    # Every key is cast to str and suffixed with 'g', identically in all three
    # tables, so the join keys still match each other.
    # NOTE(review): presumably this forces the key to stay a plain string - confirm.
    table['Accident_Index'] = table['Accident_Index'].astype(str) + 'g'
    return table.set_index('Accident_Index')


cas = _read_dft_table("Casualties0515.csv")
veh = _read_dft_table("Vehicles0515.csv")
acc = _read_dft_table("Accidents0515.csv")

In [None]:
# Inner-join accidents with their casualties, then with their vehicles, on the
# accident key. Clashing column names from the right-hand table get a suffix.
# NOTE(review): the vehicle join matches on the accident only, so each casualty
# row is paired with every vehicle of its accident - confirm this is intended.
acc_with_cas = acc.join(cas, on=["Accident_Index"], how='inner', rsuffix='cas')
joined = acc_with_cas.join(veh, on=["Accident_Index"], how='inner', rsuffix='veh')

# Features: every column that is not text-typed, not a severity field and not
# the duplicated vehicle reference.
obj_cols = joined.select_dtypes('object').columns.tolist()
non_feature_cols = ['Vehicle_Referenceveh', 'Casualty_Severity', 'Accident_Severity'] + obj_cols
features = [col for col in joined.columns.drop(non_feature_cols) if col != 'target']

# Binary label: 1 when Casualty_Severity is below 3.
joined['target'] = joined.Casualty_Severity.lt(3).astype(int)
# NOTE(review): parsed without an explicit format/dayfirst; confirm the raw
# Date strings are not day-first, otherwise day and month may be swapped.
joined['Date'] = pd.to_datetime(joined['Date'])

In [None]:
# Out-of-time split. The test side uses >= so that rows dated exactly
# 2012-01-01 are no longer silently excluded from both train and test.
train = joined.query("Date < '2012-01-01'").dropna(subset=['target'])
test  = joined.query("Date >= '2012-01-01'").dropna(subset=['target'])
# 'YYYY-MM' label used later to aggregate and select by month.
test['month'] = test.Date.dt.strftime('%Y-%m')

# Missing values are replaced by the sentinel -99. The imputed matrices are
# built once and reused for fitting and scoring.
train_filled = train[features].fillna(-99)
test_filled = test[features].fillna(-99)

clf = CatBoostClassifier(iterations=30)
clf.fit(train_filled, train['target'], verbose=False)

train_auc = roc_auc_score(train['target'], clf.predict_proba(train_filled)[:,1])
test_auc  = roc_auc_score(test['target'],  clf.predict_proba(test_filled)[:,1])
print("Train AUC: ", train_auc)
print("Out-of-time AUC: ", test_auc)

In [None]:
from shap import TreeExplainer
exp = TreeExplainer(clf)
# SHAP values are computed on a random subset of the test rows. The seed is
# fixed so that this sample, and every figure derived from it in later cells,
# is identical on each run. The size is capped at the number of available rows
# so that sampling cannot fail on a small test set.
test_shap = test.sample(n=min(10000, len(test)), random_state=0)

shap_vals = exp.shap_values(test_shap[features].fillna(-99))
# shap_df gets a fresh 0..n-1 index, so it lines up with test_shap by position only.
shap_df = pd.DataFrame(shap_vals, columns=pd.Index(features, name='features'))
shap.summary_plot(shap_vals, test_shap[features])

In [None]:
# Split the sampled predictions into five equal-frequency bins and draw one
# SHAP summary panel per bin.
preds = pd.Series(clf.predict_proba(test_shap[features].fillna(-99))[:, 1])
quintiles = pd.qcut(preds, np.linspace(0, 1, 6), labels=np.arange(5))

# Feature names are cut to 18 characters for the plot labels.
short_names = [f[:18] for f in features]

fig, ax = plt.subplots(1, 5, figsize=(42, 5))
for q, panel in enumerate(ax):
    in_bin = (quintiles == q).values
    plt.sca(panel)
    shap.summary_plot(
        shap_vals[in_bin],
        test_shap.loc[in_bin, features].values,
        short_names,
        show=False,
        plot_size=None,
        color_bar=False,
        max_display=10,
    )
    plt.title(f"Quintile {q+1} of predictions")

In [None]:
# SHAP summary for sampled rows with Casualty_Type == 0.
flag = test_shap.Casualty_Type.eq(0).to_numpy()
shap_selected = shap_df.loc[flag]
feats_selected = test_shap.loc[flag, features]

shap.summary_plot(shap_selected.values, feats_selected, show=False, max_display=10)

In [None]:
# SHAP summary for sampled rows with Casualty_Type == 11.
flag = test_shap.Casualty_Type.eq(11).to_numpy()
shap_selected = shap_df.loc[flag]
feats_selected = test_shap.loc[flag, features]

shap.summary_plot(shap_selected.values, feats_selected, show=False, max_display=10)

In [None]:
# SHAP summary for sampled rows with Casualty_Type == 5.
flag = test_shap.Casualty_Type.eq(5).to_numpy()
shap_selected = shap_df.loc[flag]
feats_selected = test_shap.loc[flag, features]

shap.summary_plot(shap_selected.values, feats_selected, show=False, max_display=10)

In [None]:
# Compare the mean SHAP value of each feature between Casualty_Type 5 rows
# (motorcycle) and Casualty_Type 0 rows (pedestrian), and plot the 15 largest
# absolute differences. Casualty_Type itself is left out of the chart.
plt.figure(dpi=150)
is_motorcycle = (test_shap.Casualty_Type == 5).values
is_pedestrian = (test_shap.Casualty_Type == 0).values
avg_motorcycle = shap_df.loc[is_motorcycle, features].mean()
avg_pedestrian = shap_df.loc[is_pedestrian, features].mean()
diff = avg_motorcycle - avg_pedestrian

ranked_features = diff.abs().sort_values().index.drop('Casualty_Type')
top_features = ranked_features[-15:].tolist()
diff[top_features].plot.barh(ax=plt.gca())
plt.xlabel("More important for pedestrians | More important for motorcycle")
plt.xlim(-.17, .17)
# Faint horizontal bands drawn at odd y positions.
stripe_rows = np.arange(15)//2*2+1
plt.hlines(stripe_rows, -.17, .17, alpha=.06, linewidth=13, color='b')
sns.despine(left=True)
plt.ylabel('')

In [None]:
# Rows where Number_of_Casualties has a SHAP contribution above 0.5.
flag = shap_df['Number_of_Casualties'].gt(0.5).to_numpy()
shap_selected = shap_df.loc[flag]
feats_selected = test_shap.loc[flag, features]

shap.summary_plot(shap_selected.values, feats_selected, show=False, max_display=10)

In [None]:
# Rows where Vehicle_Manoeuvre has a SHAP contribution above 0.3.
flag = shap_df['Vehicle_Manoeuvre'].gt(0.3).to_numpy()
shap_selected = shap_df.loc[flag]
feats_selected = test_shap.loc[flag, features]

shap.summary_plot(shap_selected.values, feats_selected, show=False, max_display=10)

In [None]:
# Score every out-of-time row; column 1 of predict_proba is stored as 'prediction'.
test_proba = clf.predict_proba(test[features].fillna(-99))
test['prediction'] = test_proba[:, 1]

In [None]:
# Mean predicted probability per month over the test period.
plt.figure(dpi=120)
monthly_prediction = test.groupby('month').prediction.mean().sort_index()
monthly_prediction.plot(label='predicted deaths', color='r')
# test.groupby('month').target.mean().sort_index().plot(label='true', color='k', linestyle='--')
plt.legend(frameon=False)
sns.despine()
frame = plt.gca()
# Draw the remaining spines only over fixed, hand-picked ranges.
frame.spines['left'].set_bounds(0.105, 0.135)
frame.spines['bottom'].set_bounds(0, 35)
plt.xticks(rotation=15)

In [None]:
# Monthly mean prediction and monthly mean `target` on the primary axis, plus
# their ratio (in percent) on a twin y-axis.
# The statement order matters: several calls act on pyplot's current axes.
plt.figure(dpi=120)
ax=plt.gca()
test.groupby('month').prediction.mean().sort_index().plot(label='predicted deaths', color='r')
test.groupby('month').target.mean().sort_index().plot(label='deaths', color='k', linestyle='--', linewidth=.6)
# Mean prediction divided by mean target per month, scaled by 100.
ratio = (test.groupby('month').prediction.mean().sort_index()/test.groupby('month').target.mean().sort_index())*100
plt.legend(frameon=False, loc='upper left')
ax.spines['left'].set_bounds(0.105, 0.135)
ax.spines['bottom'].set_bounds(0,35)

# Second y-axis sharing the same x-axis, used for the ratio curve.
tax=plt.gca().twinx()
ratio.plot(ax=tax, label='ratio')
# Ticks and dotted guide lines at the truncated min, 100 and truncated max of the ratio.
tax.set_yticks([int(ratio.min()), 100, int(ratio.max())])
tax.hlines([int(ratio.min()), 100, int(ratio.max())], 0, 35, linewidth=0.5, linestyles='dotted')
tax.set_ylim(50, 120)
# Second legend call, issued after the twin axis has been created.
plt.legend(frameon=False)

# Remove the top spine but keep the right one (it carries the ratio scale).
sns.despine(right=False)

tax.spines['right'].set_bounds(int(ratio.min()), int(ratio.max()))
# Zero-length bounds - presumably to make these two spines invisible; TODO confirm.
ax.spines['right'].set_bounds(50,50)
tax.spines['left'].set_bounds(100,100)
tax.spines['bottom'].set_bounds(0,35)

plt.xticks(rotation=15)

In [None]:
# Display the mean prediction per month, ordered by month label.
test.groupby('month')['prediction'].mean().sort_index()

In [None]:
# Change in the mean SHAP value of each feature from 2013-04 to 2013-05,
# computed on the sampled rows.
in_month_before = (test_shap["month"] == '2013-04').values
in_month_at = (test_shap["month"] == '2013-05').values
bef = shap_df[in_month_before].mean()
at = shap_df[in_month_at].mean()

diff = at - bef

In [None]:
# Bar chart of the 15 features with the largest absolute change in `diff`.
# Casualty_Type is left out of the chart.
plt.figure(dpi=150)
ranked_features = diff.abs().sort_values().index.drop('Casualty_Type')
top_features = ranked_features[-15:].tolist()
diff[top_features].plot.barh(ax=plt.gca())
plt.xlabel("Lesser risk at event | Higher risk at event")
plt.xlim(-.17, .17)
# Faint horizontal bands drawn at odd y positions.
stripe_rows = np.arange(15)//2*2+1
plt.hlines(stripe_rows, -.17, .17, alpha=.06, linewidth=13, color='b')
sns.despine(left=True)
plt.ylabel('')

In [None]:
# Histogram of Number_of_Casualties for the test rows of month 2013-05.
test.loc[test.month == '2013-05', 'Number_of_Casualties'].plot.hist()

In [None]:
# Display the 2013-05 test rows that report more than 50 casualties.
test[(test.month == '2013-05') & (test.Number_of_Casualties > 50)]

In [None]:
# Counterfactual copy of the test set: for rows of month 2013-05, replace
# Number_of_Casualties and Number_of_Vehicles by their 2013-04 means.
test_changed = test.copy()
is_event_month = test_changed['month'] == '2013-05'
is_reference_month = test_changed['month'] == '2013-04'

# Dedicated names are used for the two means so that the `cas` / `veh`
# DataFrames loaded earlier in the notebook are not overwritten with floats.
reference_casualties = test_changed.loc[is_reference_month, 'Number_of_Casualties'].mean()
reference_vehicles = test_changed.loc[is_reference_month, 'Number_of_Vehicles'].mean()

# Vectorised replacement (instead of a row-wise DataFrame.apply): values are
# swapped where the mask is True and kept elsewhere.
test_changed['Number_of_Casualties'] = test_changed['Number_of_Casualties'].mask(is_event_month, reference_casualties)
test_changed['Number_of_Vehicles'] = test_changed['Number_of_Vehicles'].mask(is_event_month, reference_vehicles)

In [None]:
# Re-score the modified test set with the same model and the same -99 imputation.
changed_proba = clf.predict_proba(test_changed[features].fillna(-99))
test_changed['prediction'] = changed_proba[:, 1]

In [None]:
# Monthly mean prediction on the modified test set against the original one.
plt.figure(dpi=120)
changed_monthly = test_changed.groupby('month').prediction.mean().sort_index()
original_monthly = test.groupby('month').prediction.mean().sort_index()
changed_monthly.plot(label='predicted deaths after fixing variables', color='purple')
original_monthly.plot(label='predicted deaths', color='r', linestyle='--', linewidth=.9)
# test.groupby('month').target.mean().sort_index().plot(label='true', color='k', linestyle='--')
plt.legend(frameon=False, loc='upper left')
sns.despine()
frame = plt.gca()
# Draw the remaining spines only over fixed, hand-picked ranges.
frame.spines['left'].set_bounds(0.105, 0.135)
frame.spines['bottom'].set_bounds(0, 35)
plt.xticks(rotation=15);