In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import shap
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import roc_auc_score
import os
# Kaggle starter boilerplate: print the full path of every file found under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

from itertools import repeat, chain
def revert_dict(d):
    """Invert a ``{key: iterable_of_items}`` mapping into ``{item: key}``.

    Every item of every iterable becomes a key that points back at the key it
    was listed under. When an item is listed under several keys, the key that
    comes last in ``d``'s iteration order wins.
    """
    pairs = chain.from_iterable(zip(members, repeat(key)) for key, members in d.items())
    return dict(pairs)
        
%matplotlib inline
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def grouped_shap(shap_vals, features, groups):
    """Aggregate per-feature SHAP values into per-group SHAP values.

    Parameters
    ----------
    shap_vals : array-like, shape (n_samples, n_features)
        SHAP values, one column per entry of ``features``.
    features : sequence of str
        Feature names, in the column order of ``shap_vals``.
    groups : dict
        Maps each group name to the iterable of feature names it contains.

    Returns
    -------
    pandas.DataFrame
        One row per sample and one column per group (the column axis is named
        'group'). Each cell is the sum of the SHAP values of that group's
        features for that sample.

    Warns
    -----
    UserWarning
        When some features belong to no group. Their SHAP values are left out
        of every group total (as before), so the grouped values no longer sum
        to the model output; the warning makes that visible instead of silent.
    """
    import warnings

    # feature -> group lookup (the same inversion the notebook's `revert_dict`
    # helper performs), built inline so the function is self-contained.
    groupmap = {feat: name for name, members in groups.items() for feat in members}

    ungrouped = [feat for feat in features if feat not in groupmap]
    if ungrouped:
        warnings.warn(
            "grouped_shap: features not assigned to any group are excluded "
            f"from the grouped SHAP values: {ungrouped}"
        )

    # Transpose so that features are rows, tag each row with its group,
    # sum rows per group, then transpose back to samples x groups.
    shap_Tdf = pd.DataFrame(shap_vals, columns=pd.Index(features, name='features')).T
    shap_Tdf['group'] = shap_Tdf.reset_index().features.map(groupmap).values
    shap_grouped = shap_Tdf.groupby('group').sum().T
    return shap_grouped

In [None]:
data = pd.read_csv('/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')

# Cumulative fraction of rows observed up to each date.
plt.figure(figsize=(10, 4))
date_share = data['Date'].value_counts(normalize=True).sort_index()
date_share.cumsum().plot();

In [None]:
# Model inputs: every column except the date and the label columns. 'target' is
# excluded explicitly so that re-running this cell (after `data['target']` has
# been added just below) cannot leak the label into the feature list.
features = data.drop(['Date', 'RainTomorrow', 'target'], axis=1, errors='ignore').columns.tolist()
cat_features = data[features].select_dtypes('object').columns.tolist()
data['target'] = (data['RainTomorrow']=='Yes').astype(int)

# Out-of-time split on the date string. The test side uses `>=` so that rows
# dated exactly 2015-01-01 land in the test set instead of being dropped from
# both sets. Rows without a RainTomorrow label are discarded.
train = data.query("Date < '2015-01-01'").dropna(subset=['RainTomorrow'])
test  = data.query("Date >= '2015-01-01'").dropna(subset=['RainTomorrow'])

# Missing values are replaced by the sentinel -99 before fitting and scoring.
clf = CatBoostClassifier(iterations=30)
clf.fit(train[features].fillna(-99), train['target'], cat_features=cat_features, verbose=False)

train_auc = roc_auc_score(train['target'], clf.predict_proba(train[features].fillna(-99))[:,1])
test_auc  = roc_auc_score(test['target'],  clf.predict_proba(test[features].fillna(-99))[:,1] )
print("Train AUC: ", train_auc)
print("Out-of-time AUC: ", test_auc)

In [None]:
from shap import TreeExplainer

exp = TreeExplainer(clf)

# SHAP values for the out-of-time rows, computed on the same -99-filled inputs
# the model was scored on; the plot colours points by the unfilled feature values.
shap_vals = exp.shap_values(test[features].fillna(-99))
feature_axis = pd.Index(features, name='features')
shap_df = pd.DataFrame(data=shap_vals, columns=feature_axis)
shap.summary_plot(shap_vals, features=test[features])

In [None]:
# Recompute SHAP values on the training rows. This rebinds `shap_vals` and
# `shap_df`, so every later cell that uses them refers to the training set.
shap_vals = exp.shap_values(train[features].fillna(-99))
feature_axis = pd.Index(features, name='features')
shap_df = pd.DataFrame(data=shap_vals, columns=feature_axis)
shap.summary_plot(shap_vals, features=train[features])

In [None]:

# Grouping by time of measurement, derived from the feature names themselves.
groups_by_time = {
    '3pm': [name for name in features if '3pm' in name],
    '9am': [name for name in features if '9am' in name],
    'not_time_based': [name for name in features if not ('9am' in name or '3pm' in name)],
}

# Hand-written grouping by kind of measurement.
groups_by_type = {
    'humidity_and_rain': ['Rainfall', 'Evaporation', 'Humidity9am', 'Humidity3pm', 'RainToday'],
    'temperature': ['MinTemp', 'MaxTemp', 'Temp9am', 'Temp3pm'],
    'sun_and_clouds': ['Cloud9am', 'Cloud3pm', 'Sunshine'],
    'wind_and_pressure': [
        'WindGustDir', 'WindGustSpeed',
        'WindDir9am', 'WindDir3pm',
        'WindSpeed9am', 'WindSpeed3pm',
        'Pressure9am', 'Pressure3pm',
    ],
    'location': ['Location'],
}

# Inverse lookups: feature name -> group name.
maptime = revert_dict(groups_by_time)
maptype = revert_dict(groups_by_type)

In [None]:
# Collapse the per-feature SHAP values once per grouping scheme.
shap_time, shap_type = (
    grouped_shap(shap_vals, features, grouping)
    for grouping in (groups_by_time, groups_by_type)
)

In [None]:
# Group names are passed as `feature_names` (as the accident-data cells do).
# There are no underlying feature values for a group, so `features` is left unset.
shap.summary_plot(shap_time.values, feature_names=shap_time.columns)

In [None]:
# Group names are passed as `feature_names` (as the accident-data cells do).
# There are no underlying feature values for a group, so `features` is left unset.
shap.summary_plot(shap_type.values, feature_names=shap_type.columns)

In [None]:
# Positive-class probability for every training row. The Series gets a fresh
# 0..n-1 index, so it lines up with the rows of `shap_vals` by position.
train_proba = clf.predict_proba(train[features].fillna(-99))
preds = pd.Series(train_proba[:, 1])

In [None]:
# Five equal-frequency bins of the predictions, labelled 0 (lowest) to 4 (highest).
quintiles = pd.qcut(preds, q=5, labels=np.arange(5))

In [None]:
# One SHAP summary panel per prediction quintile, drawn side by side.
fig, ax = plt.subplots(1, 5, figsize=(36, 7))
for q, panel in enumerate(ax):
    in_quintile = (quintiles == q).values
    plt.sca(panel)  # summary_plot draws on the current axes
    shap.summary_plot(
        shap_vals[in_quintile],
        train.loc[in_quintile, features],
        show=False,
        plot_size=None,
        color_bar=False,
        max_display=6,
    )

In [None]:
# Same per-quintile SHAP panels as the previous cell, now with a title on each.
fig, ax = plt.subplots(1, 5, figsize=(36, 6))
for q, panel in enumerate(ax):
    in_quintile = (quintiles == q).values
    plt.sca(panel)  # summary_plot draws on the current axes
    shap.summary_plot(
        shap_vals[in_quintile],
        train.loc[in_quintile, features],
        show=False,
        plot_size=None,
        color_bar=False,
        max_display=6,
    )
    plt.title(f"Quintile {q} of predictions")

In [None]:
# One SHAP summary panel for each of the years 2009, 2011 and 2013.
biens = np.arange(2009, 2015, 2)
# Dates are 'YYYY-...' strings here, so the year is the part before the first dash.
year = train['Date'].str.split('-').str[0].astype(int)
fig, ax = plt.subplots(1, 3, figsize=(36, 10))
for i, b in enumerate(biens):
    idx = (year == b).values
    plt.sca(ax[i])
    shap.summary_plot(
        shap_vals[idx],
        train.loc[idx, features],
        show=False,
        plot_size=None,
        color_bar=False,
    )
    plt.title(f"Year {b}")

In [None]:
# Distribution of the maximum temperature in the training rows.
train['MaxTemp'].plot.hist(bins=100)

In [None]:
# Distribution of the 3pm humidity in the training rows.
train['Humidity3pm'].plot.hist(bins=100)

In [None]:
# SHAP summary restricted to hot days.
# NOTE: despite the `rained_*` names, the mask selects on temperature (MinTemp > 25), not on rain.
rained_flag = train['MinTemp'].gt(25).values
rained_shap = shap_df.loc[rained_flag]
rained_feats = train.loc[rained_flag, features]

shap.summary_plot(rained_shap.values, rained_feats, show=False)
plt.title("Shap for hot days (minTemp > 25 Celsius) ")

In [None]:
# SHAP summary restricted to cold days.
# NOTE: despite the `rained_*` names, the mask selects on temperature (MaxTemp < 10), not on rain.
rained_flag = train['MaxTemp'].lt(10).values
rained_shap = shap_df.loc[rained_flag]
rained_feats = train.loc[rained_flag, features]

shap.summary_plot(rained_shap.values, rained_feats, show=False)
plt.title("Shap for cold days (maxTemp < 10 Celsius)")

In [None]:
# Heatmap of |correlation| between per-feature SHAP columns, with features
# ordered from highest to lowest mean |SHAP|. WindGustDir is left out.
plt.figure(figsize=(9, 9))
ascending_importance = shap_df.abs().mean().sort_values().index.drop("WindGustDir")
feat_order = ascending_importance.tolist()
feat_order.reverse()
abs_corr = shap_df.corr().abs()
sns.heatmap(abs_corr.loc[feat_order, feat_order], cbar=False)

In [None]:
import seaborn as sns

# Hierarchically clustered view of |correlation| between SHAP columns, WindGustDir excluded.
shap_without_gust_dir = shap_df.drop(columns="WindGustDir")
sns.clustermap(shap_without_gust_dir.corr().abs())

# Testing UK accidents data

In [None]:
import warnings

# Load the three accident tables, silently skipping malformed rows.
# `error_bad_lines=False, warn_bad_lines=False` was deprecated in pandas 1.3 and
# removed in pandas 2.0; `on_bad_lines='skip'` is the replacement with the same
# behaviour (bad lines are dropped without raising or warning).
cas = pd.read_csv("/kaggle/input/dft-accident-data/Casualties0515.csv",  delimiter=',', on_bad_lines='skip')
veh = pd.read_csv("/kaggle/input/dft-accident-data/Vehicles0515.csv",  delimiter=',', on_bad_lines='skip')
acc = pd.read_csv("/kaggle/input/dft-accident-data/Accidents0515.csv", delimiter=',', on_bad_lines='skip')

# Cast the join key to str and append a letter in all three tables alike.
# NOTE(review): presumably this keeps the key from being treated as a number — confirm.
cas['Accident_Index'] = cas['Accident_Index'].astype(str)+'g'
veh['Accident_Index'] = veh['Accident_Index'].astype(str)+'g'
acc['Accident_Index'] = acc['Accident_Index'].astype(str)+'g'

# Index every table by the accident key so they can be joined on it.
cas = cas.set_index('Accident_Index')
veh = veh.set_index('Accident_Index')
acc = acc.set_index('Accident_Index')

In [None]:
# Inner-join accidents with their casualties, then with their vehicles, on the
# accident key. Column names that clash get the suffix 'cas' / 'veh'.
joined = acc.join(cas, on=["Accident_Index"], how='inner', rsuffix='cas')
joined = joined.join(veh, on=["Accident_Index"], how='inner', rsuffix='veh')

In [None]:
# Model inputs: every non-object column except the severity columns (the label
# is derived from Casualty_Severity) and the duplicated vehicle reference.
obj_cols = joined.select_dtypes('object').columns.tolist()
non_features = ['Vehicle_Referenceveh', 'Casualty_Severity', 'Accident_Severity'] + obj_cols
# 'target' and 'Date' are filtered by name as well: on a re-run of this cell
# 'target' already exists and 'Date' is no longer object-typed, so neither
# would be caught by `obj_cols`.
features = [f for f in joined.columns.drop(non_features) if f not in ('target', 'Date')]

# Binary label: 1 when Casualty_Severity < 3, else 0.
joined['target'] = (joined.Casualty_Severity<3).astype(int)
# NOTE(review): the DfT accident files are expected to store dates as
# DD/MM/YYYY — confirm against the CSV. Without dayfirst=True such strings are
# read month-first where ambiguous (or rejected by newer pandas versions).
joined['Date'] = pd.to_datetime(joined.Date, dayfirst=True)

In [None]:
# Thematic grouping of the accident-model features: group name -> list of
# column names. Passed to `grouped_shap` to sum SHAP values per theme.
# A later cell rebinds `groups` to a finer-grained version in which 'accident'
# is split into 'accident_before' / 'accident_during' / 'accident_after'.
# The keys are used as plot labels, so their spelling is left untouched.
groups = {
    # Where the accident happened.
    'geografical': [ 
        'Location_Easting_OSGR',
        'Location_Northing_OSGR',
        'Longitude',
        'Latitude',
        'Junction_Location',
        'Urban_or_Rural_Area'],
    
    # Road and administrative attributes.
    'road_specs': [
        '1st_Road_Class',
        '1st_Road_Number',
        'Junction_Detail',
        'Junction_Control',
        '2nd_Road_Class',
        '2nd_Road_Number',
        'Road_Type',
        'Speed_limit',
        'Police_Force',
        'Local_Authority_(District)'],
    
    # Attributes of the accident event itself.
    'accident': [   
        'Number_of_Vehicles',
        'Number_of_Casualties',
        'Pedestrian_Crossing-Human_Control',
        'Pedestrian_Crossing-Physical_Facilities',
        'Towing_and_Articulation',
        'Vehicle_Manoeuvre',
        'Vehicle_Location-Restricted_Lane',
        'Skidding_and_Overturning',
        'Hit_Object_in_Carriageway',
        'Vehicle_Leaving_Carriageway',
        'Hit_Object_off_Carriageway',
        '1st_Point_of_Impact',
        'Carriageway_Hazards',
        'Casualty_Reference',
        'Casualty_Type',
        'Did_Police_Officer_Attend_Scene_of_Accident'],
    
    # Environmental conditions and day of week.
    'conditions': [
         'Light_Conditions',
         'Weather_Conditions',
         'Road_Surface_Conditions',
         'Special_Conditions_at_Site',
         'Day_of_Week'],
    
    # Attributes of the casualty.
    'victim_specs': [
         'Casualty_Class',
         'Sex_of_Casualty',
         'Age_of_Casualty',
         'Age_Band_of_Casualty',
         'Pedestrian_Location',
         'Pedestrian_Movement',
         'Car_Passenger',
         'Bus_or_Coach_Passenger',
         'Pedestrian_Road_Maintenance_Worker',
         'Casualty_Home_Area_Type'],
    
    # Attributes of the driver.
    'driver_specs': [
        'Journey_Purpose_of_Driver',
        'Sex_of_Driver',
        'Age_of_Driver',
        'Age_Band_of_Driver',
        'Driver_IMD_Decile',
        'Driver_Home_Area_Type'],
    
    # Attributes of the vehicle.
    'vehicle_specs': [ 
        'Vehicle_Type',
        'Was_Vehicle_Left_Hand_Drive?',
        'Engine_Capacity_(CC)',
        'Propulsion_Code',
        'Age_of_Vehicle',
        'Vehicle_Reference'],

}

In [None]:
# Number of joined rows per calendar year, in chronological order.
joined['Date'].dt.year.value_counts().sort_index()

In [None]:
# Out-of-time split on the parsed Date. The test side uses `>=` so that rows
# dated exactly 2012-01-01 land in the test set instead of being dropped from
# both sets.
train = joined.query("Date < '2012-01-01'").dropna(subset=['target'])
test  = joined.query("Date >= '2012-01-01'").dropna(subset=['target'])

# Missing values are replaced by the sentinel -99 before fitting and scoring.
clf = CatBoostClassifier(iterations=30)
clf.fit(train[features].fillna(-99), train['target'], verbose=False)

train_auc = roc_auc_score(train['target'], clf.predict_proba(train[features].fillna(-99))[:,1])
test_auc  = roc_auc_score(test['target'],  clf.predict_proba(test[features].fillna(-99))[:,1] )
print("Train AUC: ", train_auc)
print("Out-of-time AUC: ", test_auc)

In [None]:
from shap import TreeExplainer
exp = TreeExplainer(clf)
# Explain a random subset of the test rows. The seed makes the sample (and
# every plot derived from it) reproducible across runs, and the size is capped
# at the number of available rows so that sampling cannot fail on a small test set.
test_shap = test.sample(n=min(10000, len(test)), random_state=0)

shap_vals = exp.shap_values(test_shap[features].fillna(-99))
shap_df = pd.DataFrame(shap_vals, columns=pd.Index(features, name='features'))
shap.summary_plot(shap_vals, test_shap[features])

In [None]:
# Sum SHAP values within each thematic group and plot one row per group.
shap_grouped = grouped_shap(shap_vals, features, groups)
group_names = shap_grouped.columns
shap.summary_plot(shap_grouped.to_numpy(), feature_names=group_names)

In [None]:
# Finer-grained grouping that replaces the earlier `groups`: the members of
# the former 'accident' group are redistributed over 'accident_before',
# 'accident_during' and 'accident_after'; every other group is unchanged.
# The keys are used as plot labels, so their spelling is left untouched.
groups = {
    # Where the accident happened.
    'geografical': [ 
        'Location_Easting_OSGR',
        'Location_Northing_OSGR',
        'Longitude',
        'Latitude',
        'Junction_Location',
        'Urban_or_Rural_Area'],
    
    # Road and administrative attributes.
    'road_specs': [
        '1st_Road_Class',
        '1st_Road_Number',
        'Junction_Detail',
        'Junction_Control',
        '2nd_Road_Class',
        '2nd_Road_Number',
        'Road_Type',
        'Speed_limit',
        'Police_Force',
        'Local_Authority_(District)'],
    
    # First part of the former 'accident' group.
    'accident_before': [   
        'Pedestrian_Crossing-Human_Control',
        'Pedestrian_Crossing-Physical_Facilities',
        'Towing_and_Articulation',
        'Vehicle_Manoeuvre',
        'Vehicle_Location-Restricted_Lane',
        'Vehicle_Leaving_Carriageway'],
    
    # Second part of the former 'accident' group.
    'accident_during': [
        'Number_of_Vehicles',
        'Number_of_Casualties',
        'Hit_Object_in_Carriageway',
        'Hit_Object_off_Carriageway',
        'Skidding_and_Overturning',
        '1st_Point_of_Impact'],
    
    # Third part of the former 'accident' group.
    'accident_after': [
        'Carriageway_Hazards',
        'Casualty_Reference',
        'Casualty_Type',
        'Did_Police_Officer_Attend_Scene_of_Accident'],
    
    # Environmental conditions and day of week.
    'conditions': [
         'Light_Conditions',
         'Weather_Conditions',
         'Road_Surface_Conditions',
         'Special_Conditions_at_Site',
         'Day_of_Week'],
    
    # Attributes of the casualty.
    'victim_specs': [
         'Casualty_Class',
         'Sex_of_Casualty',
         'Age_of_Casualty',
         'Age_Band_of_Casualty',
         'Pedestrian_Location',
         'Pedestrian_Movement',
         'Car_Passenger',
         'Bus_or_Coach_Passenger',
         'Pedestrian_Road_Maintenance_Worker',
         'Casualty_Home_Area_Type'],
    
    # Attributes of the driver.
    'driver_specs': [
        'Journey_Purpose_of_Driver',
        'Sex_of_Driver',
        'Age_of_Driver',
        'Age_Band_of_Driver',
        'Driver_IMD_Decile',
        'Driver_Home_Area_Type'],
    
    # Attributes of the vehicle.
    'vehicle_specs': [ 
        'Vehicle_Type',
        'Was_Vehicle_Left_Hand_Drive?',
        'Engine_Capacity_(CC)',
        'Propulsion_Code',
        'Age_of_Vehicle',
        'Vehicle_Reference']
}
# Re-aggregate the SHAP values with the refined grouping and plot one row per group.
shap_grouped = grouped_shap(shap_vals, features, groups)
shap.summary_plot(shap_grouped.values, feature_names = shap_grouped.columns)

In [None]:
# Features whose mean absolute SHAP value is below 1e-3.
mean_abs_shap = shap_df.abs().mean()
unimportant = mean_abs_shap < 1e-3
unimportant_feats = unimportant.index[unimportant.values].tolist()

In [None]:
import seaborn as sns

# Heatmap of |Spearman correlation| between the SHAP columns that remain
# after dropping the unimportant features.
plt.figure(figsize=(9, 7), dpi=200)
informative_shap = shap_df.drop(columns=unimportant_feats)
sns.heatmap(informative_shap.corr(method='spearman').abs())

In [None]:
# Absolute Spearman correlation between SHAP columns (unimportant features
# removed), kept in `abcorr` for the cells below and shown as a clustered heatmap.
abcorr = shap_df.drop(columns=unimportant_feats).corr(method='spearman').abs()
sns.clustermap(data=abcorr, figsize=(13, 13))

In [None]:
# The 20 features with the largest sum of squared |correlation| over all features.
squared_corr_total = abcorr.pow(2).sum().sort_values()
highestcorrs = squared_corr_total.index[-20:].tolist()
sns.clustermap(abcorr.loc[highestcorrs, highestcorrs], figsize=(8,8))

In [None]:
# The 20 features with the strongest correlation to any *other* feature.
# Only the diagonal (each feature with itself) is zeroed before taking the
# maximum. Replacing every value equal to 1 would also erase genuine
# off-diagonal correlations of exactly 1, i.e. the strongest pairs of all.
is_diagonal = np.eye(len(abcorr), dtype=bool)
highestcorrs = abcorr.mask(is_diagonal, 0).max().sort_values()[-20:].index.tolist()
sns.clustermap(abcorr.loc[highestcorrs, highestcorrs], figsize=(8,8))

In [None]:
# Number of columns used as model inputs.
len(features)

In [None]:
# The 20 features with the highest mean |SHAP|, shown as a clustered correlation map.
ascending_mean_abs_shap = shap_df.abs().mean().sort_values()
important = ascending_mean_abs_shap.index[-20:].tolist()
sns.clustermap(abcorr.loc[important, important], figsize=(10,10))