In [None]:
import pandas as pd 
import os
from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import shap
# Initialise SHAP's notebook JavaScript support (used by its interactive plots).
shap.initjs()

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Read the misconduct cases; the case-opened column is parsed as datetimes.
df = pd.read_csv(
    "/kaggle/input/air-marshal-misconduct/FederalAirMarshalMisconduct.csv",
    parse_dates=["Date Case Opened"],
)
display(df)

#### EDA
* Analyze input data & output/`target` frequencies
* Drop "Final Disposition" column (leak).
* Keep subset of target column (due to cardinality) 
* We could also strip the whitespace from column names if handling further

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Percentage of rows that are exact repeats of another row.
# Note: dropping Final Disposition results in more duplicates.
n_rows = df.shape[0]
n_repeats = n_rows - df.drop_duplicates().shape[0]
print("% duplicate rows", (100 * n_repeats / n_rows))

In [None]:
# Summarise the case-opened dates numerically (count, mean, min, quartiles, max).
# The `datetime_is_numeric` keyword only exists in pandas 1.1-1.5; pandas 2.0
# removed it (datetimes are always summarised numerically there) and rejects it
# with a TypeError, so fall back to a plain describe() in that case.
try:
    date_summary = df["Date Case Opened"].describe(datetime_is_numeric=True)
except TypeError:
    date_summary = df["Date Case Opened"].describe()
date_summary

In [None]:
# For every column: its name, number of distinct values, and value frequencies.
for col_name, col_values in df.items():
    print(col_name)
    print("# unique values/cardinality", col_values.nunique())
    print(col_values.value_counts())

* `Field Office` : a (very) few misspellings (lAX vs LAX). We'll clean by uppercasing all. 
    * This column could be attached to external data about the airports - how busy they are for example, how many international flights, specific carriers, hubs etc
    * We will clean the short input text columns and the target by normalizing their case (upper-casing `Field Office`, lower-casing `Allegation` and the target)
* `target` = `Final Disposition` after cleaning (e.g. merging of Suspension 1/3/X days together, and text lowercasing). 



## Feature Engineering
* Clean text
* Add basic calendar features
    * Much more can be done - adding external data, checking holidays, time-series/history of the office (never mind the individual in question), etc.

In [None]:
# Normalise the casing of the short text columns so that spelling variants such
# as "lAX" / "LAX" collapse into a single category.
df["Field Office "] = df["Field Office "].str.upper()
df["Field Office "] = df["Field Office "].fillna("") # fill missing values
print("Field Office unique values after casing",df["Field Office "].nunique())
df['Allegation '] = df['Allegation '].str.lower()
print("Allegation unique values after casing",df["Allegation "].nunique())

# Build the modelling target. Every later cell reads df["target"], but no cell
# created it, so a fresh "Restart & Run All" failed with a KeyError.
# This follows the cleaning described in the EDA notes: lowercase the raw
# "Final Disposition" text and merge all "Suspension <N> days" variants into
# the single label "suspension" (the value the later by-year plots filter on).
# NOTE(review): reconstructed from the notebook's prose - confirm it matches the
# original target definition.
df["target"] = (
    df["Final Disposition"]
    .str.lower()
    .str.strip()
    .str.replace(r"^suspension\b.*$", "suspension", regex=True)
)
print("target unique values after cleaning", df["target"].nunique())

In [None]:
# Basic calendar features derived from the case-opened timestamp.
opened = df["Date Case Opened"].dt
df["year"] = opened.year
df["month"] = opened.month
# isocalendar().week replaces the deprecated dt.week accessor.
df["week"] = opened.isocalendar().week.astype(int)

* Some simple feature/target analysis

* TODO: Join and divide by background target frequency, to see change in target freq, given feature/time. 
* apply this to other features for EDA

In [None]:
# Overall ("background") share of each target class, rounded to 4 decimals.
target_freq_background = df["target"].value_counts(normalize=True)
target_freq_background = target_freq_background.round(4)
target_freq_background

In [None]:
# Share of each target class within every year.
# TODO: join and divide by the background target frequency to see how the
# target frequency changes given a feature/time; apply to other features too.
df["target"].groupby(df["year"]).value_counts(normalize=True)

In [None]:
# Display the frame as it stands after the cleaning and feature-engineering cells.
df

## Model & Target
* Drop rarest target outcomes - keep those with more than 20 cases
* Simple catboost model as a baseline + Shap for explanations
    * Use pool and define categorical columns for catboost

In [None]:
## code from: https://stackoverflow.com/questions/29836836/how-do-i-filter-a-pandas-dataframe-based-on-value-counts
# Keep only the rows whose outcome occurs strictly more than MIN_TARGET_COUNT
# times; rarer outcomes (and rows with a missing target) are dropped.
MIN_TARGET_COUNT = 20
target_counts = df["target"].map(df["target"].value_counts())
df = df.loc[target_counts > MIN_TARGET_COUNT]

## drop original date col and the "raw" outcomes col
# A non-in-place drop with errors="ignore" keeps this cell idempotent: the
# original in-place drop raised a KeyError when the cell was run a second time.
df = df.drop(columns=["Date Case Opened", "Final Disposition"], errors="ignore")
print(df.shape)

In [None]:
# Report duplicate rows and missing values in the filtered modelling frame.
n_rows = df.shape[0]
n_unique_rows = df.drop_duplicates().shape[0]
print("% duplicate rows", (100 * (n_rows - n_unique_rows) / n_rows))
print("# rows left if we drop duplicates:", n_unique_rows)
print("missing vals \n", df.isna().sum())

# Duplicates are kept on purpose; to remove them instead, enable:
# df = df.drop_duplicates()

* Dropping "genuine" duplicates isn't ideal as we lose EDA info and recurring complaints that may be easy to classify.

In [None]:
# Separate the label column from the feature columns.
y = df["target"]
X = df.drop(columns="target")

In [None]:
# Hold out 20% of the rows for evaluation (random_state=42: the "stop word" of
# ML code ;)). Stratify on the target so every outcome class appears in the
# train and test sets in the same proportion; with an imbalanced multiclass
# target, an unstratified split can leave rare classes badly under-represented
# in the test set and make the classification report unreliable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
## Identify the categorical (object-dtype) feature columns. (We could also use
## text columns, but then we lose interpretability with the current CatBoost.)
## Selecting by dtype is good practice when there are lots of columns.
CAT_COLS = X.select_dtypes(include="O").columns.tolist()
CAT_COLS

In [None]:
# Baseline multiclass CatBoost classifier.
# Options tried but currently disabled: iterations=600, task_type="GPU".
model_params = dict(
    loss_function='MultiClass',
    eval_metric='AUC',
    verbose=False,
)
model = CatBoostClassifier(**model_params)

# We could improve the model with a validation split or CV to avoid overfitting.
# Alternative not used here: cat_features=['Field Office '] together with
# text_features=['Allegation '] - CatBoost doesn't yet support feature
# importance for its own text features, so we do without.
model.fit(X_train, y_train, cat_features=CAT_COLS, plot=True);

In [None]:
# Per-class precision / recall / F1 on the held-out test set.
y_pred = model.predict(X_test)
report_text = classification_report(y_test, y_pred)
print(report_text)

## Features importance
* Use SHAP for now - score is importance of each feature, given all other features in model. 
* SHAP doesn't seem to include anything "easy" for getting the class names; the per-class outputs follow the order of the fitted model's `classes_` attribute.

In [None]:
# SHAP values for the whole dataset (train and test rows together).
# Alternative not used here: cat_features=['Field Office '] together with
# text_features=['Allegation '].
explainer = shap.TreeExplainer(model)
shap_pool = Pool(X, y, cat_features=CAT_COLS)
shap_values = explainer.shap_values(shap_pool)

In [None]:
# Overall feature-importance plot across all classes.
shap.summary_plot(shap_values, features=X)

We can also get a shap plot of features importance per class (i.e a "binary" view):


In [None]:
# Map each model output index to its class label.
# y.unique() lists labels in order of first appearance in the data, which is
# not the order the model uses; the fitted CatBoost model exposes its class
# order via `classes_`, and its per-class outputs follow that order.
dict(enumerate(model.classes_))

In [None]:
## class "7" - retirement?
# NOTE(review): the comment above says class 7, but the code plots index 6 -
# the same index as the last cell of the notebook ("no further action?").
# Confirm the intended index against model.classes_ before relying on this plot.
shap.summary_plot(shap_values[6], X)

In [None]:
# Per-class SHAP summary for output index 3.
# Author's unverified guess at the label: resignation.
class_idx = 3
shap.summary_plot(shap_values[class_idx], features=X)

* We see that resignations/class 3 are much less likely to occur in later years!
* Let's plot the total number of resignations by year (note: the cell below currently filters on `suspension`, not resignation)

In [None]:
# Flag the rows whose outcome is "suspension" and plot their yearly count.
# NOTE(review): the preceding markdown talks about resignations, but this cell
# filters on "suspension" - confirm which outcome is intended.
df2 = df[["target","year"]].copy()
df2["is_target"] = (df2["target"]=="suspension").astype(int)
print("total suspension by year:")
# Aggregate only the numeric flag: summing the whole frame would also "sum"
# (i.e. concatenate) the string `target` column on pandas >= 2.
df2.groupby("year")[["is_target"]].sum().plot();

In [None]:
# Yearly fraction of cases flagged in df2["is_target"].
print("fraction of complaints handled as suspension by year:")
# Average only the numeric flag: groupby.mean() over the whole frame raises a
# TypeError on pandas >= 2 because of the string `target` column.
df2.groupby("year")[["is_target"]].mean().plot();

In [None]:
# Per-class SHAP summary for output index 5.
# Author's unverified guess at the label: removal.
class_idx = 5
shap.summary_plot(shap_values[class_idx], features=X)

In [None]:
# Per-class SHAP summary for output index 2.
# Author's label for this index: letter of counsel (not verified here).
class_idx = 2
shap.summary_plot(shap_values[class_idx], features=X)

In [None]:
# Per-class SHAP summary for output index 0.
# Author's label for this index: verbal counsel (not verified here).
class_idx = 0
shap.summary_plot(shap_values[class_idx], features=X)

In [None]:
# Per-class SHAP summary for output index 1.
# Author's unverified guess at the label: suspension.
class_idx = 1
shap.summary_plot(shap_values[class_idx], features=X)

In [None]:
# Per-class SHAP summary for output index 6.
# Author's unverified guess at the label: no further action.
class_idx = 6
shap.summary_plot(shap_values[class_idx], features=X)