# 2021 Kaggle Survey Analysis: A SHAP interpretation
## Team Members: 

## 0. Libraries and tools

In [None]:
# install libraries
# ! pip install shap
# ! pip install kaggle
# ! pip install tensorflow-data-validation
# ! pip install -i https://pypi-nightly.tensorflow.org/simple tensorflow-data-validation

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer

import shap

# import tensorflow as tf
# import tensorflow_data_validation as tfdv

## 1. Fundamental Descriptive Analysis

In [None]:
# Read the survey responses and discard the first data row in one step.
# NOTE(review): row 0 presumably holds the question text rather than an
# actual response -- confirm against the CSV.
df = pd.read_csv(
    '../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv'
).iloc[1:, :]
print(df.shape)

In [None]:
# def chop_question(s):
#     return s.split('-')[0]
# df.iloc[0 , :].apply(chop_question).unique().tolist()

In [None]:
# df.head()

In [None]:
# df.columns.to_list()
# NOTE(review): group_by_question is defined in the NEXT cell, so on a fresh
# kernel (Restart & Run All) this call raises NameError. Move this cell below
# the helper definitions.
group_by_question(4)

In [None]:
def group_by_question(q_num, q_letter = ''):
    """Return the columns of the global ``df`` that belong to one question.

    A column is selected when its name is exactly ``Q<q_num>`` or starts with
    ``Q<q_num>_<q_letter>``.

    Parameters
    ----------
    q_num : int or str
        Question number.
    q_letter : str, optional
        Sub-question letter appended after the underscore (default ``''``).

    Returns
    -------
    pandas.DataFrame
        The matching columns of ``df``, in their original order.
    """
    base = 'Q' + str(q_num)
    prefix = base + '_' + q_letter
    selected = [
        name for name in df.columns
        if name.startswith(prefix) or name == base
    ]
    return df[selected]

def value_counts_by_question(q_num):
    """Print the value counts (NaN included) of every column of a question.

    Parameters
    ----------
    q_num : int or str
        Question number passed on to ``group_by_question``.
    """
    answers = group_by_question(q_num)
    # Iterating a DataFrame yields its column labels.
    for name in answers:
        counts = answers[name].value_counts(dropna=False)
        print(counts)
        
def col_prefix(col):
    """Return the question prefix of a multi-part column name.

    Everything from the first ``'Part'`` (or, failing that, ``'OTHER'``)
    marker onwards is dropped, and surrounding underscores are stripped.
    Names without either marker are returned unchanged.

    The markers are the same ones ``is_one_hot_like`` tests for. Matching on
    the full marker instead of a single capital letter keeps names that merely
    contain a 'P' or an 'O' (e.g. ``'Position'``) from being truncated.

    Parameters
    ----------
    col : str
        Column name, e.g. ``'Q7_Part_1'`` or ``'Q7_OTHER'``.

    Returns
    -------
    str
        The prefix, e.g. ``'Q7'``.
    """
    # 'Part' is checked first, mirroring the original precedence.
    for marker in ('Part', 'OTHER'):
        if marker in col:
            return col.split(marker)[0].strip('_')
    return col

def one_hot_like_questions(df):
    """Return the column prefixes that occur more than once in ``df``.

    Each column name is reduced with ``col_prefix``; a prefix shared by
    several columns is reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names are inspected.

    Returns
    -------
    pandas.Index
        Prefixes with a count greater than one.
    """
    prefix_counts = df.columns.to_series().apply(col_prefix).value_counts()
    repeated = prefix_counts > 1
    return prefix_counts[repeated].index

def title_to_num(s):
    """Extract the question number from a column title.

    Only the text before the first underscore is considered; any ``'Q'``
    characters at its ends are stripped and the rest is parsed as an integer.

    Parameters
    ----------
    s : str
        Column title such as ``'Q7_Part_1'`` or ``'Q25'``.

    Returns
    -------
    int
        The question number.
    """
    # partition returns the whole string as the head when no '_' is present.
    head, _, _ = s.partition('_')
    return int(head.strip("Q"))

def one_hot_formating(series):
    """Convert an "answer or NaN" column into a 0/1 indicator.

    Every non-missing entry becomes 1 and every missing entry becomes 0,
    the same rule ``one_hot_formating_entry`` applies to a single value.

    Compared with mapping only the most frequent value to 1, this
    * does not raise on a column that is entirely missing,
    * never produces NaN for a less frequent answer, and
    * also accepts DataFrames and NumPy arrays, because ``pd.notna`` is
      element-wise on all of them.

    Parameters
    ----------
    series : pandas.Series (or DataFrame / ndarray)
        Values to encode.

    Returns
    -------
    Same container type as the input, with integer 0/1 values.
    """
    return pd.notna(series).astype(int)

def one_hot_formating_entry(s):
    """Encode a single value as 1 when it is present and 0 when it is missing.

    Parameters
    ----------
    s : scalar
        Value tested with ``pd.notna``.

    Returns
    -------
    int
        1 for a non-missing value, otherwise 0.
    """
    return 1 if pd.notna(s) else 0

def salary_range_to_mean(s):
    """Convert a salary-range label into a single number.

    Surrounding whitespace, leading/trailing ``'$'`` and ``'>'`` characters
    and thousands separators are removed, then the label is split on ``'-'``.
    The result is the mean of the bounds; a label with a single bound
    (e.g. ``'>$1,000,000'``) yields that bound.

    Parameters
    ----------
    s : str or missing value
        Label such as ``'$0-999'``, ``'1,000-1,999'`` or ``'>$1,000,000'``.

    Returns
    -------
    float
        Mean of the range bounds. Missing inputs are returned unchanged.

    Raises
    ------
    ValueError
        If a bound cannot be parsed as an integer.
    """
    if pd.isna(s):
        return s
    cleaned = s.strip().strip('$>').replace(',', '')
    bounds = [int(part) for part in cleaned.split('-')]
    # The mean of a single bound is the bound itself, so one expression
    # covers both cases and always returns a float.
    return float(np.mean(bounds))

def is_one_hot_like(s):
    """Tell whether a column name contains ``'Part'`` or ``'OTHER'``.

    Parameters
    ----------
    s : str
        Column name.

    Returns
    -------
    bool
        True when either marker occurs in ``s``.
    """
    return any(marker in s for marker in ('Part', 'OTHER'))

In [None]:
# Columns whose name contains 'Part' or 'OTHER'.
df.columns[[is_one_hot_like(name) for name in df.columns]]

In [None]:
# Frequency of each answer in the first Q25 column after converting the
# range labels to numbers, ordered by that number.
(
    group_by_question(25)
    .iloc[:, 0]
    .map(salary_range_to_mean)
    .value_counts()
    .to_frame()
    .sort_index()
)

In [None]:
# value_counts_by_question(16)

In [None]:
# df.columns.to_series().apply(col_prefix).value_counts().sort_index()

In [None]:
# Question prefixes shared by more than one column of the survey frame.
one_hot_like = one_hot_like_questions(df)
one_hot_like

In [None]:
# Compare the first Q7 column before and after 0/1 encoding.
q7 = group_by_question(7).iloc[:, 0]
new_q7 = one_hot_formating(q7)
print(q7.value_counts(dropna=False))
print(new_q7.value_counts(dropna=False))

## 2. Machine Learning Models

### 2.1 Pipeline Building

#### 2.1.1 Data Transformers

In [None]:
# Encoder for ordinary categorical columns. With handle_unknown="ignore",
# categories not seen during fit do not raise at transform time.
full_one_hot_encoder = OneHotEncoder(handle_unknown="ignore")

# Wraps one_hot_formating as a pipeline step. validate must stay False:
# validate=True makes scikit-learn convert the input to a 2-D NumPy array
# before calling the function, which breaks one_hot_formating because it
# relies on pandas methods.
semi_one_hot_encoder = FunctionTransformer(one_hot_formating, validate=False)
# Example: semi_one_hot_encoder.fit_transform(q7)


#### 2.1.2 Distinguish Columns

In [None]:
# Split the column names by whether they look like multi-part answers.
# The first non-multi-part column is skipped by the [1:] slice.
categorical_columns = [
    name for name in df.columns if not is_one_hot_like(name)
][1:]
semi_one_hot_columns = [name for name in df.columns if is_one_hot_like(name)]
total_columns = categorical_columns + semi_one_hot_columns

In [None]:
# Fit the full one-hot encoder on the multi-part columns and show the result.
full_one_hot_encoder.fit_transform(df.loc[:, semi_one_hot_columns])

In [None]:
# 0/1 indicator frame for the multi-part columns: 1 where an answer is
# present, 0 where it is missing. A single notna().astype(int) replaces the
# two full-frame boolean-mask assignments and yields integer columns instead
# of object columns. df itself is not modified.
semi_df = df[semi_one_hot_columns].notna().astype(int)
semi_df.head()