In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
import plotly.graph_objects as go
import gc 
import math 
from tqdm import tqdm 
from wordcloud import WordCloud 
import scipy as sp
import nltk 
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE 
from sklearn.cluster import KMeans 
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import cross_val_score, train_test_split 
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Show every column when displaying a DataFrame (no column truncation).
pd.set_option("display.max_columns", None)

In [None]:
# header=1 takes the second line of the CSV as the column labels;
# the first column is then discarded before previewing the frame.
df_all = pd.read_csv(
    "../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv",
    header=1,
).iloc[:, 1:]
df_all.head()

In [None]:
# Columns used for the descriptive statistics: the first six columns of df_all,
# the employer-industry question, and the compensation question (kept last).
industry_question = "In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice"
compensation_question = "What is your current yearly compensation (approximate $USD)?"
stats_cols = [*df_all.columns[:6], industry_question, compensation_question]

# Keep only respondents who answered the compensation question.
answered = df_all[compensation_question].notna()
df_stats = df_all.loc[answered, stats_cols].reset_index(drop=True)
print(df_stats[compensation_question].value_counts())

# Visualization of annual income disparity in the group
---

In [None]:
def cln_salary(x) -> int:
    """Convert a salary bracket label into a single representative integer.

    "$", "," and ">" are stripped from the text form of ``x``. A range such as
    "1,000-1,999" becomes the rounded midpoint of its two bounds; a single value
    such as ">$1,000,000" becomes that value.

    Parameters
    ----------
    x : Any
        Bracket label; it is converted with ``str`` before parsing.

    Returns
    -------
    int
        Midpoint of the range, or the single value.

    Raises
    ------
    ValueError
        If the remaining text cannot be parsed as integers.
    """
    cleaned = str(x).replace("$", "").replace(",", "").replace(">", "")

    if "-" in cleaned:
        bounds = cleaned.split("-")
        low, high = int(bounds[0]), int(bounds[-1])
        # Parenthesise the sum: without it only the upper bound is halved.
        return round((low + high) / 2)
    return int(cleaned)
    
# Short names for the df_stats columns, in the order the columns were selected.
fix_col = ["age", "gender", "country", "education", "role", "experience", "employer", "compensation_categorical"]
# Restrict to the renamed columns first so that re-running this cell (after
# "compensation_numerical" has been appended) does not fail on a length mismatch.
df_stats = df_stats.iloc[:, :len(fix_col)].copy()
df_stats.columns = fix_col

# Numeric salary derived from each bracket label via cln_salary.
df_stats["compensation_numerical"] = df_stats["compensation_categorical"].apply(cln_salary).astype(int)
# normalize=True yields the share of respondents per bracket, not raw counts.
categorical = df_stats["compensation_categorical"].value_counts(normalize=True)
all_mean = df_stats["compensation_numerical"].mean()

fig = go.Figure(data=[
        go.Bar(name="compensation", y=categorical.values.ravel()),
])

fig.update_layout(
        barmode='group',
        title='Overall distribution of annual income',
        xaxis_title='compensation',
        yaxis_title='Proportion',
        # Bars are positioned 0..n-1; label them with the bracket names.
        xaxis = dict(
            tickmode = 'array',
            tickvals = list(range(len(categorical.index))),
            ticktext = categorical.index
        )
)

fig.show()

In [None]:
def viz_groupby_compesation(df: pd.DataFrame, col_name: str):
    """Show a bar chart of mean numeric compensation per category of ``col_name``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``col_name`` and a ``compensation_numerical`` column.
    col_name : str
        Column whose categories form the bars.
    """
    # Use the frame that was passed in rather than the notebook-global df_stats.
    group_means = df.groupby(col_name)["compensation_numerical"].mean()

    fig = go.Figure(data=[
        go.Bar(name=f"compensation with {col_name}", y=group_means.values.ravel()),
    ])
    fig.update_layout(
        barmode='group',
        title=f'Overall distribution of annual income by {col_name}',
        xaxis_title=col_name,
        yaxis_title='compensation',
        # Bars are positioned 0..n-1; label them with the category names.
        xaxis = dict(
            tickmode = 'array',
            tickvals = list(range(len(group_means.index))),
            ticktext = group_means.index
        )
    )
    fig.show()
    
    
def viz_groupby_compesation_location(df: pd.DataFrame):
    """Show a choropleth map of mean numeric compensation per country.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``country`` and ``compensation_numerical`` columns.
        Country values are matched by plotly using ``locationmode='country names'``.
    """
    # Use the frame that was passed in rather than the notebook-global df_stats.
    country_means = (
        df.groupby("country")["compensation_numerical"]
        .mean()
        .reset_index()
    )

    fig = px.choropleth(country_means,
                    locations = 'country',
                    color = "compensation_numerical",
                    locationmode = 'country names',
                    color_continuous_scale = 'viridis',
                    title =  "compensation by country",
                    range_color = [0, country_means["compensation_numerical"].max()])
    # Centre the title horizontally.
    fig.update(layout=dict(title=dict(x=0.5)))
    fig.show()
    

## x Age 

In [None]:
# Mean numeric compensation per age bracket.
viz_groupby_compesation(df_stats, "age")

## x gender 

In [None]:
# Mean numeric compensation per gender category.
viz_groupby_compesation(df_stats, "gender")

## x country 

In [None]:
# World map coloured by mean numeric compensation per country.
viz_groupby_compesation_location(df_stats)

## x role 

In [None]:
# Mean numeric compensation per job role.
viz_groupby_compesation(df_stats, "role")

## x education level 

In [None]:
# Mean numeric compensation per education level.
viz_groupby_compesation(df_stats, "education")

## x employer industry 

In [None]:
# Mean numeric compensation per employer industry.
viz_groupby_compesation(df_stats, "employer")

# Estimator by statistical test
---
Estimate the population mean of annual income for each group, treating the full set of survey responses as a sample. The confidence level is 95% (i.e. a significance level of 5%).

In [None]:
def stats_predicter(df: pd.DataFrame, col_name: str, z: float = 1.96):
    """Estimate an interval for the population mean compensation of each group.

    For every category of ``col_name`` the interval is
    ``sample_mean ± z * std / sqrt(n)``, rounded to three decimals and rendered
    as the string ``"<low> ~ <high>"``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``col_name`` and a ``compensation_numerical`` column.
    col_name : str
        Grouping column.
    z : float, default 1.96
        Multiplier applied to the standard error (1.96 matches a 95% normal interval).

    Returns
    -------
    pd.DataFrame
        Indexed by category (index named ``col_name``), ordered by descending
        group size, with columns ``sample_mean`` and ``population_mean``.
        Groups with a single row have an undefined std and yield ``"nan ~ nan"``.
    """
    counts = df[col_name].value_counts()
    # Aggregate only the compensation column: averaging the whole frame fails on
    # the string-valued columns in recent pandas versions.
    grouped = df.groupby(col_name)["compensation_numerical"]
    dfs = pd.DataFrame({
        "values": counts,
        "sample_mean": grouped.mean(),
        "std": grouped.std(),
    }).loc[counts.index]  # keep the value_counts ordering

    intervals = []
    for n, mu, std in zip(dfs["values"].to_list(), dfs["sample_mean"].to_list(), dfs["std"].to_list()):
        margin = z * (std / math.sqrt(n))
        intervals.append(str(round(mu - margin, 3)) + " ~ " + str(round(mu + margin, 3)))

    dfs["population_mean"] = intervals
    dfs.index.name = col_name
    return dfs[["sample_mean", "population_mean"]]

In [None]:
# Sample mean and interval estimate of compensation for each age bracket.
stats_predicter(df_stats, "age")

# Estimate by machine learning
---

+ Prepare Dataset 

In [None]:
# With the default header, the column labels are the question codes and the
# first data row carries the question text. Only that row is used, so read a
# single row instead of the whole response table.
col_all = pd.read_csv("../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv", nrows=1)
col_all = col_all.iloc[:, 1:]

# question number -> question text, split by whether the column label contains "_"
# (several columns per question) or not (one column per question).
num2many_question = {}
num2once_question = {}
for col in col_all.columns.to_list():
    parts = col.split("_")
    num = parts[0][1:]  # drop the leading letter of the code
    text = col_all[col].values[0]
    if len(parts) > 1:
        # Keep only the first column seen for each number; store the text before the first "-".
        if num not in num2many_question:
            num2many_question[num] = text.split("-")[0]
    elif num not in num2once_question:
        num2once_question[num] = text
        

In [None]:
# Single-column questions: question number -> full question text.
num2once_question

In [None]:
# Multi-column questions: question number -> question text before the first "-".
num2many_question

In [None]:
%%time 

def take_dummies(df):
    """One-hot encode every column whose label contains a multi-column question text.

    The question texts come from the values of ``num2many_question``; matching
    columns are collected question by question and handed to ``pd.get_dummies``.
    """
    labels = df.columns.to_list()
    category_cols = [
        label
        for question in num2many_question.values()
        for label in labels
        if question in label
    ]
    return pd.get_dummies(data=df, columns=category_cols)

salary_col = "What is your current yearly compensation (approximate $USD)?"

# Keep respondents who reported a salary and derive the numeric regression target.
df_ml = df_all[df_all[salary_col].notna()].reset_index(drop=True)
df_ml["target"] = df_ml[salary_col].apply(cln_salary).astype(int)

# Label-encode every single-column question (missing answers become "none").
# The fitted classes are stored, in iteration order, so codes can be mapped back later.
le_list = []
for col in num2once_question.values():
    encoder = LabelEncoder()
    df_ml[col] = encoder.fit_transform(df_ml[col].fillna("none"))
    le_list.append(encoder.classes_)
df_ml = df_ml.drop(columns=[salary_col])

# One-hot encode the multi-column questions.
df_ml = take_dummies(df_ml)

# Some columns still contained missing values, so they are dropped. Cause unknown.
df_ml = df_ml.dropna(axis=1).reset_index(drop=True)
gc.collect()
df_ml.head()

* Train phase 

In [None]:
def fit_predict(df: pd.DataFrame):
    """Cross-validate and fit a random-forest regressor on ``df``.

    Prints the 5-fold cross-validation scores (the estimator's default score)
    and their mean, then fits on a 75% split and prints the RMSE and MAE on the
    remaining 25%.

    Parameters
    ----------
    df : pd.DataFrame
        Numeric feature columns plus a ``target`` column.

    Returns
    -------
    RandomForestRegressor
        The model fitted on the training split.
    """
    features = df.drop("target", axis=1)
    # 1-D target: a column-vector y triggers sklearn's DataConversionWarning.
    target = df["target"].values.ravel()
    x_train, x_val, y_train, y_val = train_test_split(
        features, target, test_size=0.25, random_state=42
    )
    # metrics
    score = cross_val_score(RandomForestRegressor(random_state=42, n_jobs=-1), features, target, cv=5)
    print(score)
    print(f"CV SCORE: {np.mean(score)}")
    # train model
    model = RandomForestRegressor(random_state=42, n_jobs=-1).fit(x_train, y_train)
    pred = model.predict(x_val)
    # Take the root explicitly instead of relying on the `squared` keyword,
    # and label each metric with what it actually is.
    print(f"rmse: {math.sqrt(mean_squared_error(y_val, pred))}")
    print(f"mae: {mean_absolute_error(y_val, pred)}")
    gc.collect()
    return model

# Fit on the encoded survey frame and keep the returned model.
model = fit_predict(df_ml)

# Predict ML and Statistical test
---

In [None]:
def ml_predicter(model, df_ml, col_name, number, classes=None):
    """Average the model's predictions within each category of ``col_name``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    df_ml : pd.DataFrame
        Encoded feature frame including a ``target`` column (dropped before predicting).
    col_name : str
        Label-encoded column to group by.
    number : int
        Position in ``le_list`` of the encoder classes for ``col_name``.
    classes : sequence, optional
        Encoder classes to use instead of ``le_list[number]``.

    Returns
    -------
    pd.DataFrame
        One ``ml_predict`` column, indexed by the original category label.
    """
    if classes is None:
        classes = le_list[number]
    labels, preds = [], []

    for code in df_ml[col_name].unique():
        features = df_ml[df_ml[col_name] == code].drop("target", axis=1)
        preds.append(np.mean(model.predict(features)))
        # LabelEncoder assigns code i to classes_[i]; indexing with code-1 shifted every label.
        labels.append(classes[int(code)])

    return pd.DataFrame({"ml_predict": preds}, index=labels)

def main(df_stats, df_ml, number, model):
    """Put the interval estimate and the model's mean prediction side by side.

    ``number`` selects the attribute through the parallel lists
    ``use_stats_cols``, ``use_ml_cols`` and ``number_list``; the two tables are
    inner-joined on their category index.
    """
    interval_table = stats_predicter(df_stats, use_stats_cols[number])
    prediction_table = ml_predicter(model, df_ml, use_ml_cols[number], number_list[number])
    return interval_table.merge(
        prediction_table, how="inner", left_index=True, right_index=True
    )
        
# Question-text column in df_ml for each comparison; parallel to use_stats_cols.
use_ml_cols = [
    "What is your age (# years)?", 
    "What is your gender? - Selected Choice", 
    "In which country do you currently reside?",
    "What is the highest level of formal education that you have attained or plan to attain within the next 2 years?",
    "Select the title most similar to your current role (or most recent title if retired): - Selected Choice",
    "For how many years have you been writing code and/or programming?",
    "In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice"
]

# Matching short column name in df_stats, in the same order as use_ml_cols.
use_stats_cols = [
    "age", 
    "gender", 
    "country",
    "education",
    "role",
    "experience",
    "employer"
]

# Position in le_list of the encoder classes for each entry above; main passes
# number_list[number] to ml_predicter. le_list follows the iteration order of
# num2once_question, so these positions must be re-checked if that mapping changes.
number_list = [0, 1, 2, 3, 4, 5, 10]

### Age 

In [None]:
# Entry 0 of the lookup lists: age.
main(df_stats, df_ml, 0, model)

### Gender 

In [None]:
# Entry 1 of the lookup lists: gender.
main(df_stats, df_ml, 1, model)

### Country

In [None]:
# Entry 2 of the lookup lists: country.
main(df_stats, df_ml, 2, model)

### Education level 

In [None]:
# Entry 3 of the lookup lists: education.
main(df_stats, df_ml, 3, model)

### Profession

In [None]:
# Entry 4 of the lookup lists: role.
main(df_stats, df_ml, 4, model)

### experience 

In [None]:
# Entry 5 of the lookup lists: programming experience.
main(df_stats, df_ml, 5, model)

### Employer industry

In [None]:
# Entry 6 of the lookup lists: employer industry.
main(df_stats, df_ml, 6, model)