In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
import plotly.graph_objects as go
import gc 
import math 
from tqdm import tqdm 
from wordcloud import WordCloud 
import warnings 
import math 

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from lightgbm import LGBMRegressor

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
# Suppress every warning for the rest of the session.
warnings.simplefilter("ignore")

In [None]:
df = pd.read_csv("../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv")
# Drop the first data row and renumber from 0
# (presumably that row holds the question text of the survey export — TODO confirm).
df = df.iloc[1:, :].reset_index(drop=True)
df.head()

In [None]:
def get_language(df: pd.DataFrame):
    """Summarise each row's answers in the ``Q7_Part*`` columns.

    Every column whose name contains ``"Q7_Part"`` is treated as one
    language choice. Missing answers are skipped and the remaining answers
    are lower-cased and joined with commas.

    Unlike the previous implementation this does not modify ``df`` (the
    ``Q7_Part*`` columns are no longer filled with ``"none"``), reports
    ``use_lang_len == 0`` for rows without any answer, and works when ``df``
    has no ``Q7_Part*`` column at all.

    Parameters
    ----------
    df : pd.DataFrame
        Survey responses, one row per respondent.

    Returns
    -------
    pd.DataFrame
        One row per input row, with a fresh 0..n-1 index and the columns
        ``language`` (comma-joined, lower-cased answers, ``""`` when there
        are none), ``use_lang_len`` (number of answers) and ``is_python``
        (1 when ``"python"`` occurs in ``language``, else 0).
    """
    usecols = [col for col in df.columns.to_list() if str(col).find("Q7_Part") >= 0]

    languages = []
    use_lang_len = []
    is_python = []
    # to_numpy yields one (possibly empty) array per row, so rows are kept
    # even when there are no Q7 columns to read.
    for answers in df[usecols].to_numpy(dtype=object):
        # A literal "none" is skipped as well: it was the sentinel the
        # previous implementation used for missing answers.
        picked = [
            str(answer).lower()
            for answer in answers
            if not pd.isna(answer) and answer != "none"
        ]
        joined = ",".join(picked)
        languages.append(joined)
        # "".split(",") has length 1, so an empty answer must be special-cased.
        use_lang_len.append(len(joined.split(",")) if joined else 0)
        is_python.append(1 if "python" in joined else 0)

    return pd.DataFrame({"language": languages, "use_lang_len": use_lang_len, "is_python": is_python})
            

In [None]:
%%time 

# Build the per-row language summary and attach it to df by row position
# (both frames are joined on their index).
lang = get_language(df)
df = pd.merge(df, lang, how="left", left_index=True, right_index=True)
# Keep only rows whose joined language string is non-empty.
df = df[df["language"] != ""].reset_index(drop=True)

# The summary now lives in df; release the temporary frame.
del lang
gc.collect()

df[["language", "use_lang_len", "is_python"]].head()

In [None]:
# create numerical dataset

def trans_age(x):
    """Convert an age-bracket label to a number.

    ``"a-b"`` becomes the midpoint ``(a + b) / 2`` and a label containing
    ``"+"`` becomes ``70.0``. Anything else (including a missing value) is
    returned unchanged.

    The previous ``elif x.find("+"):`` test was truthy whenever ``"+"`` was
    absent (``find`` returns -1), so every non-range value became 70.0.
    """
    text = str(x)  # tolerate non-string input such as NaN
    if text.find("-") >= 0:
        parts = text.split("-")
        return (int(parts[0]) + int(parts[1])) / 2
    elif text.find("+") >= 0:
        return 70.0
    else:
        return x
    
def trans_employer(x):
    """Convert a company-size label to a representative head count.

    * ``"a-b ..."`` without a comma  -> midpoint ``(a + b) / 2``
    * ``"a-b ..."`` with a comma     -> ``(a + 9999) / 2``
    * no ``"-"`` but a comma         -> ``10000``
    * anything else (e.g. a missing value) -> the input, unchanged

    Fixes two defects: the midpoint was computed as ``a + b / 2`` (operator
    precedence), and ``elif x.find(","):`` was truthy when no comma was
    present, so missing answers were mapped to 10000.
    """
    text = str(x)
    if text.find("-") >= 0:
        low = int(text.split("-")[0])
        if text.find(",") >= 0:
            # The upper bound contains a thousands separator; 9999 is
            # hard-coded as that upper bound.
            return (low + 9999) / 2
        high = int(text.split("-")[1].split(" ")[0])
        return (low + high) / 2
    elif text.find(",") >= 0:
        return 10000
    else:
        # Return the original object (not its string form) so that NaN stays NaN.
        return x
    
def trans_experience(x):
    """Convert a years-of-experience label to a number.

    * ``"a-b ..."``      -> midpoint ``(a + b) / 2``
    * contains ``"+"``   -> ``20``
    * contains ``"<"``   -> ``0``
    * anything else      -> the input, unchanged

    The midpoint was previously computed as ``a + b / 2`` because of
    operator precedence (e.g. ``"5-10 years"`` gave 10.0 instead of 7.5).
    """
    text = str(x)
    if text.find("-") >= 0:
        low = int(text.split("-")[0])
        high = int(text.split("-")[1].split(" ")[0])
        return (low + high) / 2
    elif text.find("+") >= 0:
        return 20
    elif text.find("<") >= 0:
        return 0
    else:
        # Return the original object (not its string form) so that NaN stays NaN.
        return x
    
def trans_money(x):
    """Convert a compensation-bracket label to a number.

    ``"$"`` and ``","`` are stripped first. Then:

    * ``"a-b"``                      -> midpoint ``(a + b) / 2``
    * contains ``">"`` or ``"<"``    -> ``1000000`` (open-ended bracket)
    * anything else (e.g. missing)   -> ``0``

    Fixes two defects: the midpoint was computed as ``a + b / 2`` (operator
    precedence), and only ``"<"`` was recognised as the open-ended bracket,
    so a label written with ``">"`` fell through to 0. ``"<"`` is still
    accepted for backward compatibility.
    """
    text = str(x).replace("$", "").replace(",", "")
    if text.find("-") >= 0:
        parts = text.split("-")
        return (int(parts[0]) + int(parts[-1])) / 2
    elif text.find(">") >= 0 or text.find("<") >= 0:
        return 1000000
    else:
        return 0
    
def trans_student(x):
    """Return 1 when ``x`` equals ``"Student"``, otherwise 0."""
    return 1 if x == "Student" else 0
    
def trans_data_engineer(x):
    """Return 1 when ``str(x)`` contains "Data", "Machine" or "Scientist", else 0."""
    title = str(x)
    keywords = ("Data", "Machine", "Scientist")
    return int(any(keyword in title for keyword in keywords))
    
def trans_sex(x):
    """Encode ``str(x)`` as 0 for "Man", 1 for "Woman" and 2 for anything else."""
    codes = {"Man": 0, "Woman": 1}
    return codes.get(str(x), 2)
    
def get_numerical(df: pd.DataFrame):
    """Build the numeric modelling table from the survey frame.

    Reads the first column of ``df`` (cast to int) plus ``Q1``, ``Q2``,
    ``Q5``, ``Q6``, ``Q21``, ``Q25`` and the ``language`` /
    ``use_lang_len`` / ``is_python`` columns, and converts them with the
    ``trans_*`` helpers.

    Returns
    -------
    pd.DataFrame
        Columns ``language``, ``q_time``, ``age``, ``employment``,
        ``is_python``, ``is_student``, ``is_data_analysis``, ``sex``,
        ``experience``, ``use_lang_number`` and ``money``. Rows whose
        ``money`` is 0 are replaced with the column mean.
    """
    train = pd.DataFrame({
        "language": df["language"],
        "q_time": df.iloc[:, 0].astype(int),
        "age": df["Q1"].apply(trans_age),
        "employment": df["Q21"].apply(trans_employer),
        "is_python": df["is_python"],
        "is_student": df["Q5"].apply(trans_student),
        "is_data_analysis": df["Q5"].apply(trans_data_engineer),
        "sex": df["Q2"].apply(trans_sex),
        "experience": df["Q6"].apply(trans_experience),
        "use_lang_number": df["use_lang_len"],
        "money": df["Q25"].apply(trans_money),
    })
    # Compute the mean once instead of once per zero row (it was evaluated
    # inside the apply lambda). The value is unchanged: it is the mean of the
    # column *including* the 0 placeholders.
    # NOTE(review): averaging only the non-zero values may be what was
    # intended — confirm before changing.
    money_mean = train["money"].mean()
    train["money"] = train["money"].mask(train["money"] == 0, money_mean)
    return train
    

In [None]:
train = get_numerical(df)

plt.figure(figsize=(20, 20))
# Correlate the numeric columns only: `train` also carries the string column
# "language", and pandas >= 2.0 raises on non-numeric columns instead of
# silently dropping them.
sns.heatmap(train.select_dtypes(include="number").corr().round(1), cmap="Blues", annot=True, fmt="")
plt.show()

+ The number of languages used is positively, if only slightly, correlated with years of experience. Being able to use Python also has an effect.  
+ However, the correlation with income is small: using many languages does not by itself appear to increase income.

In [None]:
def viz_group(df: pd.DataFrame, col_name: str, title: str, xaxis_title: str, n: int=10, sort: bool=True):
    """Bar chart of the mean ``use_lang_len`` per value of ``col_name``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``col_name`` and ``use_lang_len``.
    col_name : str
        Column to group by.
    title : str
        Figure title.
    xaxis_title : str
        X-axis title; also used as the trace name.
    n : int
        Number of groups to show.
    sort : bool
        When True, show the ``n`` groups with the highest mean; otherwise
        the first ``n`` groups in group-key order.
    """
    # Aggregate only the column we plot: averaging every column fails on the
    # non-numeric survey columns with pandas >= 2.0.
    means = df.groupby(col_name)[["use_lang_len"]].mean()
    if sort:
        means = means.sort_values("use_lang_len", ascending=False)
    means = means[:n]

    fig = go.Figure(data=[
        go.Bar(name=xaxis_title, y=means.values.ravel()),
    ])

    # Bars are placed at positions 0..k-1; label each position with its group key.
    fig.update_layout(
        barmode='group',
        title=title,
        xaxis_title=xaxis_title,
        yaxis_title="use language count",
        xaxis = dict(
            tickmode = 'array',
            tickvals = list(range(means.shape[0])),
            ticktext = means.index
        )
    )
    fig.show()
    
def viz_pie(df: pd.DataFrame, col_name: str, title: str, n=10):
    """Pie chart of the ``n`` most frequent values of ``df[col_name]``.

    ``value_counts`` already returns counts in descending order, so no extra
    sort is needed. The counts are read from the Series directly instead of
    via ``to_frame()[col_name]``, because the name of that frame column
    differs between pandas versions.
    """
    counts = df[col_name].value_counts().iloc[:n]
    fig = px.pie(values=counts.values, names=counts.index, title=title)
    fig.update_traces(textposition='inside', textinfo='percent+label')
    fig.update_layout(uniformtext_minsize=10, uniformtext_mode='hide')
    fig.show()
    
def viz_location(df: pd.DataFrame, title: str):
    """Choropleth of the mean ``use_lang_len`` per value of ``Q3``.

    The ``Q3`` values are passed to plotly as country names
    (``locationmode='country names'``). The colour scale runs from 0 to the
    highest group mean.
    """
    # Aggregate only the plotted column: averaging every column fails on the
    # non-numeric survey columns with pandas >= 2.0.
    x = df.groupby("Q3")[["use_lang_len"]].mean()
    x["country"] = x.index
    x = x.reset_index(drop=True)

    fig = px.choropleth(x,
                    locations = 'country',
                    color = "use_lang_len",
                    locationmode = 'country names',
                    color_continuous_scale = 'viridis',
                    title =  title,
                    range_color = [0, x["use_lang_len"].max()])
    # Centre the title.
    fig.update(layout=dict(title=dict(x=0.5)))
    fig.show()
    
def viz_word(df: pd.DataFrame, col_name: str, title: str):
    """Draw one word cloud of the ``language`` column per value of ``col_name``.

    Each ``language`` entry is split on commas and the resulting tokens are
    counted; the counts are the word-cloud frequencies. Missing values of
    ``col_name`` are skipped, and nothing is drawn when there is no group.
    """
    # A NaN key never matches `df[col_name] == word`, which would hand an
    # empty frequency table to WordCloud, so drop it up front.
    unique_word = df[col_name].dropna().unique()
    if len(unique_word) == 0:
        return

    # squeeze=False keeps `axes` a 2-D array even for a single group.
    fig, axes = plt.subplots(len(unique_word), 1, figsize=(30, 60), squeeze=False)
    ax = axes.ravel()

    def create_word(ser: pd.Series):
        """Count every comma-separated token in ``ser``."""
        word_list = {}
        for se in ser.to_list():
            for lang in se.split(","):
                word_list[lang] = word_list.get(lang, 0) + 1
        return word_list

    for i, word in enumerate(unique_word):
        x = df.loc[df[col_name] == word, "language"]
        word_cloud = WordCloud(background_color="white", width=2200, height=1440).generate_from_frequencies(create_word(x))
        ax[i].imshow(word_cloud)
        ax[i].set_title(word)
        ax[i].set_xticks([])
        ax[i].set_yticks([])

    plt.tight_layout()
    plt.suptitle(title)
    gc.collect()

# Number of language RATE

In [None]:
# Share of rows for each value of use_lang_len (10 most frequent values).
viz_pie(df, "use_lang_len", "number of language")

# Number of language x Country

In [None]:
# Map of the mean use_lang_len per Q3 value.
viz_location(df, "use language value counts by country")

# Number of language x Language

## Overall

In [None]:
# The 10 most frequent comma-joined language combinations.
viz_pie(df, "language", "popular combination language")

## By number of languages

In [None]:
# One pie chart per distinct use_lang_len value, in ascending order,
# each showing the most frequent language combinations for that count.
nums = sorted(df["use_lang_len"].unique().tolist())
for num in nums:
    viz_pie(df[df["use_lang_len"] == num], "language", f"language usage {str(num)}")

## not Python

In [None]:
# Most frequent language combinations among rows where is_python is not 1.
viz_pie(df[df["is_python"] != 1], "language", "number of language by NOT Python")

# Number of language x Age 

In [None]:
# sort=False keeps the Q1 groups in group-key order instead of ranking them by mean.
viz_group(df, "Q1", "use language count by Age", "age", n=11, sort=False)

# Number of language x Education

In [None]:
# Mean use_lang_len per Q4 value, highest 10 first.
viz_group(df, "Q4", "use language count by Education", "education")

# Number of language x job

In [None]:
# Mean use_lang_len per Q5 value, highest 10 first.
viz_group(df, "Q5", "use language count by employment", "employer")

In [None]:
# Group by Q5, the column this section plots as the job, rather than Q4,
# which the notebook uses as the education column: the section and the title
# are about jobs.
viz_word(df, "Q5", "language used by each business operator.")

# Number of language x Gender

In [None]:
# Mean use_lang_len per Q2 value, highest 10 first.
viz_group(df, "Q2", "use language count by Gender", "gender")

+ Python is a popular language and always ranks high in combination.
+ Software developers use about 0.5 more languages on average than data analysts.
+ Python seems to be the top in every job. However, keep in mind that the survey target is biased around the data analyst.  
+ It seems to be used relatively frequently in African countries.

# Predict income
Build a model that predicts income, then use the trained model's feature importances to see how much the number of languages used affects income.

In [None]:
# Preview the modelling table built by get_numerical.
train.head()

In [None]:
def fit(df: pd.DataFrame):
    """Train an ``LGBMRegressor`` that predicts ``money`` and print its validation MAE.

    ``language`` and ``money`` are dropped from the features. The data are
    split 75/25, stratified on ``use_lang_number``, with a fixed seed.

    Parameters
    ----------
    df : pd.DataFrame
        Table with ``language``, ``money``, ``use_lang_number`` and the
        remaining feature columns.

    Returns
    -------
    LGBMRegressor
        The fitted model.
    """
    import lightgbm  # local import: only needed to pick the logging API below

    # A 1-D target avoids the column-vector conversion of a one-column frame.
    x, y = df.drop(["language", "money"], axis=1), df["money"]
    x_train, x_val, y_train, y_val = train_test_split(
        x, y, test_size=0.25, stratify=df["use_lang_number"], random_state=42
    )

    model = LGBMRegressor(random_state=42, n_estimators=2000)
    fit_kwargs = {"eval_set": [(x_train, y_train), (x_val, y_val)]}
    # The `verbose` argument of fit() is gone in recent LightGBM releases;
    # the log_evaluation callback is its replacement. Fall back to `verbose`
    # on releases that predate the callback.
    if hasattr(lightgbm, "log_evaluation"):
        fit_kwargs["callbacks"] = [lightgbm.log_evaluation(period=100)]
    else:
        fit_kwargs["verbose"] = 100
    model.fit(x_train, y_train, **fit_kwargs)

    pred = model.predict(x_val).flatten()
    # Mean absolute error on the validation split.
    print(f"mae: {np.mean(np.abs(pred - y_val.values.ravel()))}")
    return model

def predict(model: object, df: pd.DataFrame):
    """Mean predicted income per ``use_lang_number``.

    The model is applied once to all rows (after dropping ``language`` and
    ``money``) and the predictions are averaged per ``use_lang_number``.
    Previously the model was called twice for every group, and a standard
    deviation and a group size were computed but never used.

    Parameters
    ----------
    model : object
        Any object with a ``predict(features)`` method returning one value
        per row.
    df : pd.DataFrame
        Table with ``language``, ``money``, ``use_lang_number`` and the
        feature columns the model expects.

    Returns
    -------
    pd.DataFrame
        Indexed by ``use_lang_number`` (ascending) with a single column
        ``predict_income``.
    """
    features = df.drop(["language", "money"], axis=1)
    if features.empty:
        # Nothing to predict; return the same empty shape as before.
        return pd.DataFrame({"predict_income": []}, index=pd.Index([], name="use_lang_number"))

    predictions = np.asarray(model.predict(features)).flatten()
    per_row = pd.DataFrame({
        "use_lang_number": features["use_lang_number"].to_numpy(),
        "predict_income": predictions,
    })
    # groupby sorts the keys ascending, matching the previous sort_values.
    return per_row.groupby("use_lang_number")[["predict_income"]].mean()

In [None]:
# Train the income model on the numeric table; fit() prints the validation MAE.
model = fit(train)

## compare predict and labels

In [None]:
df_predict = predict(model, train)

# Average only the income column: `train` also holds the string column
# "language", which pandas >= 2.0 refuses to average.
income = train.groupby("use_lang_number")[["money"]].mean()
mu = train["money"].mean()

# Both frames are ordered by use_lang_number, so bars at the same position
# belong to the same language count.
fig = go.Figure(data=[
    go.Bar(name='income', y=income.values.ravel()),
    go.Bar(name='predict_income', y=df_predict.values.ravel()),

])

# Change the bar mode
fig.update_layout(
    # Horizontal line at the overall mean income, spanning all bar groups.
    shapes=[
    dict(
      type= 'line',
      yref= 'y', y0=mu, y1=mu,
      xref= 'x', x0= -0.5, x1= income.shape[0]-0.5
    )],
    barmode='group',
    title='number of language by income',
    xaxis_title='number of language',
    yaxis_title='income',
    xaxis = dict(
        tickmode = 'array',
        tickvals = list(range(income.shape[0])),
        ticktext = income.index
    )
)

fig.add_annotation(x=income.shape[0]*0.95, y=mu, xshift=-20, yshift=10,
            text="Global Average",
            showarrow=False)

fig.show()

## feature importance 

In [None]:
# Normalise the model's feature importances so they sum to 1, indexed by the
# feature names (the columns of train without "language" and "money").
importance = pd.DataFrame({"feature_importance": model.feature_importances_ / np.sum(model.feature_importances_)}, index=train.drop(["language", "money"], axis=1).columns)

# Pie chart of each feature's share of the total importance.
fig = px.pie(values=importance["feature_importance"], names=importance.index, title="Impact on income.")
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(uniformtext_minsize=10, uniformtext_mode='hide')
fig.show()

+ Income may increase with the number of languages used.
+ The rise is especially visible when the number exceeds 8. However, be aware of data bias.