In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
import plotly.graph_objects as go
import gc 
import math 
from tqdm import tqdm 
from wordcloud import WordCloud 
import scipy as sp
import nltk 
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE 
from sklearn.cluster import KMeans 

# Remove the cap on how many columns pandas shows when a DataFrame is rendered.
pd.set_option("display.max_columns", None)

In [None]:
%%time 

SURVEY_CSV = "../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv"

# header=1: the column names come from the second line of the file; the line above it is discarded.
df = pd.read_csv(SURVEY_CSV, header=1)
df.head()

Since we want to aggregate by language, we reshape the data frame so that it is language-centric.  
Specifically, each respondent contributes one row per language they selected (missing selections are dropped), so the same person can appear more than once.

In [None]:
# Reshape the survey into a language-centric frame: one row per (respondent, language) pair.
LANGUAGE_QUESTION = "What programming languages do you use on a regular basis?"

# Map every multi-select column of the question to its choice label.
# Splitting on the " - " separator (rather than on every "-") keeps a label that
# itself contains a hyphen in one piece.
language_columns = {
    col: col.rsplit(" - ", 1)[-1].strip()
    for col in df.columns
    if col.startswith(LANGUAGE_QUESTION)
}
if not language_columns:
    raise ValueError(f"No column starts with {LANGUAGE_QUESTION!r}; cannot build df_program.")

# `language` is read again by later cells that loop over every language label.
language = list(language_columns.values())

# For each language keep the respondents who ticked it, and expose the ticked
# column under the common name "language". The real column name is reused instead
# of being rebuilt from the label, and the frames are concatenated once at the end
# rather than re-copying the growing result on every iteration.
frames = [
    df[df[col].notna()].rename(columns={col: "language"})
    for col in tqdm(language_columns, desc="languages")
]
df_program = pd.concat(frames, ignore_index=True)

print(f"before: {df.shape[0]} | after: {df_program.shape[0]}")

# The per-language frames are no longer needed once concatenated.
del frames
gc.collect()

In [None]:
# Pie chart of how often each language occurs in the language-centric frame.
fig = px.pie(
    df_program,
    names=df_program["language"],
    title="Programming value counts by All.",
)
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.update_layout(uniformtext_mode='hide', uniformtext_minsize=10)
fig.show()

Estimate a confidence interval for the population proportion of each language from its share in the sample.  
A 95% confidence level is used.

In [None]:
def calc_population_mean(df, z=1.96):
    """Return a normal-approximation confidence interval for each language's share.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a "language" column; every non-null row counts as one observation.
    z : float, default 1.96
        Critical value of the standard normal distribution (1.96 gives a 95% interval).

    Returns
    -------
    pd.DataFrame
        Indexed by language (most frequent first) with a single "population_mean"
        column holding the interval as text, e.g. "40.20%~59.80%".
    """
    counts = df["language"].value_counts()
    total = int(counts.sum())

    labels, intervals = [], []
    for label, count in counts.items():
        p = count / total
        # The standard error of a proportion divides by the TOTAL sample size,
        # not by the number of rows that belong to this one language.
        margin = z * math.sqrt(p * (1 - p) / total)
        # A proportion cannot leave [0, 1], so the interval is clipped to that range.
        lower = max(0.0, round(p - margin, 4))
        upper = min(1.0, round(p + margin, 4))
        labels.append(label)
        # Fixed two-decimal formatting avoids float noise such as 12.340000000000002%.
        intervals.append(f"{lower * 100:.2f}%~{upper * 100:.2f}%")
    return pd.DataFrame({"population_mean": intervals}, index=labels)


# Share of every language in the sample, rendered as a percentage string.
# The Series is named explicitly before `to_frame()` so the column no longer depends
# on the name `value_counts()` gives its result, which differs between pandas versions.
sample_mean = (
    df_program["language"]
    .value_counts(normalize=True)
    .sort_values(ascending=False)
    .map(lambda share: f"{round(share * 100.0, 2)}%")
    .rename("sample_mean")
    .to_frame()
)

# Confidence interval for the same shares (see calc_population_mean).
population_mean = calc_population_mean(df_program)

display(sample_mean)
display(population_mean)

## Language x Age x Sex 

In [None]:
def _age_bracket_midpoint(bracket):
    """Return the midpoint of an age bracket such as "25-29"; "70+" maps to 70."""
    if bracket == "70+":
        return 70.0
    low, high = bracket.split("-")
    # Parenthesised on purpose: `low + high / 2` would add half of the upper
    # bound to the lower bound instead of averaging the two.
    return (int(low) + int(high)) / 2


GENDER_COL = "What is your gender? - Selected Choice"

df_program["age"] = df_program["What is your age (# years)?"].apply(_age_bracket_midpoint)

# Mean age per language for each sex. Selecting "age" before aggregating keeps
# the mean away from the non-numeric survey columns.
ims = []
for sex in ["Man", "Woman"]:
    age = df_program[df_program[GENDER_COL] == sex].groupby("language")["age"].mean()
    ims.append(age)

# Put both series on one shared language order so that bar i refers to the same
# language in both traces, even if a language is missing for one sex.
language_order = ims[0].index.union(ims[1].index)
ims = [mean_age.reindex(language_order) for mean_age in ims]
global_mean_age = df_program["age"].mean()

fig = go.Figure(data=[
    go.Bar(name="Man", y=ims[0].values),
    go.Bar(name='Woman', y=ims[1].values)
])

fig.update_layout(
    # Horizontal reference line at the overall mean age, spanning every bar group.
    shapes=[
    dict(
      type= 'line',
      yref= 'y', y0= global_mean_age, y1=global_mean_age,
      xref= 'x', x0= -0.5, x1= len(language_order)-0.5
    )],
    barmode='group',
    title=f'Language x Sex x Age',
    xaxis_title='Language',
    yaxis_title='Age',
    # The bars sit at integer positions; label them with the language names.
    xaxis = dict(
        tickmode = 'array',
        tickvals = list(range(len(language_order))),
        ticktext = language_order.tolist()
    )
)

fig.add_annotation(x=len(language_order)*0.95, y=global_mean_age, xshift=-20, yshift=10,
            text="Global Average",
            showarrow=False)

fig.show()

# Flag every row as "Python" or "other" so the two groups can be compared.
df_program["is_python"] = np.where(df_program["language"] == "Python", "Python", "other")

# Mean age of each group per sex. "age" is selected before aggregating so the mean
# is not attempted on the non-numeric survey columns, and both series are put on
# the same fixed group order so the two traces line up bar for bar.
python_groups = ["Python", "other"]
ims = []
for sex in ["Man", "Woman"]:
    of_sex = df_program[df_program["What is your gender? - Selected Choice"] == sex]
    ims.append(of_sex.groupby("is_python")["age"].mean().reindex(python_groups))

fig = go.Figure(data=[
    go.Bar(name="Man", y=ims[0].values),
    go.Bar(name='Woman', y=ims[1].values)
])

fig.update_layout(
    barmode='group',
    title=f'Is Python x Sex x Age',
    xaxis_title='Language',
    yaxis_title='Age',
    # The bars sit at integer positions; label them with the group names.
    xaxis = dict(
        tickmode = 'array',
        tickvals = list(range(len(python_groups))),
        ticktext = python_groups
    )
)


fig.show()


### x Country

Visualize the usage rate in each country. However, please note that this is just a sample mean and does not reflect the entire population.

In [None]:
def show_location_use_language(df: pd.DataFrame, language: str, title: str):
    """Draw a world choropleth of one language's usage share per country.

    Parameters
    ----------
    df : pd.DataFrame
        Usage table with one row per language and one column per country.
    language : str
        Row label of `df` to plot.
    title : str
        Figure title.
    """
    # Read the requested row from the table that was passed in (not from a
    # global) and turn it into a tidy two-column frame: country, share.
    usage = (
        df.loc[language]
        .rename(language)
        .rename_axis("country")
        .reset_index()
    )

    fig = px.choropleth(usage,
                    locations = 'country',
                    color = language,
                    locationmode = 'country names',
                    color_continuous_scale = 'viridis',
                    title = title,
                    range_color = [0, usage[language].max()])
    # Centre the title horizontally.
    fig.update(layout=dict(title=dict(x=0.5)))
    fig.show()

# Count (language, country) pairs, then turn each country's column into shares
# that sum to 1 so countries of different sizes are comparable.
country_counts = pd.crosstab(df_program["language"], df_program["In which country do you currently reside?"])
country = country_counts.div(country_counts.sum(axis=0), axis="columns")

# One map per language.
for lang in language:
    show_location_use_language(country, lang, lang)

In Africa the number of respondents itself is small, yet the usage share is high because it concentrates on a single language.  
A small number of programmers probably dominates the result, possibly because the infrastructure is not well developed.

### Clustering 
Clustering is performed from the language usage table between countries.  
This divides the use of programming languages between countries more significantly.

In [None]:
# Feature matrix: one row per language, one column per country. A later cell adds a
# "cluster" column to `country`; dropping it here keeps a re-run of this cell from
# feeding the previous labels back in as a feature.
features = country.drop(columns="cluster", errors="ignore")

# t-SNE needs perplexity < number of samples. There are far fewer rows (languages)
# than the default perplexity of 30, which current scikit-learn rejects, so cap it.
perplexity = max(1.0, min(30.0, features.shape[0] - 1))
decompose = TSNE(n_components=3, perplexity=perplexity, random_state=42)
tsne = decompose.fit_transform(features)

# Group the embedded languages into three clusters.
km = KMeans(n_clusters=3, random_state=42)
y_km = km.fit_predict(tsne)

# Only the first two of the three embedding dimensions are plotted.
fig = px.scatter(x=tsne[:, 0],
                 y=tsne[:, 1],
                 color=y_km,
                title="decompose Language by TSNE")
fig.show()

In [None]:
def show_cluster(country: pd.DataFrame, cluster=0):
    """Profile the respondents who use the languages of one cluster.

    Shows, for the rows of the global `df_program` whose language belongs to the
    cluster: a pie of age brackets, a pie of gender, a bar chart of role counts
    and a word cloud of the languages.

    Parameters
    ----------
    country : pd.DataFrame
        Table indexed by language with a "cluster" column.
    cluster : int, default 0
        Cluster label to profile.
    """
    role_col = "Select the title most similar to your current role (or most recent title if retired): - Selected Choice"

    cluster_languages = country[country["cluster"] == cluster].index.to_list()
    members = df_program[df_program["language"].isin(cluster_languages)]

    # Age and gender breakdowns share the same donut layout.
    for column, label in (
        ("What is your age (# years)?", "Age"),
        ("What is your gender? - Selected Choice", "Sex"),
    ):
        fig = px.pie(members, names=members[column], title=f"{label} value counts by cluster {cluster}", hole=0.6)
        fig.update_traces(textinfo='percent+label')
        fig.update_layout(uniformtext_minsize=10, uniformtext_mode='hide')
        fig.show()

    # Role counts, labelled directly through the x values.
    role = members[role_col].value_counts()
    fig = go.Figure(data=[
        go.Bar(name="Role", x=role.index, y=role.to_numpy()),
    ])
    fig.update_layout(
        barmode='group',
        title=f'Role value count by cluster {cluster}',
        xaxis_title='Role',
        yaxis_title='Count',
    )
    fig.show()

    # A word cloud cannot be built from empty text, so skip it for an empty cluster.
    if members.empty:
        return
    word = WordCloud(background_color="white").generate(" ".join(members["language"]))
    plt.figure(figsize=(22, 12))
    plt.imshow(word)
    plt.axis("off")
    plt.show()
    

### data science language 

In [None]:
# Attach each language's k-means label to the usage table, then profile cluster 0.
country = country.assign(cluster=y_km)
show_cluster(country, cluster=0)

### web application language 


In [None]:
# Profile the respondents of cluster 1.
show_cluster(country, cluster=1)

### java 
Java is the only outlier and sits alone in its cluster.

In [None]:
# Profile the respondents of cluster 2.
show_cluster(country, cluster=2)

I was able to successfully separate the purpose of the programming language from the separation information between countries. Perhaps the use of languages is separate by country.

In [None]:
# Languages that fell into k-means clusters 0, 1 and 2, in that order.
date_science, web_application, java = (
    country.index[country["cluster"] == label].unique().to_list()
    for label in (0, 1, 2)
)

### x Role

In [None]:
# TODO — English translation of the note kept in the string below:
# "From here on, change the code so that, instead of splitting by whether the
# language is Python, it compares the language groups obtained from the cluster
# analysis above."
'''
ここから先をpythonかどうかでぶんかつするのではなく上のクラスタ分析で得た言語を比較していくコードに変更すること。

'''

In [None]:
# Share of each job title across all (respondent, language) rows.
role = df_program["Select the title most similar to your current role (or most recent title if retired): - Selected Choice"].value_counts(normalize=True)
fig = go.Figure(data=[go.Bar(x=role.index, y=role.values)])

fig.update_layout(
    title="role x Language by All",
    xaxis_title=None,
    # The bars are normalised counts, not ages.
    yaxis_title='Percentage of respondents')
fig.show()


ROLE_COL = "Select the title most similar to your current role (or most recent title if retired): - Selected Choice"

# Role distribution inside each language group.
role_data = df_program.loc[df_program["language"].isin(date_science), ROLE_COL].value_counts(normalize=True)
role_web = df_program.loc[df_program["language"].isin(web_application), ROLE_COL].value_counts(normalize=True)
role_java = df_program.loc[df_program["language"].isin(java), ROLE_COL].value_counts(normalize=True)

# Every value_counts() result is sorted by its OWN frequencies, so the three series
# list the roles in different orders. Re-index them on one shared order (roles a
# group never mentions get 0) so each bar ends up under the right label.
role_order = role_data.index.union(role_web.index, sort=False).union(role_java.index, sort=False)
role_data, role_web, role_java = (
    shares.reindex(role_order, fill_value=0.0)
    for shares in (role_data, role_web, role_java)
)

fig = go.Figure(data=[
    go.Bar(name="date_science_language", x=role_order, y=role_data.values),
    go.Bar(name='web_application_language', x=role_order, y=role_web.values),
    go.Bar(name='java_language', x=role_order, y=role_java.values),

])

fig.update_layout(
    barmode='group',
    title=f"Role by language used",
    xaxis_title='role',
    yaxis_title='Percentage of respondents',
)

fig.show()

### Search for similar languages from users

In [None]:
# Count (language, role) pairs and scale every role column to sum to 1.
role_counts = pd.crosstab(
    df_program["language"],
    df_program["Select the title most similar to your current role (or most recent title if retired): - Selected Choice"],
)
role_share = role_counts.div(role_counts.sum(axis=0), axis="columns")

# Pairwise cosine similarity between the languages' role profiles (rows).
# `role` ends up as a language x language table and is read by search_for_similar_lang.
similarity = cosine_similarity(sp.sparse.csr_matrix(role_share.values))
role = pd.DataFrame(similarity, index=role_share.index, columns=role_share.index)
role.head()

In [None]:
def search_for_similar_lang(lang, top_n=5, similarity=None):
    """Return the languages most similar to `lang`, best match first.

    Parameters
    ----------
    lang : str
        Language to look up; must be a column of the similarity table.
    top_n : int, default 5
        Number of neighbours to return.
    similarity : pd.DataFrame, optional
        Square language x language similarity table. Defaults to the global `role`.

    Returns
    -------
    pd.DataFrame
        Indexed by language with a single "similar" column holding the score.
    """
    if similarity is None:
        similarity = role
    # Remove the language itself by label. Skipping "the first row after sorting"
    # is wrong whenever another language ties with it at the top score.
    scores = similarity[lang].drop(labels=lang, errors="ignore")
    return scores.sort_values(ascending=False).head(top_n).to_frame(name="similar")

# Fetch the NLTK stop-word corpus, then keep the English list; create_word_count
# reads `stop_words` to decide which tokens to skip.
nltk.download('stopwords')
stop_words = stopwords.words('english')

def create_word_count(doc_list):
    """Count whitespace-separated tokens across `doc_list`, skipping stop words.

    Tokens are compared as-is against the global `stop_words`. Returns a dict
    mapping each kept token to the number of times it occurs.
    """
    word2count = {}
    for document in doc_list:
        kept_tokens = (token for token in document.split() if token not in stop_words)
        for token in kept_tokens:
            word2count[token] = word2count.get(token, 0) + 1
    return word2count

def show_search_for_similar_lang(lang: str):
    """Report the languages most similar to `lang` and compare their users' roles.

    Prints a banner, displays the similarity table, draws a word cloud from one
    work-activity column of the similar languages' respondents, and plots the role
    distribution of `lang` next to that of the similar languages.

    Reads the global `df_program`.
    """
    role_col = "Select the title most similar to your current role (or most recent title if retired): - Selected Choice"
    activity_col = "Select any activities that make up an important part of your role at work: (Select all that apply) - Selected Choice - Analyze and understand data to influence product or business decisions"

    print("="*100, lang, "="*100)
    similar = search_for_similar_lang(lang)
    display(similar)

    similar_rows = df_program[df_program["language"].isin(similar.index.to_list())]

    # NOTE(review): only this single activity column feeds the cloud — confirm that
    # is the intended source for "words that frequently appear from roles".
    frequencies = create_word_count(similar_rows[activity_col].dropna())
    # An empty frequency table cannot be drawn, so the cloud is skipped in that case.
    if frequencies:
        word = WordCloud(background_color="white", width=2000, height=1450, max_words=15).generate_from_frequencies(frequencies)
        plt.figure(figsize=(22, 12))
        plt.imshow(word)
        plt.title("Words that frequently appear from roles")
        plt.axis("off")
        plt.show()

    role_current = df_program.loc[df_program["language"] == lang, role_col].value_counts(normalize=True)
    role_similar = similar_rows[role_col].value_counts(normalize=True)

    # Both series are sorted by their own frequencies; put them on one shared role
    # order (missing roles count as 0) so each bar sits under the right label.
    role_order = role_current.index.union(role_similar.index, sort=False)
    role_current = role_current.reindex(role_order, fill_value=0.0)
    role_similar = role_similar.reindex(role_order, fill_value=0.0)

    fig = go.Figure(data=[
        go.Bar(name=lang, x=role_order, y=role_current.values),
        go.Bar(name='Others Similar', x=role_order, y=role_similar.values)
    ])

    fig.update_layout(
        barmode='group',
        title=f"{lang} x similar language in role",
        xaxis_title='role',
        yaxis_title='Percentage of respondents',
    )

    fig.show()

In [None]:
# Produce the similarity report for every language label collected from the survey columns.
for lang_name in language:
    show_search_for_similar_lang(lang_name)

## x Years of coding experience

In [None]:
def calc_many_code(x):
    """Convert a coding-experience answer into a number of years.

    "< 1 years" maps to 1, "20+ years" maps to 20, and a range such as
    "3-5 years" maps to its midpoint (4.0). A missing value, or any answer that is
    not one of these shapes, maps to NaN so the resulting column stays numeric
    (the raw text used to be passed through, which breaks averaging).
    """
    if not isinstance(x, str):
        return float("nan")
    if x == "< 1 years":
        return 1
    if x == "20+ years":
        return 20
    if "years" in x and "-" in x:
        low, high = x.split("-", 1)
        # `high` still carries the unit, e.g. "5 years"; keep only the number.
        return (int(low) + int(high.split()[0])) / 2
    return float("nan")

EXPERIENCE_COL = "For how many years have you been writing code and/or programming?"

# Numeric years of experience. Coercing guarantees a numeric column even if an
# answer could not be converted.
df_program["write_code"] = pd.to_numeric(
    df_program[EXPERIENCE_COL].apply(calc_many_code), errors="coerce"
)


# Distribution of the raw answers over all respondents (one row per person in `df`).
fig = px.pie(df, names=df[EXPERIENCE_COL], title='how many years writing code?', hole=0.6)
fig.update_traces(textinfo='percent+label')
fig.update_layout(uniformtext_minsize=10, uniformtext_mode='hide')
fig.show()

# Select the numeric column before averaging so the mean is not attempted on the
# non-numeric survey columns.
write_code_lang = df_program.groupby("language")[["write_code"]].mean()
write_code_age = df_program.groupby("What is your age (# years)?")[["write_code"]].mean()

fig = go.Figure(data=[go.Bar(x=write_code_lang.index, y=write_code_lang.write_code)])
fig.update_layout(
    title="Years of experience for each language",
    xaxis_title=None,
    yaxis_title='how many year write code')
fig.show()

fig = go.Figure(data=[go.Bar(x=write_code_age.index, y=write_code_age.write_code)])
fig.update_layout(
    title="Years of experience for each age",
    xaxis_title=None,
    yaxis_title='how many year write code')
fig.show()

EXPERIENCE_COL = "For how many years have you been writing code and/or programming?"

# Distribution of experience answers inside each language group.
data_ = df_program.loc[df_program["language"].isin(date_science), EXPERIENCE_COL].value_counts(normalize=True)
web_ = df_program.loc[df_program["language"].isin(web_application), EXPERIENCE_COL].value_counts(normalize=True)
java_ = df_program.loc[df_program["language"].isin(java), EXPERIENCE_COL].value_counts(normalize=True)

# Every value_counts() result is sorted by its OWN frequencies. Re-index the three
# series on one shared answer order (absent answers count as 0) so each bar is
# drawn under the label it belongs to.
experience_order = data_.index.union(web_.index, sort=False).union(java_.index, sort=False)
data_, web_, java_ = (
    shares.reindex(experience_order, fill_value=0.0)
    for shares in (data_, web_, java_)
)

fig = go.Figure(data=[
    go.Bar(name="data_science_language", x=experience_order, y=data_.values),
    go.Bar(name='web_application_language', x=experience_order, y=web_.values) ,
    go.Bar(name='java', x=experience_order, y=java_.values) ,

])

fig.update_layout(
    barmode='group',
    title=f"Experience by language used",
    xaxis_title='experience',
    yaxis_title='Percentage of respondents',
)

fig.show()

Because AI has only recently attracted attention, the users of data-analysis languages are concentrated in the younger generation.

In [None]:
def calc_class_categorical(col_name: str) -> pd.DataFrame:
    """Collect the answers to one multi-select question next to each row's language.

    Parameters
    ----------
    col_name : str
        Text of the question; every column of the global `df` whose name contains
        it is treated as one choice of that question.

    Returns
    -------
    pd.DataFrame
        Rows of the global `df_program` that ticked at least one choice, with one
        column per choice (named after the choice label) plus "language".
    """
    question_cols = [col for col in df.columns if col_name in col]
    # The label is whatever follows the last " - " separator. Splitting on a bare
    # "-" would cut a label that contains a hyphen (e.g. "Scikit-learn" -> "learn").
    labels = [col.rsplit(" - ", 1)[-1].strip() for col in question_cols]
    answers = df_program[question_cols + ['language']].rename(columns=dict(zip(question_cols, labels)))
    # Drop the rows that ticked none of the choices.
    return answers.dropna(how='all', subset=labels)

def show_bar(df, title, yaxis_title):
    """Compare how often each choice is ticked across the three language groups.

    Parameters
    ----------
    df : pd.DataFrame
        One column per choice (non-null means "ticked") plus a "language" column,
        as returned by calc_class_categorical.
    title : str
        Prefix of the figure title.
    yaxis_title : str
        Name of the thing being compared (e.g. "IDE"). The choices run along the
        x-axis, so this text labels the x-axis; the parameter name is kept for
        existing callers.
    """
    # "language" only identifies the group. Leaving it in would count it as one
    # more choice and inflate every denominator.
    choices = df.drop(columns="language")

    def _share(languages):
        """Fraction of all ticks in the group that went to each choice."""
        counts = choices[df["language"].isin(languages)].count()
        return counts / counts.sum()

    data_ = _share(date_science)
    web_ = _share(web_application)
    java_ = _share(java)

    fig = go.Figure(data=[
        go.Bar(name="data_science_language", x=data_.index, y=data_.values),
        go.Bar(name='web_application_language', x=web_.index, y=web_.values),
        go.Bar(name='java', x=java_.index, y=java_.values)

    ])

    fig.update_layout(
        barmode='group',
        title=f"{title} by language used",
        # Choices are on x and shares on y (the two titles used to be swapped).
        xaxis_title=yaxis_title,
        yaxis_title='percentage',
    )
    fig.show()

## x development environments (IDE)

In [None]:
# IDE usage compared across the three language groups.
IDE_QUESTION = "Which of the following integrated development environments (IDE's) do you use on a regular basis?"
dfs = calc_class_categorical(IDE_QUESTION)
show_bar(dfs, title="IDE", yaxis_title="IDE")

## x visualization libraries 

In [None]:
# Visualization-library usage compared across the three language groups.
VISUALIZATION_QUESTION = "What data visualization libraries or tools do you use on a regular basis?"
dfs = calc_class_categorical(VISUALIZATION_QUESTION)
show_bar(dfs, title="visualization libraries", yaxis_title="library")

## x framework

In [None]:
# Machine-learning framework usage compared across the three language groups.
FRAMEWORK_QUESTION = "Which of the following machine learning frameworks do you use on a regular basis?"
dfs = calc_class_categorical(FRAMEWORK_QUESTION)
show_bar(dfs, title="framework", yaxis_title="framework")

## x cloud computing platforms

In [None]:
# Cloud-platform usage compared across the three language groups.
CLOUD_QUESTION = "Which of the following cloud computing platforms do you use on a regular basis?"
dfs = calc_class_categorical(CLOUD_QUESTION)
show_bar(dfs, title="platform", yaxis_title="platform")

Because the survey respondents are Kaggle users, it is difficult to separate the languages cleanly. To get a clearer picture, app developers should be added to the survey audience.  
However, it is surprising that there are users of languages other than Python, R, and SQL, which are the mainstream choices for analysis.