# **Some analysis using kaggle 2021 survey responses**
## Tool network analysis and spatio-temporal visualization of global participants, inspired by course projects and with reference to the work of other Kagglers.

# Import packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from wordcloud import WordCloud

# Load the data set

In [None]:
df_2021=pd.read_csv(r'../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv',encoding='ISO-8859-1')

# Take a brief look at the data set.

In [None]:
df_2021.head(5)

# Generate some networks

## In fact, each participant can use more than one language and a variety of libraries; each job needs to be used on different platforms. As a result, these languages and algorithms are related to each other.
## These connections may result from the multiple skills required for the same job, or from similarities between the algorithms or the languages themselves. Either way, they are related, and knowing these relations may help us plan better what to learn.
## Thus, here we construct a network for each topic from the individual survey records to find these relations. They are not large-scale complex networks, so even if we skip their topological characteristics, there are quite a few intuitive insights to be had from these visualizations, with node size showing the number of users and edges representing the combined use of tools by the same participant.

In [None]:
# Positional column slices of the 2021 responses, one per multi-select topic.
# Later cells relabel each slice's columns with the option names:
# programming languages, visualization libraries, ML algorithms and ML frameworks.
# NOTE(review): the slices are positional, so they assume the exact column
# order of the 2021 CSV — confirm if the input file changes.
df_lang = df_2021.iloc[:,7:18]
df_vis = df_2021.iloc[:,59:69]
df_algo = df_2021.iloc[:,90:100]
df_frame = df_2021.iloc[:,72:88]

In [None]:
df_lang.head(5)

In [None]:
df_lang.columns=['Python','R','SQL','C','C++','Java','Javascript','Julia','Swift','Bash','MATLAB']

In [None]:
# Selection count per programming language.
# For every column, take the count of its most frequent value (assumed to be
# the option's own answer label). `.iloc[0]` is used because the value_counts
# index is made of strings: a plain `[0]` lookup on it relies on pandas'
# deprecated positional fallback.
col1 = df_lang.columns
col2 = [df_lang[label].value_counts().iloc[0] for label in col1]
languages = pd.DataFrame({'Lang': col1, 'Count': col2})


In [None]:
# Co-usage network of programming languages.
# Edges come from respondents who selected exactly two options; edge width is
# proportional to the number of respondents with that pair and node size is the
# number of times the option was selected overall.
selected = df_lang.stack().dropna()  # explicit dropna: keep only ticked options
c = selected.groupby(level=0).apply(tuple).value_counts()
out = [combo + (freq,) for combo, freq in c.items() if len(combo) == 2]
lang_net = pd.DataFrame(out, columns=['Lang 1', 'Lang 2', 'Count'])
g = nx.from_pandas_edgelist(lang_net, source='Lang 1', target='Lang 2')

# networkx applies node_size / width in g.nodes() / g.edges() order, which is
# not the row order of the tables, so look every value up by node / node pair.
usage = selected.value_counts()
node_sizes = [usage[node] for node in g.nodes()]
pair_count = {frozenset((a, b)): freq for a, b, freq in out}
edge_widths = [pair_count[frozenset(edge)] * 0.01 for edge in g.edges()]

cmap = plt.cm.RdYlGn
colors = list(range(g.number_of_nodes()))
k = 0.35
pos = nx.spring_layout(g, k=k)
# Pass the computed layout so that the spacing parameter k actually takes effect.
nx.draw_networkx(g, pos=pos, node_size=node_sizes, cmap=cmap, node_color=colors,
                 edge_color='grey', font_size=22, width=edge_widths)
nx.write_gexf(g, 'language.gexf')
plt.title('Languages Network', size=50)
plt.gcf().set_size_inches(25, 20)

## We can clearly see that Python is used by most data scientists and is combined with many other languages. This shows the popularity and versatility of Python. Besides, from the width of the edges, we see that Python's major links are with R and SQL. This makes sense because these data munging and analysis tools are the most frequently used tool-set, and they are efficient together and complement each other.

In [None]:
df_algo.head(5)

In [None]:
df_algo.columns=['Linear or Logistic Regression','Decision Trees or Random Forests','Gradient Boosting Machines (xgboost, lightgbm, etc)','Bayesian Approaches','Evolutionary Approaches'
,'Dense Neural Networks (MLPs, etc)','Convolutional Neural Networks','Generative Adversarial Networks','Recurrent Neural Networks','Transformer Networks (BERT, gpt-3, etc)']

In [None]:
# Selection count per machine-learning algorithm family.
# For every column, take the count of its most frequent value (assumed to be
# the option's own answer label). `.iloc[0]` is used because the value_counts
# index is made of strings: a plain `[0]` lookup on it relies on pandas'
# deprecated positional fallback.
col3 = df_algo.columns
col4 = [df_algo[label].value_counts().iloc[0] for label in col3]
algorithms = pd.DataFrame({'Algo': col3, 'Count': col4})

In [None]:
# Co-usage network of machine-learning algorithms.
# Edges come from respondents who selected exactly two options; edge width is
# proportional to the number of respondents with that pair and node size is the
# number of times the option was selected overall.
selected = df_algo.stack().dropna()  # explicit dropna: keep only ticked options
c = selected.groupby(level=0).apply(tuple).value_counts()
out = [combo + (freq,) for combo, freq in c.items() if len(combo) == 2]
algo_net = pd.DataFrame(out, columns=['Algo 1', 'Algo 2', 'Count'])
g = nx.from_pandas_edgelist(algo_net, source='Algo 1', target='Algo 2')

# networkx applies node_size / width in g.nodes() / g.edges() order, which is
# not the row order of the tables, so look every value up by node / node pair.
usage = selected.value_counts()
node_sizes = [usage[node] for node in g.nodes()]
pair_count = {frozenset((a, b)): freq for a, b, freq in out}
edge_widths = [pair_count[frozenset(edge)] * 0.01 for edge in g.edges()]

cmap = plt.cm.RdYlGn
colors = list(range(g.number_of_nodes()))
k = 0.35
pos = nx.spring_layout(g, k=k)
# Pass the computed layout so that the spacing parameter k actually takes effect.
nx.draw_networkx(g, pos=pos, node_size=node_sizes, cmap=cmap, node_color=colors,
                 edge_color='grey', font_size=22, width=edge_widths)
nx.write_gexf(g, 'algorithm.gexf')
plt.title('Algorithm Network', size=50)
plt.gcf().set_size_inches(25, 20)

In [None]:
df_frame.head(5)

In [None]:
df_frame.columns=['Scikit-learn','TensorFlow','Keras','PyTorch','Fast.ai','MXNet','Xgboost','LightGBM','CatBoost','Prophet','H2O 3','Caret','Tidymodels','JAX','PyTorch Lightning','Huggingface']

In [None]:
# Selection count per machine-learning framework.
# For every column, take the count of its most frequent value (assumed to be
# the option's own answer label). `.iloc[0]` is used because the value_counts
# index is made of strings: a plain `[0]` lookup on it relies on pandas'
# deprecated positional fallback.
col5 = df_frame.columns
col6 = [df_frame[label].value_counts().iloc[0] for label in col5]
framework = pd.DataFrame({'Frame': col5, 'Count': col6})

In [None]:
# Co-usage network of machine-learning frameworks.
# Edges come from respondents who selected exactly two options; edge width is
# proportional to the number of respondents with that pair and node size is the
# number of times the option was selected overall.
selected = df_frame.stack().dropna()  # explicit dropna: keep only ticked options
c = selected.groupby(level=0).apply(tuple).value_counts()
out = [combo + (freq,) for combo, freq in c.items() if len(combo) == 2]
frame_net = pd.DataFrame(out, columns=['Frame 1', 'Frame 2', 'Count'])
g = nx.from_pandas_edgelist(frame_net, source='Frame 1', target='Frame 2')

# networkx applies node_size / width in g.nodes() / g.edges() order, which is
# not the row order of the tables, so look every value up by node / node pair.
usage = selected.value_counts()
node_sizes = [usage[node] for node in g.nodes()]
pair_count = {frozenset((a, b)): freq for a, b, freq in out}
edge_widths = [pair_count[frozenset(edge)] * 0.01 for edge in g.edges()]

cmap = plt.cm.RdYlGn
colors = list(range(g.number_of_nodes()))
k = 0.35
pos = nx.spring_layout(g, k=k)
# Pass the computed layout so that the spacing parameter k actually takes effect.
nx.draw_networkx(g, pos=pos, node_size=node_sizes, cmap=cmap, node_color=colors,
                 edge_color='grey', font_size=22, width=edge_widths)
nx.write_gexf(g, 'framework.gexf')
plt.title('Framework Network', size=50)
plt.gcf().set_size_inches(25, 20)

In [None]:
df_vis.head(5)

In [None]:
df_vis.columns = ['Matplotlib','Seaborn','Plotly / Plotly Express','Ggplot / ggplot2','Shiny','D3 js','Altair','Bokeh','Geoplotlib','Leaflet / Folium']

In [None]:
# Selection count per visualization library.
# For every column, take the count of its most frequent value (assumed to be
# the option's own answer label). `.iloc[0]` is used because the value_counts
# index is made of strings: a plain `[0]` lookup on it relies on pandas'
# deprecated positional fallback.
col7 = df_vis.columns
col8 = [df_vis[label].value_counts().iloc[0] for label in col7]
visualization = pd.DataFrame({'Frame': col7, 'Count': col8})

In [None]:
# Co-usage network of visualization libraries.
# Edges come from respondents who selected exactly two options; edge width is
# proportional to the number of respondents with that pair and node size is the
# number of times the option was selected overall.
selected = df_vis.stack().dropna()  # explicit dropna: keep only ticked options
c = selected.groupby(level=0).apply(tuple).value_counts()
out = [combo + (freq,) for combo, freq in c.items() if len(combo) == 2]
vis_net = pd.DataFrame(out, columns=['Vis 1', 'Vis 2', 'Count'])
g = nx.from_pandas_edgelist(vis_net, source='Vis 1', target='Vis 2')

# networkx applies node_size / width in g.nodes() / g.edges() order, which is
# not the row order of the tables, so look every value up by node / node pair.
usage = selected.value_counts()
node_sizes = [usage[node] for node in g.nodes()]
pair_count = {frozenset((a, b)): freq for a, b, freq in out}
edge_widths = [pair_count[frozenset(edge)] * 0.01 for edge in g.edges()]

cmap = plt.cm.RdYlGn
colors = list(range(g.number_of_nodes()))
k = 0.35
pos = nx.spring_layout(g, k=k)
# Pass the computed layout so that the spacing parameter k actually takes effect.
nx.draw_networkx(g, pos=pos, node_size=node_sizes, cmap=cmap, node_color=colors,
                 edge_color='grey', font_size=22, width=edge_widths)
nx.write_gexf(g, 'visualization.gexf')
plt.title('Visualization Tools Network', size=50)
plt.gcf().set_size_inches(25, 20)

## There are several big nodes sharing the popularity: Matplotlib, Seaborn, Plotly / Plotly Express and Ggplot / ggplot2. The first three come from Python and Ggplot comes from R; both Python and R are widely used programming languages and they are often used together, as discussed before, so the result here seems fairly reasonable. One thing that is easy to notice is the thick edge between Matplotlib and Seaborn, the two most popular visualization packages in Python, which are very often used together according to the result. This makes sense not only from the survey but also from our own experience, considering how they complement each other.

In [None]:
# Positional column slices of the 2021 responses for three more multi-select
# topics; later cells relabel them as development environments, sharing
# platforms and learning platforms.
# NOTE(review): the slices are positional, so they assume the exact column
# order of the 2021 CSV — confirm if the input file changes.
df_env = df_2021.iloc[:,21:32]
df_share = df_2021.iloc[:,233:241]
df_start = df_2021.iloc[:,243:253]

In [None]:
df_env.columns = ['Jupyter (JupyterLab, Jupyter Notebooks, etc)','RStudio','Visual Studio','Visual Studio Code (VSCode)','PyCharm','Spyder','Notepad++','Sublime Text','Vim / Emacs','MATLAB','Jupyter Notebook']

In [None]:
# Selection count per development environment.
# For every column, take the count of its most frequent value (assumed to be
# the option's own answer label). `.iloc[0]` is used because the value_counts
# index is made of strings: a plain `[0]` lookup on it relies on pandas'
# deprecated positional fallback.
col9 = df_env.columns
col10 = [df_env[label].value_counts().iloc[0] for label in col9]

environment = pd.DataFrame({'Env': col9, 'Count': col10})

In [None]:
# Co-usage network of development environments.
# Edges come from respondents who selected exactly two options; edge width is
# proportional to the number of respondents with that pair and node size is the
# number of times the option was selected overall.
selected = df_env.stack().dropna()  # explicit dropna: keep only ticked options
c = selected.groupby(level=0).apply(tuple).value_counts()
out = [combo + (freq,) for combo, freq in c.items() if len(combo) == 2]
env_net = pd.DataFrame(out, columns=['Env 1', 'Env 2', 'Count'])
g_env = nx.from_pandas_edgelist(env_net, source='Env 1', target='Env 2')

# networkx applies node_size / width in nodes() / edges() order, which is not
# the row order of the tables, so look every value up by node / node pair.
usage = selected.value_counts()
node_sizes = [usage[node] for node in g_env.nodes()]
pair_count = {frozenset((a, b)): freq for a, b, freq in out}
edge_widths = [pair_count[frozenset(edge)] * 0.01 for edge in g_env.edges()]

cmap = plt.cm.RdYlGn
colors = list(range(g_env.number_of_nodes()))
k = 0.35
pos = nx.spring_layout(g_env, k=k)
# Pass the computed layout so that the spacing parameter k actually takes effect.
nx.draw_networkx(g_env, pos=pos, node_size=node_sizes, cmap=cmap, node_color=colors,
                 edge_color='grey', font_size=22, width=edge_widths)
# Own output file: writing to 'visualization.gexf' here would overwrite the
# export of the visualization-library network.
nx.write_gexf(g_env, 'environment.gexf')
plt.title('Develop Environment Network', size=50)
plt.gcf().set_size_inches(25, 20)

In [None]:
df_share.columns = ['Plotly Dash','Streamlit','NBViewer','GitHub','Personal blog','Kaggle','Colab','Shiny']

In [None]:
# Selection count per sharing platform.
# For every column, take the count of its most frequent value (assumed to be
# the option's own answer label). `.iloc[0]` is used because the value_counts
# index is made of strings: a plain `[0]` lookup on it relies on pandas'
# deprecated positional fallback.
col11 = df_share.columns
col12 = [df_share[label].value_counts().iloc[0] for label in col11]

share = pd.DataFrame({'Share': col11, 'Count': col12})

In [None]:
# Co-usage network of sharing platforms.
# Edges come from respondents who selected exactly two options; edge width is
# proportional to the number of respondents with that pair and node size is the
# number of times the option was selected overall.
selected = df_share.stack().dropna()  # explicit dropna: keep only ticked options
c = selected.groupby(level=0).apply(tuple).value_counts()
out = [combo + (freq,) for combo, freq in c.items() if len(combo) == 2]
share_net = pd.DataFrame(out, columns=['Share 1', 'Share 2', 'Count'])
g_share = nx.from_pandas_edgelist(share_net, source='Share 1', target='Share 2')

# Node sizes are derived from the data (instead of a hard-coded list) and, like
# the edge widths, are looked up in nodes() / edges() order, which is the order
# networkx applies them in.
usage = selected.value_counts()
node_sizes = [usage[node] for node in g_share.nodes()]
pair_count = {frozenset((a, b)): freq for a, b, freq in out}
edge_widths = [pair_count[frozenset(edge)] * 0.01 for edge in g_share.edges()]

cmap = plt.cm.RdYlGn
colors = list(range(g_share.number_of_nodes()))
k = 0.35
pos = nx.spring_layout(g_share, k=k)
# Pass the computed layout so that the spacing parameter k actually takes effect.
nx.draw_networkx(g_share, pos=pos, node_size=node_sizes, cmap=cmap, node_color=colors,
                 edge_color='grey', font_size=22, width=edge_widths)
nx.write_gexf(g_share, 'share.gexf')
plt.title('Sharing Platform Network', size=30)
plt.gcf().set_size_inches(15, 10)

In [None]:
df_start.columns = ['Coursera','edX','Kaggle Learn Courses','DataCamp','Fast.ai','Udacity','Udemy','LinkedIn Learning','Cloud-certification programs','University Courses']

In [None]:
# Selection count per learning platform.
# For every column, take the count of its most frequent value (assumed to be
# the option's own answer label). `.iloc[0]` is used because the value_counts
# index is made of strings: a plain `[0]` lookup on it relies on pandas'
# deprecated positional fallback.
col13 = df_start.columns
col14 = [df_start[label].value_counts().iloc[0] for label in col13]

start = pd.DataFrame({'Share': col13, 'Count': col14})

In [None]:
# Co-usage network of learning platforms.
# Edges come from respondents who selected exactly two options; edge width is
# proportional to the number of respondents with that pair and node size is the
# number of times the option was selected overall.
selected = df_start.stack().dropna()  # explicit dropna: keep only ticked options
c = selected.groupby(level=0).apply(tuple).value_counts()
out = [combo + (freq,) for combo, freq in c.items() if len(combo) == 2]
start_net = pd.DataFrame(out, columns=['Start 1', 'Start 2', 'Count'])
g_start = nx.from_pandas_edgelist(start_net, source='Start 1', target='Start 2')

# networkx applies node_size / width in nodes() / edges() order, which is not
# the row order of the tables, so look every value up by node / node pair.
usage = selected.value_counts()
node_sizes = [usage[node] for node in g_start.nodes()]
pair_count = {frozenset((a, b)): freq for a, b, freq in out}
edge_widths = [pair_count[frozenset(edge)] * 0.01 for edge in g_start.edges()]

cmap = plt.cm.RdYlGn
colors = list(range(g_start.number_of_nodes()))
k = 0.35
pos = nx.spring_layout(g_start, k=k)
# Pass the computed layout so that the spacing parameter k actually takes effect.
nx.draw_networkx(g_start, pos=pos, node_size=node_sizes, cmap=cmap, node_color=colors,
                 edge_color='grey', font_size=22, width=edge_widths)
nx.write_gexf(g_start, 'start.gexf')
plt.title('Starting Platform Network', size=50)
plt.gcf().set_size_inches(25, 20)

## The other networks can be read in much the same way: they show the popular development environments, machine learning algorithms, data science sharing platforms and learning platforms, which tells you where to find other people's data science projects and learning materials. At the same time, they also illustrate which combinations of tools are preferred within each topic.

# Generate a wordcloud

In [None]:
languages.columns = ['Text','Count']
algorithms.columns = ['Text','Count']
framework.columns = ['Text','Count']
visualization.columns = ['Text','Count']
environment.columns = ['Text','Count']
share.columns = ['Text','Count']
start.columns = ['Text','Count']

In [None]:
text_dic = pd.concat([languages,algorithms,framework,visualization,environment,share,start])

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Draw a word cloud in which every tool name is weighted by its selection count.
wc = WordCloud(background_color='white')
fp = text_dic
name = [str(word) for word in fp.Text]  # Word
value = fp.Count  # Frequency
# A label that occurs in several topic tables keeps only its last count,
# because later dictionary entries replace earlier ones.
dic = {word: freq for word, freq in zip(name, value)}
wc.generate_from_frequencies(dic)  # generate wordcloud

plt.imshow(wc)
plt.axis("off")
plt.show()
wc.to_file('wordcloud.png')


## The wordcloud is a vivid presentation of all topics above about what appears most frequently in this research, which is pretty consistent with the visualizations we have before, and shows us a world of people in the field of data science.

# Visualize the global data scientists and the change in year 2017 - 2021

In [None]:
# Load the survey responses of every year from its own Kaggle input directory.
# For 2018-2021 the first data row is dropped with iloc[1:] (presumably the
# question-text row that follows the header — confirm against the CSVs);
# the 2017 file is used exactly as read.
# Note that df_2021 is rebound here: earlier cells used the 2021 frame with
# its first row still included.
df_2021_raw = pd.read_csv(r'../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv',encoding='ISO-8859-1')
df_2021 = df_2021_raw.iloc[1:,:]
df_2020_raw = pd.read_csv(r'../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv',encoding='ISO-8859-1')
df_2020 = df_2020_raw.iloc[1:,:]
df_2019_raw = pd.read_csv(r'../input/kaggle-survey-2019/multiple_choice_responses.csv',encoding='ISO-8859-1')
df_2019 = df_2019_raw.iloc[1:,:]
df_2018_raw = pd.read_csv(r'../input/kaggle-survey-2018/multipleChoiceResponses.csv',encoding='ISO-8859-1')
df_2018 = df_2018_raw.iloc[1:,:]
df_2017 = pd.read_csv(r'../input/kaggle-survey-2017/multipleChoiceResponses.csv',encoding='ISO-8859-1')


In [None]:
df_2017.head(5)

In [None]:
import plotly.offline as py
py.offline.init_notebook_mode()

def plot_in_map(locations, counts, title, zmin=0, zmax=8000):
    """Show a world choropleth coloured by a per-country value.

    Parameters
    ----------
    locations : sequence of str
        Country names; they are matched using Plotly's 'country names'
        location mode.
    counts : sequence of numbers
        One value per entry of ``locations``; it drives the colour.
    title : str
        Figure title.
    zmin, zmax : number, optional
        Bounds of the colour scale. The defaults (0 and 8000) are the fixed
        range the notebook used originally, which keeps maps of different
        years on the same scale.

    Returns
    -------
    None
        The figure is displayed through ``py.iplot``.
    """
    trace = dict(
        type='choropleth',
        locations=locations,
        z=counts,
        zmax=zmax,
        zmin=zmin,
        locationmode='country names',
        autocolorscale=True,
        marker=dict(line=dict(color='rgb(58,100,69)', width=0.6)),
        # NOTE(review): 'autotick' may not exist in current Plotly schemas; it
        # is only passed through because validation is disabled — confirm.
        colorbar=dict(autotick=True, tickprefix='', title='Number of participants'),
    )
    layout = dict(
        title=title,
        geo=dict(
            showframe=True,
            showcoastlines=True,
            projection=dict(type='equirectangular'),
            # NOTE(review): 'margin' sits inside 'geo' as in the original
            # figure; it looks like a layout-level setting — confirm whether
            # it has any effect here.
            margin=dict(b=0, t=0, l=0, r=0),
        ),
    )

    fig = dict(data=[trace], layout=layout)

    # validate=False: the figure dictionary is sent without schema validation.
    py.iplot(fig, validate=False, filename='world-map')

In [None]:
# 2021: number of respondents per distinct 'Q3' value, drawn on the world map.
# NOTE(review): 'Q3' is assumed to hold the respondent's country — confirm.
z_2021 = df_2021['Q3'].value_counts()
plot_in_map(locations=z_2021.index,
            counts=z_2021.values,
            title='Participants by Country 2021')

In [None]:
# 2020: number of respondents per distinct 'Q3' value, drawn on the world map.
# NOTE(review): 'Q3' is assumed to hold the respondent's country — confirm.
z_2020 = df_2020['Q3'].value_counts()
plot_in_map(locations=z_2020.index,
            counts=z_2020.values,
            title='Participants by Country 2020')

In [None]:
# 2019: number of respondents per distinct 'Q3' value, drawn on the world map.
# NOTE(review): 'Q3' is assumed to hold the respondent's country — confirm.
z_2019 = df_2019['Q3'].value_counts()
plot_in_map(locations=z_2019.index,
            counts=z_2019.values,
            title='Participants by Country 2019')

In [None]:
# 2018: number of respondents per distinct 'Q3' value, drawn on the world map.
# NOTE(review): 'Q3' is assumed to hold the respondent's country — confirm.
z_2018 = df_2018['Q3'].value_counts()
plot_in_map(locations=z_2018.index,
            counts=z_2018.values,
            title='Participants by Country 2018')

In [None]:
# 2017: number of respondents per distinct 'Country' value, drawn on the world
# map (this year's file uses a 'Country' column instead of 'Q3').
z_2017 = df_2017['Country'].value_counts()
plot_in_map(locations=z_2017.index,
            counts=z_2017.values,
            title='Participants by Country 2017')

## We constructed global heat maps to explore how the number of data scientists changed over several years. The spatio-temporal maps above show the global distribution of data scientists from 2017 to 2021, based on the number of participants from each country. There is no obvious increase for most countries except India, which is counterintuitive; it may be related to the variation in the number of participants in each year's survey.