In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from plotly import subplots
import plotly.figure_factory as ff
import matplotlib.pyplot as plt

from pandas_profiling import ProfileReport
import seaborn as sns
from sklearn import metrics
from scipy import stats

from copy import deepcopy

In [None]:
# Shared colour palette used by the charts in this notebook.
primary_bgcolor = "#f4f0ea"
primary_black = "#202022"
primary_grey = "#c6ccd8"
primary_blue3 = "#3f4d63"
primary_blue2 = "#85a1c1"
primary_blue = "#496595"

# Third entry of Plotly's qualitative palette.
primary_green = px.colors.qualitative.Plotly[2]

# Matplotlib axes use the same background colour as the palette.
plt.rcParams['axes.facecolor'] = primary_bgcolor

# Preview all palette entries as a strip of swatches.
colors = [
    primary_blue,
    primary_blue2,
    primary_blue3,
    primary_grey,
    primary_black,
    primary_bgcolor,
    primary_green,
]
sns.palplot(sns.color_palette(colors))

In [None]:
# Load the raw survey responses and preview the first rows.
df = pd.read_csv(
    '../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv'
)
df.head()

In [None]:
# Keep the rows labelled 1 and onwards; the row labelled 0 is dropped
# (presumably the question-text row of the survey export - confirm against
# the preview above).
# .copy() makes `df` an independent frame, so adding columns to it in later
# cells does not trigger SettingWithCopyWarning.
df = df.loc[1:, :].copy()
df.head()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Names of all Q23 columns; they are used below to split professionals from students.
q23c = df.columns[df.columns.str.contains('Q23', regex=False)].tolist()
q23c

In [None]:
# (rows, columns) of the survey frame.
df.shape

In [None]:
# For every respondent, how many of the Q23 columns are empty.
df[q23c].isna().sum(axis='columns')

In [None]:
# Number of Q23 columns; a row with this many nulls answered none of them.
len(q23c)

In [None]:
# Spot-check: the Q23 answers (columns Q23_Part_1 through Q23_OTHER) of the row labelled 20033.
df.loc[20033, "Q23_Part_1":"Q23_OTHER"]

In [None]:
# A respondent is treated as a professional when at least one Q23 column is filled in.
answered_q23 = df[q23c].notna().any(axis=1)
df_prof = df[answered_q23]
df.loc[answered_q23, 'Prof_Stud'] = 'Professional'

# Students are identified through Q5; this label is written last, so it
# overrides 'Professional' for rows matching both conditions.
is_student = df['Q5'] == 'Student'
df_stud = df[is_student]
df.loc[is_student, 'Prof_Stud'] = 'Student'

In [None]:
# Preview of the student subset.
df_stud.head()

In [None]:
# Preview of the professional subset.
df_prof.head()

<h1>Age</h1>

In [None]:
# value_counts() puts the Q1 answers in the index; reset_index() moves them
# into a regular column next to their counts.
df['Q1'].value_counts().reset_index()

In [None]:
# Respondent count per Q1 age bracket, ordered by bracket label.
# Both columns are named explicitly via rename_axis()/reset_index(name=...)
# instead of relying on the default 'index' column, whose name depends on
# the pandas version (pandas >= 2.0 no longer produces it).
q = (
    df['Q1']
    .value_counts()
    .sort_index()
    .rename_axis('ages')
    .reset_index(name='value')
)

# Line chart with a marker and a count label at every age bracket.
fig = go.Figure(go.Scatter(
                x=q['ages'],
                y=q['value'],
                mode='lines+markers+text',
                text=q['value'],
                textposition='top right'
))

# Keep the x axis in ascending category order.
fig.update_layout(
    xaxis={'categoryorder':'category ascending'})

fig.show()

<h1>Gender</h1>

In [None]:
# Raw Q2 answers, shown for inspection.
df['Q2']

In [None]:
# Q2 answer counts, sorted ascending by count.
q = df['Q2'].value_counts().sort_values()

# Horizontal bars, each labelled with its count.
gender_bars = go.Bar(
    x=q.values,
    y=q.index,
    orientation='h',
    text=q.values,
    textposition="auto",
)
fig = go.Figure(gender_bars)

# Thin black outline and slight transparency on every bar.
fig.update_traces(marker_line_color='black', marker_line_width=1, opacity=0.8)
fig.show()

<h2>Country</h2>

In [None]:
# The ten most frequent Q3 answers with their counts.
df['Q3'].value_counts().head(10).reset_index()

In [None]:
# Ten most common Q3 countries (value_counts() sorts in descending order).
# Both columns are named explicitly because the default column names produced
# by value_counts().reset_index() differ between pandas versions.
q3 = (
    df['Q3']
    .value_counts()
    .head(10)
    .rename_axis('index')
    .reset_index(name='Q3')
)
# Shorten the two long official names before joining on the lookup table's
# `country` column (presumably the lookup uses the short spellings - confirm).
q3['index'] = q3['index'].replace({
    'United States of America': 'United States',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
})

# Left join: every one of the ten countries is kept even if the lookup table
# has no matching row (its flag URL is then missing).
regions_df = pd.read_csv('/kaggle/input/countries-iso-codes-continent-flags-url/countries_continents_codes_flags_url.csv')
q3 = q3.merge(regions_df, left_on='index', right_on='country', how='left')

fig = go.Figure(go.Bar(
                x=q3['index'],
                y=q3['Q3'],
                # Label every bar with its own respondent count.
                text=q3['Q3'],
                textposition="auto",
                texttemplate = "%{value:,s}",))

fig.update_traces(
    marker_color='rgb(158,202,225)',
    marker_line_color='rgb(8,48,107)',
    marker_line_width=1.5,
    opacity=0.6,
)

# Place each country's flag image above its bar; vertical offsets and image
# height are expressed as fractions of the tallest bar.
max_y_val = q3['Q3'].max()
for country, flag_url, ppl_vac in zip(q3['index'], q3['image_url'], q3['Q3']):
    # Skip countries without a usable flag URL (missing values are not strings).
    if not isinstance(flag_url, str) or not flag_url:
        continue
    fig.add_layout_image(
        dict(
            source=flag_url,
            x=country,
            y=ppl_vac + 0.1 * max_y_val,
            sizex=0.5,
            sizey=0.08 * max_y_val,
            xanchor="center", yanchor="bottom",
            sizing='stretch',
            xref='x',
            yref="y",
        ),
    )

# Extend the y range by 20% of the tallest bar so the flags fit inside the plot.
fig.update_yaxes(range=[0, max_y_val + 0.2*max_y_val])
fig.update_layout(
    title = 'Top 10: Residence country',
)
fig.show()

<h2>Education</h2>

In [None]:
# Pie chart of the (up to ten) most frequent Q4 answers.
q4 = df['Q4'].value_counts()[:10]

# `pull` has a single entry, so only the first slice is pulled out of the pie.
education_pie = go.Pie(labels=q4.index, values=q4.values, pull=[0.1])
fig = go.Figure(education_pie)

# Show raw counts on the slices, label and percentage on hover.
slice_outline = dict(line=dict(color='#000000', width=1))
fig.update_traces(
    hoverinfo='label+percent',
    textinfo='value',
    textfont_size=15,
    marker=slice_outline,
)
fig.update_layout(title='Kagglers education')
fig.show()

In [None]:
# Bar chart of Q5 answer counts, one palette colour per bar.
q5 = df['Q5'].value_counts()

# Note: this rebinds the notebook-level name `colors` to Plotly's Light24 palette.
colors = px.colors.qualitative.Light24

q5_bars = go.Bar(
    x=q5.index,
    y=q5.values,
    text=q5.values,
    textposition="auto",
    texttemplate="%{value:,s}",
    marker_color=colors,
)
fig = go.Figure(q5_bars)

# Thin black outline and slight transparency on every bar.
fig.update_traces(marker_line_color='black', marker_line_width=1, opacity=0.8)
fig.show()

In [None]:
# Raw Q5 answers, shown for inspection.
df['Q5']

In [None]:
#https://plotly.com/python/horizontal-bar-charts/
# Q4 answer counts for every Q5 value: one column per Q5 value, one row per
# Q4 answer. Missing Q5 answers are skipped because they cannot match any row.
fields = df['Q5'].dropna().unique()

colours_coffe = ["#a2885e", "#e9cf87", "#f1efd9", "#8eb3aa", "#235f83", primary_blue3]

# Building the frame from a dict of Series uses the union of all Q4 answers as
# the index. Assigning the columns one by one to an empty frame would instead
# keep only the Q4 answers present for the first Q5 value and silently drop
# the rest.
education_df = pd.DataFrame({
    field: df.loc[df['Q5'] == field, 'Q4'].value_counts()
    for field in fields
})
education_df

In [None]:
# Spot-check: Q4 answer counts for respondents whose Q5 answer is 'Student'.
df[df["Q5"]=='Student']['Q4'].value_counts()

In [None]:
# Remove columns that contain no counts at all.
education_df = education_df.dropna(axis=1, how='all')
# Remove the 'I prefer not to answer' row. errors='ignore' keeps this cell
# re-runnable: on a second run the row is already gone and a plain drop()
# would raise KeyError.
education_df = education_df.drop('I prefer not to answer', errors='ignore')
# Turn the counts into shares so that every column sums to 1.
education_df = education_df / education_df.sum()
education_df

In [None]:
# Order the rows by qualification level (highest first).
qualification_order = [
    "Doctoral degree",
    "Master’s degree",
    "Professional degree",
    "Bachelor’s degree",
    "Some college/university study without earning a bachelor’s degree",
    "No formal education past high school",
]
education_df = education_df.reindex(qualification_order)
education_df

In [None]:
# Row labels after reordering.
education_df.index

In [None]:
# Values of the 'Doctoral degree' row across all columns.
education_df.loc['Doctoral degree']

In [None]:
# Stacked horizontal bar chart of education_df with a heatmap strip on top
# that serves as the colour key.

# One horizontal bar trace per row of education_df: the y values are the
# frame's columns, the x values are that row's entries. Colours are taken from
# colours_coffe by row position.
traces = []
for i, field in enumerate(education_df.index):
    traces.append(go.Bar(
        y = education_df.columns,
        x = education_df.loc[field],
        name = field,
        marker = dict(color = colours_coffe[i]),
        orientation = "h"
    ))
    
# Layout applied to the two-row subplot figure built below.
# NOTE(review): xaxis/yaxis presumably address the row-1 subplot (colour key)
# and xaxis2/yaxis2 the row-2 subplot (bars), following plotly's default
# subplot axis numbering - confirm.
large_title_format = "<span style='font-size:30px; font-family:Times New Roman'>What educational qualifications do I need?</span>"
layout = dict(
    title = dict(text=large_title_format, x=0.5, y=0.963),
    xaxis = dict(
        title="<span style='font-size:13px; font-family:Helvetica'><b>Color Key: </b>Educational qualifications of professionals</span>", 
        side="top",
        title_standoff=0, 
        domain=[0,0.95], 
        showticklabels = False
    ),
    xaxis2 = dict(domain=[0, 1], tickformat = '%'),
    yaxis = dict(domain=[0.85,1], showticklabels = False),
    # Categories on this axis are ordered by their 'Doctoral degree' value, ascending.
    yaxis2={
        'categoryorder':'array',
        'categoryarray': education_df.loc["Doctoral degree"].sort_values(ascending=True).keys(),
        'domain':[0,0.83]
    },
    barmode = "stack",
    bargap = 0.05,
    showlegend = False,
    width = 700,
    height = 600,
    # plot_bgcolor = "#fff"
)

# Adding a separate subplot that will act as a color key
# The colorscale uses pairs of identical stops, giving six flat colour bands
# with the same colours as colours_coffe; z=[[1..6]] puts one cell in each band.
colorscale = ff.create_annotated_heatmap(
    z=[[1,2,3,4,5,6]],
    annotation_text = [[text 
                       for text in ["Doctoral<br>degree","Master's<br>degree","Professional<br>degree","Bachelor's<br>degree","Education<br>without<br>degree","High school<br>education"]
                      ]],
    colorscale= [
        [0.000,"#a2885e"],[0.166,"#a2885e"],
        [0.166,"#e9cf87"],[0.333,"#e9cf87"],
        [0.333,"#f1efd9"],[0.500,"#f1efd9"],
        [0.500,"#8eb3aa"],[0.666,"#8eb3aa"],
        [0.666,"#235f83"],[0.833,"#235f83"],
        [0.833,primary_blue3],[1.000,primary_blue3],
    ],
    font_colors = ["white", "white", "black", "white", "white", "white"],
    xgap = 1.5,
    showscale = False
)

# Two stacked subplots: row 1 receives the colour key, row 2 the bars.
fig = subplots.make_subplots(
    rows=2, 
    cols=1, 
    shared_yaxes=True, 
    shared_xaxes=False, 
    horizontal_spacing=0.02, 
    vertical_spacing=0.01
)

# Colour-key heatmap goes into row 1.
fig.append_trace(colorscale.data[0],1,1); 

# All bar traces go into row 2.
for trace in traces:
    fig.add_trace(trace, 2, 1)

# Workaround to show annotations with ff.create_annotated_heatmap() subplots.
# The heatmap's annotations are copied over and pointed at the 'x'/'y' axes.
annot1 = list(colorscale.layout.annotations)
for k in range(len(annot1)):
    annot1[k]['xref'] = 'x'
    annot1[k]['yref'] = 'y'
fig.update_layout(annotations=annot1) 
fig.update_layout(layout)
fig.show()

In [None]:
# Q6 answer counts; plotted as a pie chart in the next cell.
q6 = df['Q6'].value_counts()
q6

In [None]:
# Donut chart of the Q6 answer counts.
experience_pie = go.Pie(labels=q6.index, values=q6.values, hole=.3)
fig = go.Figure(experience_pie)

# Label and percentage on the slices, count and percentage on hover.
slice_outline = dict(line=dict(color='#000000', width=.7))
fig.update_traces(
    hoverinfo='value+percent',
    textinfo='label+percent',
    textfont_size=13,
    marker=slice_outline,
)
fig.update_layout(title='Kagglers programming experience')
fig.show()

In [None]:
# Inspect the columns at positions 7 to 24 (all rows).
df.iloc[:, 7:25]

In [None]:
# Collect {answer: count} over the twelve Q7 part columns.
# Every part column holds either an answer or NaN, and each part covers a
# different programming language, so the per-column value counts are merged
# into one dictionary.
q = {}
for part in range(1, 13):
    part_counts = df[f'Q7_Part_{part}'].value_counts()
    q.update(dict(part_counts))
print(q)

In [None]:
# Turn the {language: count} dictionary into a two-column frame and attach
# one of the first twelve Light24 palette colours to every row.
q = pd.DataFrame(list(q.items()), columns=['language', 'total'])
q = q.assign(color=px.colors.qualitative.Light24[:12])
q.head()

In [None]:
# Largest totals first.
q = q.sort_values(by='total', ascending=False)
q.head()

In [None]:
# Language names in their current (sorted) order.
q['language']

In [None]:
# Bar chart of the Q7 totals, one bar per language in its assigned colour.
language_bars = go.Bar(
    x=q['language'],
    y=q['total'],
    text=q['total'],
    textposition="auto",
    texttemplate="%{value:,s}",
    marker_color=q['color'],
)
fig = go.Figure(language_bars)

# Thin black outline and slight transparency on every bar.
fig.update_traces(marker_line_color='black', marker_line_width=1, opacity=0.9)
# Order the x categories by bar height, tallest first.
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()

<h2>Language: students vs. professionals</h2>

In [None]:
# Empty frame that a later cell fills with per-group language shares.
df_q7 = pd.DataFrame()
# Names of all columns whose name contains 'Q7'.
q7c = df.columns[df.columns.str.contains('Q7', regex=False)].tolist()
q7c

In [None]:
# Raw values of the first Q7 part column, shown for inspection.
df['Q7_Part_1']

In [None]:
# Share of students and of professionals who selected each Q7 option.
# Every Q7 column is expected to hold a single language name (for respondents
# who picked it) or NaN, so the share is the fraction of non-null entries.
usage_share = {}
for column in q7c:
    answers = df[column].dropna().unique()
    if len(answers) == 0:
        # Nobody selected this option, so there is no language name to label it with.
        continue
    lang = answers[0]
    usage_share[lang] = {
        'Student': df_stud[column].notnull().sum() / len(df_stud),
        'Professional': df_prof[column].notnull().sum() / len(df_prof),
    }

# Build the frame in one step instead of growing it cell by cell. The explicit
# index fixes the row order (Student first, Professional second).
df_q7 = pd.DataFrame(usage_share, index=['Student', 'Professional'])

In [None]:
# Share per group (rows) and language (columns).
df_q7

In [None]:
# The 'Student' row only.
df_q7.loc['Student']

In [None]:
# Transposed view: one row per language, one column per group.
df_q7.T

In [None]:
# Grouped horizontal bars: for every language, the share of students and of
# professionals who use it.
fig = go.Figure()

# One trace per row of df_q7; the trace colour is picked by row position.
for i, category in enumerate(df_q7.index):
    fig.add_trace(go.Bar(
        y = df_q7.columns,
        x = df_q7.loc[category],
        orientation = 'h',
        marker = dict(color = [primary_blue2, primary_blue][i]),
        # Bar labels are the shares as rounded percentages.
        text = np.round(df_q7.loc[category] * 100),
        textposition = 'auto',
        texttemplate = "%{text}%",
        name = category,
    ))

fig.update_layout(
    title = 'Which language should I learn?',
    # Extra top margin leaves a gap below the title.
    margin = dict(t=150),
    legend=dict(
                orientation="h",
                yanchor='top',xanchor='center',
                y= 1.06,x=0.5,
                font=dict(size= 16),
                traceorder='reversed',
               ),
    # Languages are ordered by their 'Professional' share, ascending.
    yaxis={'categoryorder':'array',
           'categoryarray': df_q7.T.sort_values('Professional', ascending=True).index
          },
    xaxis=dict(side="top",showgrid=False, tickformat="%"),
    barmode = "group",
    bargap = 0.05,
    bargroupgap =0.1,
    width = 800,
    height= 1000,
)
fig.show()

In [None]:
# Q8 answer counts as a two-column frame (language, counts).
q8 = (
    df['Q8']
    .value_counts()
    .rename_axis('language')
    .reset_index(name='counts')
)

In [None]:
# Q8 counts before merging.
q8

In [None]:
# Attach the per-language totals and colours held in `q` to the Q8 counts.
# Only the key and count columns of q8 take part in the merge, so re-running
# this cell on the already-merged frame gives the same result instead of
# producing suffixed duplicates (color_x / color_y).
q8 = pd.merge(q, q8[['language', 'counts']], on=["language"])

In [None]:
# Q8 counts after merging with `q`.
q8

In [None]:
# Bar chart of the Q8 counts, reusing the colour assigned to each language.
recommended_bars = go.Bar(
    x=q8['language'],
    y=q8['counts'],
    text=q8['counts'],
    textposition="auto",
    texttemplate="%{value:,s}",
    marker_color=q8['color'],
)
fig = go.Figure()
fig.add_trace(recommended_bars)

# Thin black outline and slight transparency on every bar.
fig.update_traces(marker_line_color='black', marker_line_width=1, opacity=0.9)
# Order the x categories by bar height, tallest first.
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()

In [None]:
# Current contents of `q`, shown for inspection.
q

In [None]:
# Merge the value counts of the ten Q9 part columns into {answer: count},
# then turn them into a frame sorted by count (ascending).
q = {}
for part in range(1, 11):
    part_counts = df[f"Q9_Part_{part}"].value_counts()
    q.update(dict(part_counts))
q = pd.DataFrame(list(q.items()), columns=['ide', 'counts']).sort_values('counts')

In [None]:
# Rank the rows by count and renumber them, so that row label 0 is the most
# frequent entry. Without the renumbering, the labels still refer to the
# insertion order from before sorting, and .loc[0] / .loc[1:3] would colour
# an arbitrary run of rows.
q = q.sort_values('counts', ascending=False).reset_index(drop=True)

# Grey by default; the top entry and the three after it are highlighted.
q['colour'] = primary_grey
q.loc[0, 'colour'] = primary_blue
q.loc[1:3, 'colour'] = primary_blue2

fig = go.Figure(go.Bar(
    x = q['counts'],
    y = q['ide'],
    marker_color = q['colour'],
    orientation='h',
))
# Thin black outline and slight transparency on every bar.
fig.update_traces(
    marker_line_color = 'black',
    marker_line_width = 1,
    opacity = 0.9,
)
# Order the y categories by bar length.
fig.update_layout(
    yaxis={'categoryorder':'total ascending'}
)
fig.show()

In [None]:
# Type of the object returned by value_counts().
type(df["Q10_Part_1"].value_counts())

In [None]:
# Merge the value counts of the thirteen Q10 part columns into {answer: count}
# and turn them into a two-column frame.
q = {}
for part in range(1, 14):
    part_counts = df[f"Q10_Part_{part}"].value_counts()
    q.update(dict(part_counts))
q = pd.DataFrame(list(q.items()), columns=['ide', 'counts'])
q

In [None]:
# Convert the counts into shares of the total.
q['counts'] = q['counts'].div(q['counts'].sum())
# Remove leading and trailing whitespace from the names.
q['ide'] = q['ide'].str.strip()
q

In [None]:
# Products that are folded into a single 'Other' entry.
cloud_notebooks_hubs = [
    'Azure Notebooks',
    'IBM Watson Studio',
    'Amazon Sagemaker Studio',
    'Amazon EMR Notebooks',
    'Databricks Collaborative Notebooks',
    'Paperspace / Gradient',
    'Code Ocean',
]

# Append an 'Other' row whose share is the sum over the listed products.
is_hub = q['ide'].isin(cloud_notebooks_hubs)
q.loc[len(q)] = ['Other', q.loc[is_hub, 'counts'].sum()]

In [None]:
# Drop the individual rows that are now represented by 'Other'.
q = q.loc[~q['ide'].isin(cloud_notebooks_hubs)]

In [None]:
# Renumber the rows 0..n-1 after the filtering above. drop=True discards the
# old labels instead of inserting them as an extra 'index' column, which also
# keeps this cell safe to re-run.
q = q.reset_index(drop=True)
q

In [None]:
# Horizontal bar chart of the shares in `q`, with a text box listing what the
# 'Other' entry contains.

# Grey by default; the row labelled 0 gets primary_blue and the rows labelled
# 1 to 2 get primary_blue2.
q['colour'] = [primary_grey for _ in range(len(q['ide']))]
q.loc[0, 'colour'] = primary_blue
q.loc[1:2, 'colour'] = primary_blue2

fig = go.Figure(go.Bar(
    x = q['counts'],
    y = q['ide'],
    marker_color = q['colour'],
    orientation='h',
))
# Thin black outline and slight transparency on every bar.
fig.update_traces(
    marker_line_color = 'black',
    marker_line_width = 1, 
    opacity = 0.9,
)
# HTML template for the annotation: a bold heading followed by body text.
main_annot_format = "<span style='font-size:12px; font-family:Tahoma;'><b> %s </b><br> %s</span>"
# NOTE(review): the percentages below are hard-coded text, not computed from
# `q` - confirm they still match the data.
# The annotation is positioned in paper coordinates; ax=0, ay=0 gives the
# arrow zero offset from the text.
fig.add_annotation(dict(
        x=0.65,
        y=0.22,
        xref = "paper",
        yref = "paper",
        text= main_annot_format % ("Included in others:                                      ",
                                   """1. Azure Notebooks - 3.4%                           
                                   <br>2. IBM Watson Studio - 3.3%                       
                                   <br>3. Amazon Sagemaker Studio - 1.9%            
                                   <br>4. Databricks Collaborative Notebooks - 1.6%
                                   <br>5. Amazon EMR Notebooks - 1.0%               
                                   <br>6. Paperspace / Gradient - 0.7%                  
                                   <br>7. Code Ocean - 0.4%                                """),
        ax=0, ay=0
))
# Order the y categories by bar length; x axis on top, ticks formatted as percentages.
fig.update_layout(
    yaxis={'categoryorder':'total ascending'},
    xaxis=dict(side="top", zerolinecolor = "#4d4d4d", zerolinewidth = 1, gridcolor="#e7e7e7",tickformat="%"),
)
fig.show()

In [None]:
# Number of non-null Q1 values per (Prof_Stud, Q11) combination.
q = (
    df.groupby(['Prof_Stud', 'Q11'])['Q1']
    .count()
    .reset_index(name='counts')
)
q

In [None]:
# Short display labels for the long Q11 answers.
mapper = {
    'A cloud computing platform (AWS, Azure, GCP, hosted notebooks, etc)':
        'Cloud Platform',
    'A deep learning workstation (NVIDIA GTX, LambdaLabs, etc)':
        'DeepLearning WS',
    'A personal computer or laptop':
        'Laptop',
    'None':
        'None',
    'Other':
        'Other',
}
q['Q11'] = q['Q11'].replace(mapper)

# Bars grouped side by side, one colour per Prof_Stud value.
fig = px.bar(q, x='Q11', y='counts', color='Prof_Stud', barmode='group')
fig.show()

In [None]:
# Sunburst of the same counts: Prof_Stud in the inner ring, Q11 in the outer ring.
ring_levels = ['Prof_Stud', 'Q11']
fig = px.sunburst(data_frame=q, path=ring_levels, values='counts')
fig.show()

In [None]:
# Merge the value counts of every Q12 column into {answer: count}.
q12_columns = df.columns[df.columns.str.contains('Q12', regex=False)].tolist()
q = {}
for column in q12_columns:
    column_counts = df[column].value_counts()
    q.update(dict(column_counts))

q = pd.DataFrame(list(q.items()), columns=['hardware', 'counts'])

# Donut chart of the counts.
hardware_pie = go.Pie(labels=q['hardware'], values=q['counts'], hole=.3)
fig = go.Figure(hardware_pie)

# Label and percentage on the slices, count and percentage on hover.
slice_outline = dict(line=dict(color='#000000', width=.7))
fig.update_traces(
    hoverinfo='value+percent',
    textinfo='label+percent',
    textfont_size=13,
    marker=slice_outline,
)
# NOTE(review): the yaxis category order is kept as in the original; a pie
# trace presumably ignores it - confirm before removing.
hardware_order = ['GPUs', 'TPUs', 'Other', 'None']
fig.update_layout(
    title='Kagglers hardware selection',
    yaxis={'categoryorder': 'array', 'categoryarray': hardware_order},
)
fig.show()