In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import re
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Countries covered by this analysis. The filter below is an exact string match,
# so the spellings must be the ones used in the survey answers (e.g. 'Viet Nam').
SEA_OC = ('Australia', 'Indonesia', 'Singapore', 'Malaysia', 'Viet Nam', 'Philippines',  'Thailand')

# Loads csv
df = pd.read_csv("../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv")

# The first data row holds the question text for every column; keep it as a
# {column name: question text} lookup (used later for plot titles).
QUESTIONS_DICT = df.iloc[0].to_dict()

# Remove the question-text row so that only actual responses remain.
df = df.drop(index=0)

# Keep only respondents from the selected countries. The country column is
# addressed by name ('Q3'), the same way the rest of the notebook refers to it,
# rather than by a positional index that silently breaks if column order changes.
# .copy() detaches the subset from df so later edits cannot hit a view.
seaoc_df = df[df['Q3'].isin(SEA_OC)].copy()
seaoc_df.head()

# Kaggle Survey 2021 EDA : Southeast Asia + Australia

In [None]:
# Number of respondents per country. The country column is selected by name
# ('Q3') instead of by position, and the figure gets a title and axis labels
# so it can be read on its own.
ax = seaoc_df['Q3'].value_counts().plot(kind='bar')
ax.set_title('Respondents per Country in SEA & Oceania 2021')
ax.set_xlabel('Country')
ax.set_ylabel('Number of respondents')
None

**Q4 : Education Level**

* Bachelor's vs Master's
    
The majority of Kagglers in SEA & OC hold a Bachelor's or Master's degree\
In Australia and Thailand, there are more Master's graduates than Bachelor's graduates (with less than a 15% difference)\
Meanwhile, in the other SEA countries, there are significantly more Bachelor's graduates than Master's graduates
    
    
* Doctoral
 
More than 10% of respondents hold a Doctoral degree in Australia, Malaysia, and Singapore.
In the other countries, the share is lower.

In [None]:
# Count respondents per (country, education level). The unstack/stack round trip
# with fill_value=0 gives every country a row for every level, so the wedge
# order is identical in every pie and one shared legend is valid.
edulvl_df = (
    seaoc_df.groupby(['Q3', 'Q4']).size()
    .unstack(fill_value=0)
    .stack()
    .reset_index(name='Sum')
)

# Legend labels: the distinct education levels in the order they occur for a
# country. Derived from the data instead of assuming there are exactly 7 levels.
EDUCATION_LEVEL = edulvl_df['Q4'].drop_duplicates()

fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(15,8))
fig.suptitle('Education Level of Kagglers from SEA & Oceania 2021', fontsize=20)

for ax, country in zip(axes.flat, SEA_OC):
    country_counts = edulvl_df.loc[edulvl_df['Q3'] == country, 'Sum']
    # ax.pie returns (wedges, label texts, percentage texts); the wedges of the
    # last pie are reused as legend handles below. Slices of 5% or less are
    # left unlabelled to keep the charts readable.
    artists = ax.pie(country_counts, autopct=lambda pct: ('%.2f%%' % pct) if pct > 5 else '')
    ax.set(ylabel='', title=country, aspect='equal')

# Hide every grid cell that did not receive a country (the legend sits there).
for unused_ax in axes.ravel()[len(SEA_OC):]:
    unused_ax.axis('off')

fig.legend(artists[0], EDUCATION_LEVEL, bbox_to_anchor=(0.875,0.325), loc='center')
None


# Helper: row normalization

In [None]:
# Normalizer
def normalize(row):
    """Scale ``row`` so that its entries sum to 1.

    Parameters
    ----------
    row : pandas.Series, numpy.ndarray or other iterable of numbers
        The values to rescale (e.g. one row of a count table).

    Returns
    -------
    Same kind as the input for Series/ndarray (labels are preserved for a
    Series); a plain list for any other iterable. Each entry is the original
    value divided by the sum of all entries.

    Notes
    -----
    * The input is not modified; a new object is returned.
    * The division is vectorised, so it does not rely on integer keys being
      treated as positions on a label-indexed Series, and integer counts are
      not truncated.
    * A zero total is not special-cased.
    """
    total = np.sum(row)

    if isinstance(row, (pd.Series, np.ndarray)):
        return row / total

    # Generic iterables (e.g. lists) do not support element-wise division.
    return [value / total for value in row]

**Professions**

Less than 20% of Kagglers in Australia and Singapore are students\
5-10% are unemployed

In [None]:
# Country x Profession table of proportions: crosstab counts the
# (country, profession) pairs and normalize='index' divides each row by its
# own total, so every country row sums to 1. Missing answers are left out.
temp_df = pd.crosstab(seaoc_df['Q3'], seaoc_df['Q5'], normalize='index')

# Rearrange columns. crosstab returns the column labels sorted, and the
# positions below index into that sorted list (they assume 15 distinct answers).
cols = temp_df.columns.tolist()

new_cols = [cols[1], cols[8], cols[12], cols[2], cols[4], cols[7], cols[5], cols[11], cols[13], cols[3],
            cols[6], cols[0], cols[9], cols[10], cols[-1]]

temp_df = temp_df[new_cols]

# Rearrange indexes
temp_df = temp_df.reindex(['Australia', 'Singapore', 'Indonesia', 'Malaysia', 'Viet Nam', 'Philippines',  'Thailand'])

# Plot as Heatmap
fig, ax = plt.subplots(figsize=(15,5))
sns.heatmap(temp_df, annot=True, ax=ax, fmt='.2f', cmap='YlGnBu')
ax.set_title('Profession Proportion in SEA & Oceania Countries 2021', pad=30, fontsize=20)
plt.xticks(rotation=60)
# Trailing None keeps the tick objects from being echoed as cell output.
None

**Q6 Coding Experience**

* Most Kagglers in SEA are quite new to coding (<5 years of experience); however, in Australia and Singapore experience is more evenly distributed. Notably, 22% of Australian Kagglers have >20 years of coding experience!

In [None]:
# Country x Q6 table of proportions: crosstab counts the (country, answer)
# pairs and normalize='index' divides each row by its own total, so every
# country row sums to 1. Missing answers are left out.
temp_df = pd.crosstab(seaoc_df['Q3'], seaoc_df['Q6'], normalize='index')

# Rearrange columns. crosstab returns the column labels sorted as strings; the
# positions below pick them out of that sorted list in the order wanted on the
# x-axis (they assume 7 distinct answers).
cols = temp_df.columns.tolist()

new_cols = [cols[-1], cols[-2], cols[0]] + cols[3:5] + cols[1:3]

temp_df = temp_df[new_cols]

# Rearrange indexes
temp_df = temp_df.reindex(['Australia', 'Singapore', 'Indonesia', 'Malaysia', 'Viet Nam', 'Philippines',  'Thailand'])

# Plot as Heatmap
fig, ax = plt.subplots(figsize=(7,5))
sns.heatmap(temp_df, annot=True, ax=ax, fmt='.2f', cmap='YlGnBu')
ax.set_title('Coding Experience Proportion in SEA & Oceania Countries 2021', pad=20)
plt.xticks(rotation=60)
# Trailing None keeps the tick objects from being echoed as cell output.
None

**Q20: Industry**

* Most respondents come from the Academic and Tech industries
* Thai and Australian Kagglers are spread across a wider range of industries

In [None]:
# Country x Q20 table of proportions: crosstab counts the (country, answer)
# pairs and normalize='index' divides each row by its own total, so every
# country row sums to 1. Missing answers are left out.
# Columns stay in crosstab's default (sorted) order; no reordering is applied.
temp_df = pd.crosstab(seaoc_df['Q3'], seaoc_df['Q20'], normalize='index')

# Rearrange indexes
temp_df = temp_df.reindex(['Australia', 'Singapore', 'Indonesia', 'Malaysia', 'Viet Nam', 'Philippines',  'Thailand'])

# Plot as Heatmap, titled with the question text of Q20
fig, ax = plt.subplots(figsize=(15,5))
sns.heatmap(temp_df, annot=True, ax=ax, fmt='.2f', cmap='YlGnBu')
ax.set_title(QUESTIONS_DICT['Q20'], pad=20)
plt.xticks(rotation=90)
# Trailing None keeps the tick objects from being echoed as cell output.
None

**Q21: Company Size**

* In Indonesia and Viet Nam, respondents are mostly employed in small to mid-sized companies (<250 employees)  
* In the other countries, respondents are split between small (0-49 employees) and large (1,000-9,999 employees) companies

In [None]:
# Country x Q21 table of proportions: crosstab counts the (country, answer)
# pairs and normalize='index' divides each row by its own total, so every
# country row sums to 1. Missing answers are left out.
temp_df = pd.crosstab(seaoc_df['Q3'], seaoc_df['Q21'], normalize='index')

# Rearrange columns. crosstab returns the column labels sorted as strings; the
# positions below pick them out of that sorted list in the order wanted on the
# x-axis (they assume 5 distinct answers).
cols = temp_df.columns.tolist()
new_cols = [cols[0], cols[-1], cols[-2], cols[2], cols[1]]
temp_df = temp_df[new_cols]

# Rearrange indexes
temp_df = temp_df.reindex(['Australia', 'Singapore', 'Indonesia', 'Malaysia', 'Viet Nam', 'Philippines',  'Thailand'])

# Plot as Heatmap, titled with the question text of Q21
fig, ax = plt.subplots(figsize=(15,5))
sns.heatmap(temp_df, annot=True, ax=ax, fmt='.2f', cmap='YlGnBu')
ax.set_title(QUESTIONS_DICT['Q21'], pad=20)
plt.xticks(rotation=40)
# Trailing None keeps the tick objects from being echoed as cell output.
None

**Q22: Data Science Team Size**

* 37% of Singaporean respondents work in a big Data Science team
* Respondents in the other countries work in smaller teams

In [None]:
# Country x Q22 table of proportions: crosstab counts the (country, answer)
# pairs and normalize='index' divides each row by its own total, so every
# country row sums to 1. Missing answers are left out.
temp_df = pd.crosstab(seaoc_df['Q3'], seaoc_df['Q22'], normalize='index')

# Rearrange columns. crosstab returns the column labels sorted as strings; the
# slices below regroup that sorted list into the order wanted on the x-axis
# (they assume 7 distinct answers).
cols = temp_df.columns.tolist()
new_cols = cols[:2] + cols[5:] + cols[2:5]
temp_df = temp_df[new_cols]

# Rearrange indexes
temp_df = temp_df.reindex(['Australia', 'Singapore', 'Indonesia', 'Malaysia', 'Viet Nam', 'Philippines',  'Thailand'])

# Plot as Heatmap, titled with the question text of Q22
fig, ax = plt.subplots(figsize=(15,5))
sns.heatmap(temp_df, annot=True, ax=ax, fmt='.2f', cmap='YlGnBu')
ax.set_title(QUESTIONS_DICT['Q22'], pad=20)
plt.xticks(rotation=0)
# Trailing None keeps the tick objects from being echoed as cell output.
None

In [None]:
# Distinct Q25 answers present among the selected respondents, in the sorted
# order groupby gives its group keys.
interval = seaoc_df.groupby('Q25').size().index.tolist()
interval