In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import geopandas as gpd

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn.
warnings.filterwarnings(action="ignore")

# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the survey responses into `data` and preview the first rows.
survey_csv_path = '/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv'
data = pd.read_csv(survey_csv_path)
data.head()

In [None]:
# Remove the row labelled 0 (presumably the question-text row that sits under
# the CSV header — confirm against the preview above).
# Rebinding with errors='ignore' keeps this cell idempotent: the original
# in-place drop raised KeyError when the cell was run a second time because
# label 0 no longer existed.
data = data.drop(index=0, errors='ignore')

In [None]:
# Show the first remaining row to check the result of dropping row 0.
data.head(1)

In [None]:
# Display every column label; the plots below refer to columns Q1, Q2 and Q3.
data.columns

In [None]:
# Age-bracket labels for the x-axis of the age plot, in ascending order.
# Plain string sorting is enough here: every label starts with a two-digit
# age, so lexicographic order matches numeric order.
order = sorted((
    '35-39', '30-34', '22-24', '25-29', '18-21', '55-59',
    '50-54', '40-44', '60-69', '45-49', '70+',
))

In [None]:
# Two stacked panels: age distribution (Q1) on top, gender (Q2) below.
fig, ax = plt.subplots(2, 1, figsize=(15, 18))
age_ax, gender_ax = ax

# --- Age panel -------------------------------------------------------------
age_counts = data['Q1'].value_counts().sort_index()
# Share of the first three sorted brackets (assumed to be 18-21, 22-24 and
# 25-29), as a whole-number percentage.
share_18_29 = int(round(age_counts.iloc[:3].sum() / age_counts.sum() * 100, 0))

sns.countplot(data=data, x='Q1', order=order, color='grey', ax=age_ax)
age_ax.grid(axis='y', color='grey', linewidth=0.5, alpha=0.3)

# One annotate call per text line; y positions are in data (count) units.
age_notes = (
    'There is an increasing curve from ages 18-29 and',
    'then declines up to some survey takers at an age',
    'of 70+. Ages 18-29 makes up a total of ' + str(share_18_29) + '% of the',
    'survey takers.',
)
for note, y in zip(age_notes, (3500, 3300, 3100, 2900)):
    age_ax.annotate(note, xy=(5, y), fontsize=15)

age_ax.set(xlabel='Age group', ylabel='No. of survey takers')
age_ax.title.set_text('What is the age group?')

# --- Gender panel ----------------------------------------------------------
gender_counts = data['Q2'].value_counts()
# value_counts() sorts descending, so the first entry is the largest group.
top_gender_share = int(round(gender_counts.iloc[:1].sum() / gender_counts.sum() * 100, 0))

sns.countplot(data=data, x='Q2', color='grey', ax=gender_ax)
gender_ax.grid(axis='y', color='grey', linewidth=0.5, alpha=0.3)

gender_notes = (
    'There are clearly more men than women, ',
    'composing of ' + str(top_gender_share) + '% of the total.',
)
for note, y in zip(gender_notes, (14000, 13000)):
    gender_ax.annotate(note, xy=(2, y), fontsize=15)

gender_ax.set(xlabel='Gender', ylabel='No. of survey takers')
gender_ax.title.set_text('How about gender?')

fig.suptitle('2020 Kaggle ML & DS Survey', fontsize=30, color='black')
plt.show()

In [None]:
# Respondent counts per country (column Q3) as a one-column DataFrame indexed
# by country name, ready to be joined onto the world shapes.
#
# 'Republic of Korea' and 'South Korea' are two labels for the same country,
# so they are folded into one before counting. Counted separately, the
# 'Republic of Korea' respondents are attributed to whichever shape is renamed
# to that label rather than to South Korea.
country_answers = data['Q3'].replace({'Republic of Korea': 'South Korea'})

# Pin the column name to 'Q3' and clear the index name explicitly: pandas >= 2.0
# names the value_counts() result 'count' (and the index 'Q3'), which would
# break the later plot call that selects column 'Q3'.
data_by_country = (
    country_answers
    .value_counts()
    .rename('Q3')
    .rename_axis(None)
    .to_frame()
)

In [None]:
# Read the world-map shapefile and draw it once as a quick sanity check.
countries_file = gpd.read_file('/kaggle/input/shpfile/World_Map.shp')

_, world_ax = plt.subplots(figsize=(15, 9))
countries_file.plot(ax=world_ax)
plt.show()

In [None]:
# Rename shapefile country names to the spellings used in the survey answers,
# so the two sources can be joined by name.
# NOTE(review): the third entry gives North Korea's shape the label
# 'Republic of Korea', which is the formal name of South Korea — confirm this
# is intended, otherwise those respondents are drawn on the wrong country.
shape_to_survey_name = {
    'Iran (Islamic Republic of)': 'Iran, Islamic Republic of...',
    'United States': 'United States of America',
    "Korea, Democratic People's Republic of": 'Republic of Korea',
    'Korea, Republic of': 'South Korea',
    'United Kingdom': 'United Kingdom of Great Britain and Northern Ireland',
}

# Applied one at a time, in the listed order, across the whole frame.
for shape_name, survey_name in shape_to_survey_name.items():
    countries_file.replace(shape_name, survey_name, inplace=True)

In [None]:
# Sanity check: list the survey countries (Q3) that have no shape with the
# same NAME in the shapefile, i.e. the ones that cannot be matched by name.
countries = data['Q3'].unique().tolist()
countries_shape_countries = countries_file['NAME'].tolist()

countries_not_in_countries_shape_countries = [
    country
    for country in countries
    if country not in countries_shape_countries
]

print(countries_not_in_countries_shape_countries)

In [None]:
# Attach the per-country counts to the shapes by matching the shapefile's NAME
# column against the index of data_by_country.
# how='right' keeps every row of data_by_country: shapes with no count are
# dropped, and counted countries with no matching shape remain without geometry.
merge = countries_file.join(data_by_country, on = 'NAME', how = 'right')

In [None]:
# Three stacked panels: age distribution (Q1), gender (Q2) and a world map of
# respondents per country (Q3).
fig, ax = plt.subplots(3, 1, figsize=(15, 33))
age_ax, gender_ax, map_ax = ax

# --- Age panel -------------------------------------------------------------
age_counts = data['Q1'].value_counts().sort_index()
# Share of the first three sorted brackets (assumed to be 18-21, 22-24 and
# 25-29), as a whole-number percentage.
share_18_29 = int(round(age_counts.iloc[:3].sum() / age_counts.sum() * 100, 0))

sns.countplot(data=data, x='Q1', order=order, color='grey', ax=age_ax)
age_ax.grid(axis='y', color='grey', linewidth=0.5, alpha=0.3)

# One annotate call per text line; y positions are in data (count) units.
age_notes = (
    'There is an increasing curve from ages 18-29 and',
    'then declines up to some survey takers at an age',
    'of 70+. Ages 18-29 makes up a total of ' + str(share_18_29) + '% of the',
    'survey takers.',
)
for note, y in zip(age_notes, (3500, 3300, 3100, 2900)):
    age_ax.annotate(note, xy=(5, y), fontsize=15)

age_ax.set(xlabel='Age group', ylabel='No. of survey takers')
age_ax.title.set_text('What is the age group?')

# --- Gender panel ----------------------------------------------------------
gender_counts = data['Q2'].value_counts()
# value_counts() sorts descending, so the first entry is the largest group.
top_gender_share = int(round(gender_counts.iloc[:1].sum() / gender_counts.sum() * 100, 0))

sns.countplot(data=data, x='Q2', color='grey', ax=gender_ax)
gender_ax.grid(axis='y', color='grey', linewidth=0.5, alpha=0.3)

gender_notes = (
    'There are clearly more men than women, ',
    'composing of ' + str(top_gender_share) + '% of the total.',
)
for note, y in zip(gender_notes, (14000, 13000)):
    gender_ax.annotate(note, xy=(2, y), fontsize=15)

gender_ax.set(xlabel='Gender', ylabel='No. of survey takers')
gender_ax.title.set_text('How about gender?')

# --- Map panel -------------------------------------------------------------
# Draw onto the third subplot. Without ax=..., geopandas opens a separate
# figure and the third panel of `fig` stays empty. The panel size now comes
# from `fig`, so no figsize is passed here.
merge.plot(
    column='Q3',
    cmap='Greys',
    legend=True,
    scheme='user_defined',
    classification_kwds={'bins': [100, 200, 300, 400, 500, 600, 1000, 2000, 6000]},
    edgecolor='black',
    linewidth=0.4,
    ax=map_ax,
)

map_ax.set_title('Respondents by country', fontdict={'fontsize': 20}, pad=12.5)
map_ax.set_axis_off()
map_ax.get_legend().set_bbox_to_anchor((0.25, 0.2))
# NOTE(review): 1388 is hard-coded — presumably the number of respondents whose
# Q3 answer has no shape on the map; re-check it if the data changes.
map_ax.annotate('Others not represented in the map: 1388',
                xy=(0, -60), fontsize=15)

fig.suptitle('2020 Kaggle ML & DS Survey', fontsize=30, color='black')
plt.show()