In [None]:
import warnings

import altair as alt
import folium
import numpy as np
import pandas as pd
from folium.plugins import MarkerCluster
from geopy.geocoders import Nominatim

warnings.filterwarnings( 'ignore' )

In [None]:
# Load the survey responses CSV into a DataFrame.
df = pd.read_csv('../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv')

In [None]:
# Preview the first three rows.
df.head(3)

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
 # Percentage of missing values in each column.
 df.isna().sum() / len(df) * 100

# **DATA PREPROCESSING**

In [None]:
# Keep only the completion time and questions Q1-Q6.
# .copy() makes df1 an independent frame, so the later drop / rename / column
# assignments never act on a view of df (the source of SettingWithCopyWarning).
df1 = df[['Time from Start to Finish (seconds)', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6']].copy()

In [None]:
# Readable names for the columns used in the analysis.
cols = {'Time from Start to Finish (seconds)':'Time_start_to_finish', 'Q1':'age', 'Q2':'gender', 'Q3':'country_reside', 
        'Q4':'formal_education', 'Q5':'title_job', 'Q6':'years_writing_code', }

# Drop the first row (label 0), which is not used in the analysis, and rename.
# A new frame is bound instead of mutating in place, and errors='ignore' keeps
# the cell re-runnable once row 0 has already been removed.
df1 = df1.drop(0, axis=0, errors='ignore').rename(columns=cols)

# Shorten the long names of Iran and the UK.
# regex=False: the names contain '.' characters that must be matched literally.
df1['country_reside'] = (
    df1['country_reside']
    .str.replace('Iran, Islamic Republic of...', 'Iran', regex=False)
    .str.replace('United Kingdom of Great Britain and Northern Ireland', 'United Kingdom', regex=False)
)

# Create a new column with the continent of each country.
# Countries missing from this mapping end up with NaN in 'continent'.
map_continent = {'India':'Asia', 'Indonesia':'Asia', 'Pakistan':'Asia', 'Mexico':'North America', 'Russia':'Asia', 'Turkey':'Asia',
                'Australia':'Australia', 'Nigeria':'Africa', 'Greece':'Europe', 'Belgium':'Europe', 'Japan':'Asia', 'Egypt':'Africa',
                'Singapore':'Asia', 'Brazil':'South America', 'Poland':'Europe', 'China':'Asia','Iran':'Asia', 'United States of America':'North America',
                'Italy':'Europe', 'Viet Nam':'Asia', 'Israel':'Asia', 'Peru':'South America', 'South Africa':'Africa', 'Other':'Other','Spain':'Europe', 'Bangladesh':'Asia',
                'United Kingdom':'Europe', 'France':'Europe','Switzerland':'Europe', 'Algeria':'Africa', 'Tunisia':'Africa', 'Argentina':'South America', 'Sweden':'Europe',
                'Colombia':'South America','I do not wish to disclose my location':'I do not wish to disclose my location', 'Canada':'North America','Chile':'South America', 
                'Netherlands':'Europe', 'Ukraine':'Europe', 'Saudi Arabia':'Asia', 'Romania':'Europe','Morocco':'Africa', 'Austria':'Europe', 'Taiwan':'Asia', 'Kenya':'Africa', 'Belarus':'Europe', 
                'Ireland':'Europe','Portugal':'Europe', 'Hong Kong (S.A.R.)':'Asia', 'Denmark':'Europe', 'Germany':'Europe','South Korea':'Asia', 'Philippines':'Asia', 'Sri Lanka':'Asia', 
                'United Arab Emirates':'Asia','Uganda':'Africa', 'Ghana':'Africa', 'Malaysia':'Asia', 'Thailand':'Asia', 'Nepal':'Asia', 'Kazakhstan':'Asia','Ethiopia':'Africa', 'Iraq':'Asia', 
                'Ecuador':'South America', 'Norway':'Europe', 'Czech Republic':'Europe'}

df1['continent'] = df1['country_reside'].map(map_continent)

# **EXPLORATORY DATA ANALYSIS (EDA)**

In [None]:
# Show one randomly chosen row of the prepared frame.
df1.sample()

## **AGE**

In [None]:
# Respondents per age bracket (counts non-null 'gender' answers).
# groupby(..., as_index=False) already yields a DataFrame, so it is not
# wrapped in pd.DataFrame again.
df_age = (
    df1.groupby('age', as_index=False)['gender']
    .count()
    .rename(columns={'gender': 'total'})
)

In [None]:
# Bar chart of respondent counts per age bracket.
# The title spelling is corrected ("KAGGLE COMMUNITY").
alt.Chart(df_age).mark_bar().encode(
    x='age:N',
    y='total:Q',
).properties(
    title='AGE OF PEOPLE IN KAGGLE COMMUNITY',
    width=600,
    height=400,
).configure_title(
    fontSize=20,
    font='Arial',
    color='#264653',
    align='right',
).configure_axis(
    labelFontSize=12,
)

## **FORMAL EDUCATION AND GENDER**

In [None]:
# Leave out respondents who declined to state their gender or education,
# then count answers for every (gender, formal_education) pair.
gender_given = df1['gender'] != 'Prefer not to say'
education_given = df1['formal_education'] != 'I prefer not to answer'
mask = gender_given & education_given
total_formal = (
    df1.loc[mask]
    .groupby(['gender', 'formal_education'], as_index=False)['age']
    .count()
    .rename(columns={'age': 'total'})
)

In [None]:
# Horizontal bars of respondent counts per gender, one facet row per education level.
ax = (
    alt.Chart(total_formal)
    .mark_bar(opacity=0.8)
    .encode(
        x='total:Q',
        y=alt.Y('gender:N', sort='-x'),
        color='formal_education',
        row='formal_education:N',
    )
    .properties(
        title='GENDER AND FORMAL EDUCATION',
        width=300,
        height=300,
        background='#e9ecef',
    )
)

# The styled copy is the cell's displayed value; `ax` itself stays unconfigured.
ax.configure_title(
    fontSize=25, font='Arial', color='black', align='center'
).configure_axis(labelFontSize=15)

## **TITLE JOB**

In [None]:
# Respondent count for each job title (counts non-null 'age' answers).
job_counts = df1.groupby(['title_job'], as_index=False)['age'].count()
title_job = job_counts.rename(columns={'age': 'total'})

In [None]:
# Bar chart of respondent counts per job title, tallest bar first.
# The title spelling is corrected ("COMMUNITY").
alt.Chart(title_job).mark_bar().encode(
    x=alt.X('title_job:N', sort='-y'),
    y=alt.Y('total:Q'),
    color='total:Q',
).properties(
    title='TITLE JOB IN KAGGLE COMMUNITY',
    width=600,
    height=300,
).configure_title(
    fontSize=15,
    font='Arial',
    color='#264653',
    align='right',
).configure_axis(
    labelFontSize=12,
)

## **YEARS EXPERIENCE**

In [None]:
# Respondent count for each coding-experience bracket (counts non-null 'age' answers).
experience_counts = df1.groupby(['years_writing_code'], as_index=False)['age'].count()
years_exp = experience_counts.rename(columns={'age': 'total'})

In [None]:
# Bar chart of respondent counts per coding-experience bracket, tallest bar first.
# The title spelling is corrected ("COMMUNITY").
alt.Chart(years_exp).mark_bar().encode(
    x=alt.X('years_writing_code:N', sort='-y'),
    y=alt.Y('total:Q'),
    color='total:Q',
).properties(
    title='YEARS OF EXPERIENCE IN KAGGLE COMMUNITY',
    width=600,
    height=300,
).configure_title(
    fontSize=15,
    font='Arial',
    color='#264653',
    align='right',
).configure_axis(
    labelFontSize=12,
)

## **COUNTRY**

In [None]:
# Respondents per country of residence (counts non-null 'gender' answers).
# groupby(..., as_index=False) already yields a DataFrame, so it is not
# wrapped in pd.DataFrame again.
df_country = (
    df1.groupby('country_reside', as_index=False)['gender']
    .count()
    .rename(columns={'gender': 'total'})
)

# The ten countries with the most respondents.
df_country_10 = df_country.nlargest(10, 'total')

In [None]:
# Horizontal bars for the ten most represented countries, largest on top.
country_bars = alt.Chart(df_country_10).mark_bar().encode(
    x='total:Q',
    y=alt.Y('country_reside:N', sort='-x'),
    color='total',
)
country_bars = country_bars.properties(
    title='TOTAL BY COUNTRY',
    width=600,
    height=300,
)

# Title and axis styling go on the displayed chart.
country_bars.configure_title(
    fontSize=15, font='Arial', color='black', align='right'
).configure_axis(labelFontSize=12)

## **CONTINENT**

### **Some Macro Analysis**

In [None]:
# Respondent count per continent, leaving out the "Other" and
# undisclosed-location buckets.
mask = ~df1['continent'].isin(['Other', 'I do not wish to disclose my location'])
df1_contnent = (
    df1.loc[mask]
    .groupby('continent', as_index=False)['age']
    .count()
    .rename(columns={'age': 'total'})
)

In [None]:
# Horizontal bars of respondent counts per continent, largest on top.
# The title spelling is corrected ("COMMUNITY").
ax = alt.Chart(df1_contnent).mark_bar().encode(
    x='total:Q',
    y=alt.Y('continent:N', sort='-x'),
    color='total',
).properties(
    title='CONTINENT IN KAGGLE COMMUNITY',
    width=600,
    height=300,
    background='#e9ecef',
)

# The styled copy is the cell's displayed value; `ax` itself stays unconfigured.
ax.configure_title(
    fontSize=15,
    font='Arial',
    color='black',
    align='right',
).configure_axis(
    labelFontSize=12,
)

**THE CHARTS BELOW ARE INTERACTIVE. CLICK AN ENTRY IN THE LEGEND TO HIGHLIGHT IT.**

### **Age by continent**

In [None]:
# Respondent count for every (continent, age) pair, leaving out the "Other"
# and undisclosed-location buckets.
mask = ~df1['continent'].isin(['Other', 'I do not wish to disclose my location'])
aux2 = (
    df1.loc[mask]
    .groupby(['continent', 'age'], as_index=False)['gender']
    .count()
    .rename(columns={'gender': 'total'})
)

In [None]:
# Clicking a legend entry selects that age bracket; bars outside the
# selection are dimmed to 0.2 opacity.
selectage = alt.selection_multi(fields=['age'], bind='legend')

ax = alt.Chart(aux2).mark_bar(opacity=0.7).encode(
    x='total:Q',
    y=alt.Y('continent:N', sort='-x'),
    color='age',
    opacity=alt.condition(selectage, alt.value(1), alt.value(0.2))
).properties(
    title='AGE BY CONTINENT',
    width=600,
    height=300,
    background='#e9ecef',
)

# add_selection and configure_* each return a new chart instead of modifying
# `ax`, so they must be chained into the final expression of the cell;
# a configure_* call whose result is neither assigned nor displayed is lost.
ax.add_selection(
    selectage
).configure_title(
    fontSize=15,
    font='Arial',
    color='black',
    align='right',
).configure_axis(
    labelFontSize=12,
)


### **Gender by Continent**

In [None]:
# Respondent count for every (continent, gender) pair, leaving out the "Other"
# and undisclosed-location buckets.
mask = ~df1['continent'].isin(['Other', 'I do not wish to disclose my location'])
aux3 = (
    df1.loc[mask]
    .groupby(['continent', 'gender'], as_index=False)['age']
    .count()
    .rename(columns={'age': 'total'})
)

In [None]:
# Clicking a legend entry selects that gender; bars outside the selection
# are dimmed to 0.2 opacity.
select = alt.selection_multi(fields=['gender'], bind='legend')

ax1 = alt.Chart(aux3).mark_bar(opacity=0.7).encode(
    x='total:Q',
    y=alt.Y('continent:N', sort='-x'),
    color='gender',
    opacity=alt.condition(select, alt.value(1), alt.value(0.2))
).properties(
    title='GENDER BY CONTINENT',
    width=600,
    height=300,
    background='#e9ecef',
)

# add_selection and configure_* each return a new chart instead of modifying
# `ax1`, so they must be chained into the final expression of the cell;
# a configure_* call whose result is neither assigned nor displayed is lost.
ax1.add_selection(
    select
).configure_title(
    fontSize=15,
    font='Arial',
    color='black',
    align='right',
).configure_axis(
    labelFontSize=12,
)

In [None]:
# Respondent count for every (continent, formal_education) pair, leaving out
# the "Other" and undisclosed-location buckets.
mask = ~df1['continent'].isin(['Other', 'I do not wish to disclose my location'])
aux4 = (
    df1.loc[mask]
    .groupby(['continent', 'formal_education'], as_index=False)['age']
    .count()
    .rename(columns={'age': 'total'})
)

In [None]:
# Clicking a legend entry selects that education level; bars outside the
# selection are dimmed to 0.2 opacity.
selections = alt.selection_multi(fields=['formal_education'], bind='legend')

ax = (
    alt.Chart(aux4)
    .mark_bar(opacity=0.7)
    .encode(
        x='total:Q',
        y=alt.Y('continent:N', sort='-x'),
        color='formal_education',
        opacity=alt.condition(selections, alt.value(1), alt.value(0.2)),
    )
    .properties(
        title='FORMAL EDUCATION BY CONTINENT',
        width=600,
        height=300,
        background='#e9ecef',
    )
)

# Styling and the legend selection are chained so that the displayed chart has both.
ax.configure_title(
    fontSize=15, font='Arial', color='black', align='right'
).configure_axis(labelFontSize=12).add_selection(selections)

# **ASIA MASTERS THE KAGGLE**  LOL

# **MAP**

In [None]:
# Per-country totals without the undisclosed-location bucket, taken as an
# independent copy so that new columns can be added to it.
undisclosed = df_country['country_reside'] == 'I do not wish to disclose my location'
df2 = df_country.loc[~undisclosed].copy()

# Use the plain name "Hong Kong" instead of the survey's label.
df2['country_reside'] = df2['country_reside'].replace({'Hong Kong (S.A.R.)': 'Hong Kong'})

In [None]:
# Geocoding client that get_latitude / getlongitude call through `.geocode(...)`.
# NOTE(review): 'my_email' looks like a placeholder user agent; confirm it identifies this application.
geolocator = Nominatim(user_agent='my_email')
# Geocoding results keyed by (client identity, country name), so that the
# latitude and longitude lookups for one country share a single request.
_GEOCODE_CACHE = {}


def _locate(country, geocoder=None):
    """Geocode `country` and return the result object, or None if unavailable.

    Uses `geocoder` when given, otherwise the notebook-level `geolocator`.
    A lookup that raises is reported as None and is not cached, so a later
    call can retry it.
    """
    client = geolocator if geocoder is None else geocoder
    key = (id(client), country)
    if key not in _GEOCODE_CACHE:
        try:
            _GEOCODE_CACHE[key] = client.geocode(country)
        except Exception:
            # Best effort: a failed lookup becomes a missing value instead of
            # aborting the whole column. KeyboardInterrupt still propagates.
            return None
    return _GEOCODE_CACHE[key]


def get_latitude(country, geocoder=None):
    """Return the latitude of `country`, or NaN when it cannot be geocoded.

    Parameters
    ----------
    country : str
        Name passed to the geocoder.
    geocoder : object, optional
        Any object with a `geocode(name)` method whose result exposes
        `.latitude` / `.longitude` (or is None when nothing matched).
        Defaults to the notebook-level `geolocator`.
    """
    loc = _locate(country, geocoder)
    return np.nan if loc is None else loc.latitude


def getlongitude(country, geocoder=None):
    """Return the longitude of `country`, or NaN when it cannot be geocoded.

    Parameters
    ----------
    country : str
        Name passed to the geocoder.
    geocoder : object, optional
        Any object with a `geocode(name)` method whose result exposes
        `.latitude` / `.longitude` (or is None when nothing matched).
        Defaults to the notebook-level `geolocator`.
    """
    loc = _locate(country, geocoder)
    return np.nan if loc is None else loc.longitude

In [None]:
# Geocode every country name into latitude / longitude columns
# (NaN where the lookup helpers return a missing value).
country_names = df2['country_reside']
df2['lat'] = country_names.apply(get_latitude)
df2['lon'] = country_names.apply(getlongitude)

In [None]:
# Top-10 countries bar chart with a legend-driven highlight; it is embedded
# in the popups of the map that follows.
selection = alt.selection_multi(fields=['country_reside'], bind='legend')

top10_bars = (
    alt.Chart(df_country_10)
    .mark_bar()
    .encode(
        x=alt.X('total:Q'),
        y=alt.Y('country_reside:N', sort='-x'),
        color='country_reside',
        opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
    )
    .properties(title='TOTAL ANSWERS BY COUNTRY')
)
aux10 = top10_bars.configure_axis(labelFontSize=12).add_selection(selection)

In [None]:
# This map was inspired by this beautiful notebook: https://www.kaggle.com/ruchi798/kaggle-ml-ds-survey-analysis

world_map = folium.Map(tiles='cartodbpositron')
marker_cluster = MarkerCluster().add_to(world_map)

# Countries whose geocoding failed carry NaN coordinates, which folium rejects
# as a marker location, so only rows with both coordinates are plotted.
for _, row in df2.dropna(subset=['lat', 'lon']).iterrows():
    # Every marker shows the same top-10 bar chart (aux10) in its popup.
    popup_chart = folium.Popup(max_width=600).add_child(
        folium.VegaLite(aux10, width=500, height=250)
    )

    folium.CircleMarker(
        location=[row['lat'], row['lon']],
        popup=popup_chart,
        fill=True,
    ).add_to(marker_cluster)

world_map