In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect every file path under the Kaggle input directory, then print one per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Read Data

In [None]:
# header=1 takes the column labels from the second row of the file (the first row is skipped).
SURVEY_CSV = '../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv'
df = pd.read_csv(SURVEY_CSV, header=1)

In [None]:
# Preview the first five responses.
df.head(n=5)

## Countries that participated in the Kaggle Survey

In [None]:
# Distinct values of the country-of-residence answer, in order of first appearance.
pd.unique(df['In which country do you currently reside?'])

## Renaming country names and a few verbose answer labels

In [None]:
# Shorten verbose survey answers so they fit on plot axes and legends.
# A single mapping is applied in one pass instead of twelve successive
# full-frame replace calls (each of which scanned and copied the whole frame).
# None of the new labels is itself a key, so the result matches the chained form.
LABEL_REPLACEMENTS = {
    'United States of America': 'USA',
    'United Kingdom of Great Britain and Northern Ireland': 'UK',
    'Iran, Islamic Republic of...': 'Iran',
    'No formal education past high school': 'High_School',
    'Some college/university study without earning a bachelor’s degree': 'Private_College',
    'I prefer not to answer': 'Prefer_not_answer',
    'A cloud computing platform (AWS, Azure, GCP, hosted notebooks, etc)': 'Cloud_Platform',
    'A deep learning workstation (NVIDIA GTX, LambdaLabs, etc)': 'GPU',
    'A personal computer / desktop': 'PC/Desktop',
    'Cloud-certification programs (direct from AWS, Azure, GCP, or similar)': 'Certification_Program',
    'University Courses (resulting in a university degree)': 'University_Course',
    'Podcasts (Chai Time Data Science, O’Reilly Data Show, etc)': 'Podcasts',
}
df = df.replace(LABEL_REPLACEMENTS)

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
# Ten-colour palette shared by the seaborn charts below.
colors = [
    "#f94144", "#f3722c", "#f8961e", "#f9844a", "#f9c74f",
    "#90be6d", "#43aa8b", "#4d908e", "#577590", "#277da1",
]

# Global seaborn/matplotlib theme: same background for axes and figure, 16x8 default size.
theme_rc = {
    'axes.facecolor': '#C1BFB5',
    'figure.facecolor': '#C1BFB5',
    'figure.figsize': (16, 8),
}
sns.set(palette=colors, font='Serif', style='white', rc=theme_rc)

# Show the palette as a strip of swatches.
sns.palplot(colors, size=2)


In [None]:
# The map only needs to know WHICH countries appear, so pass one row per country
# rather than one row per respondent. drop_duplicates keeps first-appearance order,
# so the legend order is the same as when the full frame is passed.
country_col = 'In which country do you currently reside?'
px.choropleth(
    data_frame=df[[country_col]].drop_duplicates(),
    locations=country_col,
    locationmode='country names',
    color=country_col,
    title='Countries participated in Kaggle Survey',
)

In [None]:
country_col = 'In which country do you currently reside?'

# Rank countries by their number of non-null 'Duration (in seconds)' entries and keep the top 15.
responses_per_country = (
    df.groupby(by=[country_col])
    .count()
    .sort_values(by='Duration (in seconds)', ascending=False)
)
top15_order = responses_per_country.head(15).index

sns.countplot(data=df, x=country_col, order=top15_order, palette=colors)
# The annotation text and coordinates are hard-coded; the arrow points at the first bar (x=0).
plt.annotate(
    text='India is in top rank in Survey Response',
    xy=(0, 5500),
    xytext=(1, 6000),
    arrowprops=dict(arrowstyle='->', color='black'),
)
plt.title('Kaggle Survey response by top 15 Countries', fontsize=20)
sns.despine()

In [None]:
# Competition ranking table for the top 1000 Kagglers.
TOP_KAGGLERS_CSV = '../input/top-1000-kagglers/top_1000_competitions_2021_11.csv'
df_t = pd.read_csv(TOP_KAGGLERS_CSV)

In [None]:

# Order countries by how many ranked entries they have and keep the ten largest.
entries_per_country = df_t.groupby(by='country').count().sort_values(by='rank', ascending=False)
top10_order = entries_per_country.head(10).index

ax = sns.countplot(data=df_t, x='country', order=top10_order)
# The annotation text and coordinates are hard-coded; the arrow points at the fourth bar (x=3).
ax.annotate(
    text='India is at 4th position in over all ranking',
    xy=(3, 50),
    xytext=(3, 100),
    arrowprops=dict(arrowstyle='->', color='black'),
    fontsize=15,
)
ax.set_title('Countires Rank by top 1000 Kagglers (Competition Ranking)', fontsize=20)
sns.despine()

## Top 15 countries survey data as sample

In [None]:
country_col = 'In which country do you currently reside?'

# The 15 countries with the most responses. Built straight from the value_counts index:
# the previous `.to_frame().reset_index()['index']` lookup raises KeyError on pandas >= 2.0,
# where the reset column is named after the source column rather than 'index'.
# The Series keeps the name 'index' and a 0..14 RangeIndex, as before.
top_countries = pd.Series(df[country_col].value_counts().head(15).index, name='index')

# .copy() makes df1 an independent frame, so later column assignments on it
# do not trigger SettingWithCopyWarning.
df1 = df[df[country_col].isin(top_countries)].copy()

# First slice = responses from all other countries, second slice = the top-15 sample.
# (The first label used to read 'DataFrame length', which misdescribed len(df) - len(df1).)
plt.pie(
    x=[len(df) - len(df1), len(df1)],
    labels=['Remaining countries', 'Top 15 Countries length'],
    explode=(0.1, 0),
    autopct='%.0f%%',
)
plt.title('Sampling from dataset with Top 15 countries', fontsize=20);

***The top 15 countries cover almost 71% of the overall data set, so let us use them as the sample data for further analysis.***

# Gender

In [None]:
# Share of responses per gender answer in the top-15 sample.
# Counts and labels come directly from the value_counts Series, so the plot does not
# depend on the column names produced by reset_index (these changed in pandas 2.0).
gender_counts = df1['What is your gender? - Selected Choice'].value_counts()
plt.pie(x=gender_counts.values, labels=gender_counts.index, autopct='%.2f%%')
plt.title('Survey response by Gender', fontsize=20);

***In the sample dataset, 78% of the survey responses come from men and 19% from women.***

In [None]:
# displot is a figure-level function: it creates its own figure, so a preceding
# plt.figure(...) only produced an extra empty figure. Size is set with height/aspect
# instead (8 in tall, aspect 2 -> 16 in wide, the size the old plt.figure call asked for).
g = sns.displot(
    df1,
    x='In which country do you currently reside?',
    hue='What is your gender? - Selected Choice',
    multiple='stack',
    height=8,
    aspect=2,
)
g.ax.set_title('Survey response by Gender by Country', fontsize=20);

# Duration

In [None]:
# Select the duration column BEFORE aggregating: calling .mean() on the whole grouped
# frame tries to average the text columns as well, which raises TypeError on pandas >= 2.0.
type(df1.groupby(by='What is your gender? - Selected Choice')['Duration (in seconds)'].mean())

In [None]:
gender_col = 'What is your gender? - Selected Choice'
duration_col = 'Duration (in seconds)'

# Mean survey duration per gender. The column is selected before .mean() so that only
# the numeric duration is averaged (whole-frame .mean() raises on text columns in pandas >= 2.0).
mean_duration = df1.groupby(by=gender_col)[duration_col].mean()

ax = sns.kdeplot(data=df1, x=duration_col, hue=gender_col)
# The Series repr is embedded as text; 'axes points' places it relative to the axes' lower-left corner.
ax.annotate(
    xy=(600, 50),
    text='Statics of Duration feature by Gender : \n\n{}'.format(mean_duration),
    xycoords='axes points',
);


In [None]:
duration_col = 'Duration (in seconds)'
gender_col = 'What is your gender? - Selected Choice'
note_box = dict(boxstyle="round4", fc="w")

ax = sns.boxplot(data=df1, x=duration_col, y=gender_col)
# Clip the x-axis to 0-10000 s so the boxes are readable.
ax.set_xlim(0, 10000)
# Hard-coded note drawn in a rounded white box at data coordinates (4000, 4).
ax.annotate(
    text='Distribution of Durtion for survey is between 0 to 2000 sec ',
    xy=(4000, 4),
    xytext=(4000, 4),
    weight='bold',
    bbox=note_box,
)
ax.set_title('Duration by Gender', fontsize=20)
sns.despine()

In [None]:
# Survey duration per country; the x-axis is clipped to 0-4000 s.
duration_col = 'Duration (in seconds)'
country_col = 'In which country do you currently reside?'

ax = sns.boxplot(data=df1, y=country_col, x=duration_col, palette=colors)
ax.set_xlim(0, 4000)
ax.set_title("Distribution time taken to complete the survey", fontsize=20)
sns.despine()

***Except for Nigeria, the survey response duration for most countries is between 0 and 2000 seconds. There are outliers in the dataset.***

# Age

In [None]:
from pandas.api.types import CategoricalDtype

age_col = 'What is your age (# years)?'

# Ordered categorical so plots list the age bands from youngest to oldest.
# Any value not in this list becomes NaN after the cast.
age_order = CategoricalDtype(
    ['18-21', '22-24', '25-29', '30-34', '35-39', '40-44',
     '45-49', '50-54', '55-59', '60-64', '65-69', '70+'],
    ordered=True,
)

# Rebind df1 to a new frame instead of assigning into it: when df1 is a filtered
# slice of df, in-place column assignment triggers SettingWithCopyWarning.
df1 = df1.assign(**{age_col: df1[age_col].astype(age_order)})

In [None]:
age_col = 'What is your age (# years)?'

ax = sns.countplot(data=df1, x=age_col)
# The annotation text and coordinates are hard-coded; the arrow points at the first bar (x=0).
arrow_style = dict(arrowstyle='->', connectionstyle="angle3", color='black')
ax.annotate(
    text='Highest number of Kaggler are between 18-21',
    xy=(0, 3000),
    xytext=(1, 4000),
    arrowprops=arrow_style,
)
ax.set_title('Kagglers by Age Category', fontsize=20)
sns.despine()

In [None]:
# Response counts per country, split into one bar per age band.
country_col = 'In which country do you currently reside?'
age_col = 'What is your age (# years)?'

ax = sns.countplot(data=df1, x=country_col, hue=age_col)
ax.set_title('Kagglers by Age Category by Countries', fontsize=20)
sns.despine()

In [None]:
# Response counts per gender answer, split into one bar per age band.
gender_col = 'What is your gender? - Selected Choice'
age_col = 'What is your age (# years)?'

ax = sns.countplot(data=df1, x=gender_col, hue=age_col)
ax.set_title('Kagglers by Age Category by Gender', fontsize=20)
sns.despine()

In [None]:
# Survey duration per age band; the x-axis is clipped to 0-4000 s.
duration_col = 'Duration (in seconds)'
age_col = 'What is your age (# years)?'

ax = sns.boxplot(data=df1, y=age_col, x=duration_col, palette=colors)
ax.set_xlim(0, 4000)
ax.set_title("Distribution time taken to complete the survey by Age category", fontsize=20)
sns.despine()

# Programming Language

In [None]:
# Show every column and keep wide frames on a single line when they are displayed.
display_options = {
    'display.max_columns': None,
    'display.expand_frame_repr': False,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Short axis labels used by the programming-language bar charts below.
x_val = 'Python R SQL C C++ Java Javascript Julia Swift'.split()


In [None]:
# Non-null answers per language column (columns 7..15 of the frame).
lang_counts = df1.iloc[:, 7:16].count()

# Attach the short labels BEFORE sorting. Previously the bars were sorted by count and
# the tick labels were then overwritten with x_val in its fixed order, so every bar
# except (possibly) the first could carry the wrong language name.
# NOTE(review): assumes x_val lists the languages in the same order as columns 7:16 — confirm.
lang_counts.index = x_val
lang_counts = lang_counts.sort_values(ascending=False)

ax = sns.barplot(x=list(lang_counts.index), y=lang_counts.values)
ax.set_title('Most prefered Programming language', fontsize=20)
sns.despine()

In [None]:
country_col = 'In which country do you currently reside?'

fig, ax = plt.subplots(nrows=3, ncols=5, figsize=(16, 16), constrained_layout=True)
ax = ax.flatten()
fig.suptitle('Programming language prefernce by Country', fontsize=20)

# One panel per top country, bars sorted by count within each panel.
for panel_no, country in enumerate(top_countries):
    lang_counts = df1.loc[df1[country_col] == country].iloc[:, 7:16].count()
    # Label first, sort second: the old code sorted the bars and then pasted x_val on
    # the ticks in its fixed order, which mislabelled the bars in every panel.
    # NOTE(review): assumes x_val matches the order of columns 7:16 — confirm.
    lang_counts.index = x_val
    lang_counts = lang_counts.sort_values(ascending=False)

    panel = ax[panel_no]
    sns.barplot(x=list(lang_counts.index), y=lang_counts.values, ax=panel)
    panel.tick_params(axis='x', rotation=90)
    panel.set_xlabel('')
    panel.set_ylabel('')
    panel.set_title(country, fontsize=15)

# Hide any panel that did not receive a country.
for unused in ax[len(top_countries):]:
    unused.set_visible(False)
sns.despine()


    

***In all of the top responding countries, the order of programming language preference is similar. Python ranks first, followed by R, SQL, C, C++ etc. Interestingly, Java and JS are the least preferred when it comes to Data Science/Machine Learning.***

In [None]:
age_col = 'What is your age (# years)?'

# Age bands ordered by number of responses. Taken from the value_counts index directly:
# `.reset_index()['index']` raises KeyError on pandas >= 2.0.
age_groups = df1[age_col].value_counts().index

fig, ax = plt.subplots(nrows=2, ncols=6, figsize=(16, 16), constrained_layout=True)
ax = ax.flatten()
fig.suptitle('Programming language prefernce by Age Category', fontsize=20)

for panel_no, age_group in enumerate(age_groups):
    lang_counts = df1.loc[df1[age_col] == age_group].iloc[:, 7:16].count()
    # Label first, sort second: the old code sorted the bars and then pasted x_val on
    # the ticks in its fixed order, which mislabelled the bars in every panel.
    # NOTE(review): assumes x_val matches the order of columns 7:16 — confirm.
    lang_counts.index = x_val
    lang_counts = lang_counts.sort_values(ascending=False)

    panel = ax[panel_no]
    sns.barplot(x=list(lang_counts.index), y=lang_counts.values, ax=panel)
    panel.tick_params(axis='x', rotation=90)
    panel.set_xlabel('')
    panel.set_ylabel('')
    panel.set_title(age_group, fontsize=15)

# Hide any panel that did not receive an age band.
for unused in ax[len(age_groups):]:
    unused.set_visible(False)
sns.despine()

***Surprisingly, Python is the favourite programming language across all age groups.***

In [None]:
gender_col = 'What is your gender? - Selected Choice'

# Distinct gender answers; NaN is dropped because `== NaN` never matches and would
# only produce an empty panel.
genders = df1[gender_col].dropna().unique()

fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(16, 16), constrained_layout=True)
ax = ax.flatten()
fig.suptitle('Programming language prefernce by Gender', fontsize=20)

for panel_no, gender in enumerate(genders):
    lang_counts = df1.loc[df1[gender_col] == gender].iloc[:, 7:16].count()
    # Label first, sort second: the old code sorted the bars and then pasted x_val on
    # the ticks in its fixed order, which mislabelled the bars in every panel.
    # NOTE(review): assumes x_val matches the order of columns 7:16 — confirm.
    lang_counts.index = x_val
    lang_counts = lang_counts.sort_values(ascending=False)

    panel = ax[panel_no]
    sns.barplot(x=list(lang_counts.index), y=lang_counts.values, ax=panel)
    panel.tick_params(axis='x', rotation=90)
    panel.set_xlabel('')
    panel.set_ylabel('')
    panel.set_title(gender, fontsize=15)

# The grid has six slots; hide the ones left without a gender answer.
for unused in ax[len(genders):]:
    unused.set_visible(False)
sns.despine()

***Again, Python is the preferred programming language for all genders.***

# Most Recommended Programming Language

In [None]:
recommend_col = 'What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice'

# Two-column frame: 'index' holds the language, `recommend_col` holds its count.
# The column names are pinned explicitly because a bare value_counts().reset_index()
# yields different names on pandas < 2.0 and pandas >= 2.0.
pie_df = (
    df1[recommend_col]
    .value_counts()
    .rename_axis('index')
    .reset_index(name=recommend_col)
)

In [None]:
# pie_df is read by position (column 0 = language label, column 1 = count) so the plot
# does not depend on the pandas-version-specific column names from reset_index.
# Only the first (largest) slice is pulled out; the explode list is sized from the data
# instead of a hard-coded 13 entries, which fails when the number of languages differs.
slice_offsets = [0.2 if position == 0 else 0 for position in range(len(pie_df))]
plt.pie(
    x=pie_df.iloc[:, 1],
    labels=pie_df.iloc[:, 0],
    autopct='%.2f%%',
    explode=slice_offsets,
)
plt.title('Most recommended programming language', fontsize=20);