# The challenge objective:

### To tell a data story about a subset of the data science community represented in this survey, through a combination of both narrative text and data exploration.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 

# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!/opt/conda/bin/python3.7 -m pip install --upgrade pip

In [None]:
# Suppress warnings
# NOTE: the 'ignore' filter is installed without a category, so it hides every warning raised
# after this point (including deprecation notices from the libraries used below).
import warnings
warnings.filterwarnings('ignore')

# Import the NumPy and Pandas libraries
import numpy as np
import pandas as pd

# Display all the columns in dataset: max_columns = None removes pandas' limit on rendered columns
from IPython.display import display 
pd.options.display.max_columns = None

# Import libraries to visualize the dataset
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

# Configure the interactive shell so that the value of every expression statement in a cell
# is displayed, not only the last one (ast_node_interactivity = "all")
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"

# Import Profiling package of Pandas
import pandas_profiling


-   **Blue Alert Box: Inference**

<div class="alert alert-block alert-info">
<b>Tip:</b> Use blue boxes for inferences, insights and notes. 
</div>

-   **Yellow Alert Box: Add**

<div class="alert alert-block alert-warning">  
<b>Example:</b> Yellow Boxes are generally used to include additional examples or mathematical formulas.  
</div>

-   **Green Alert Box: Info**

<div class="alert alert-block alert-success">  
Link: Use green box only when necessary such as to display web links to related content.  
</div>

-   **Red Alert Box: Warning**

<div class="alert alert-block alert-danger">  
Alert: It is good to avoid red boxes but can be used to alert users to not delete some important part of code etc.   
</div>

In [None]:
# Load the Kaggle 2020 survey responses and preview the first rows

survey_csv = '../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv'
df = pd.read_csv(survey_csv) # raw responses, exactly as stored in the CSV

df.head() # preview the top of the frame

In [None]:
# Promote the first data row to be the column header and remove it from the data.
# Later cells address columns by these header values (e.g. 'In which country do you currently reside?').
# NOTE: this cell is not idempotent - running it a second time would consume another row as the header.

header = df.iloc[0] # first row of the loaded frame becomes the new header

df = df.iloc[1:] # keep every row after the first one

df.columns = header # install the saved row as the column labels

df.head() # check top five rows of dataframe


In [None]:
df.tail() # display the last five rows of the full survey frame

In [None]:
# Share of respondents per country of residence, expressed in percent

country_fraction = df['In which country do you currently reside?'].value_counts(normalize=True) # fractions summing to 1
country_fraction * 100 # scale the fractions to percentages for display

## Our analysis will be based on the Kaggle survey information in the subset for India and USA.

<div class="alert alert-block alert-info">
<b>Tip:</b> The country breakdown above shows that India and USA are the countries with the most data science enthusiasts (more than 40%).
</div>


In [None]:
# Create and display new dataset for survey data from kaggle users in India

# .copy() makes the subset an independent frame, so the later column drops on df_Ind
# do not operate on a slice of df (which triggers pandas' SettingWithCopyWarning).
df_Ind = df[df['In which country do you currently reside?']=='India'].copy() # filter a subset for India
df_Ind.head()

In [None]:
# Create and display new dataset for survey data from kaggle users in USA

# .copy() makes the subset an independent frame, so the later column drops on df_USA
# do not operate on a slice of df (which triggers pandas' SettingWithCopyWarning).
df_USA = df[df['In which country do you currently reside?']=='United States of America'].copy() # filter a subset for USA
df_USA.head()

In [None]:
df_Ind.shape # (number of rows, number of columns) of the India subset

In [None]:
df_USA.shape # (number of rows, number of columns) of the USA subset

In [None]:
# Count the missing (NaN) values in each column of the India subset

df_Ind.isna().sum()

In [None]:
# Count the missing (NaN) values in each column of the USA subset

df_USA.isna().sum()

In [None]:
# Drop every column of the India subset with more than 3000 missing values
# (the cutoff the author describes as about 60% of the values in a column).
# Columns are selected once from the null counts and dropped with the `columns=` keyword:
# the positional axis form `drop(i, 1, ...)` is rejected by pandas >= 2.0, and dropping
# in place inside a loop mutated the frame while it was being scanned.

max_nulls_ind = 3000
null_counts_ind = df_Ind.isna().sum() # missing values per column
df_Ind = df_Ind.drop(columns=null_counts_ind[null_counts_ind > max_nulls_ind].index)

df_Ind.head()

In [None]:
# Drop every column of the USA subset with more than 1200 missing values
# (the cutoff the author describes as about 60% of the values in a column).
# Columns are selected once from the null counts and dropped with the `columns=` keyword:
# the positional axis form `drop(i, 1, ...)` is rejected by pandas >= 2.0, and dropping
# in place inside a loop mutated the frame while it was being scanned.

max_nulls_usa = 1200
null_counts_usa = df_USA.isna().sum() # missing values per column
df_USA = df_USA.drop(columns=null_counts_usa[null_counts_usa > max_nulls_usa].index)

df_USA.head()

In [None]:
# Report the dimensions of the India subset after the sparse columns were removed

ind_rows, ind_cols = df_Ind.shape # unpack (rows, columns)
print('No. of rows:', ind_rows)
print('No. of columns:', ind_cols)

In [None]:
# Report the dimensions of the USA subset after the sparse columns were removed

usa_rows, usa_cols = df_USA.shape # unpack (rows, columns)
print('No. of rows:', usa_rows)
print('No. of columns:', usa_cols)

In [None]:
# Summary statistics for every column of the India subset
# (include='all' also summarises non-numeric columns: count, unique, top, freq)

df_Ind.describe(include='all') # describe India subset

### The description of subset for India is as follows:

<div class="alert alert-block alert-info">
<b>Inferences:</b>
    1. Most of the survey respondents from India are male students in the 18-21 age group with a Bachelor's degree.
    2. Python is the most used and most recommended language, and Matplotlib is the most used visualization library, for data science on a PC.
    3. Jupyter notebook IDE users in India most frequently have 1-2 years of experience coding in Python.
    4. Seaborn is the second most used visualization library in Python for exploratory data analysis.
    5. Machine Learning is a growing field in data science: most respondents have used ML methods for under 1 year.
    6. Scikit-learn is the most used framework, and Linear or Logistic Regression the most frequently used algorithm, in ML.
    7. A local development environment such as RStudio or Jupyter is the primary tool used at work and school to analyze data.
   
</div>


In [None]:
# Summary statistics for every column of the USA subset
# (include='all' also summarises non-numeric columns: count, unique, top, freq)

df_USA.describe(include='all') # describe USA subset

### The description of subset for USA is as follows:

<div class="alert alert-block alert-info">
<b>Inferences:</b>
    1. Most of the survey respondents from USA are male Data Scientists in the 30-34 age group with a Master's degree.
    2. Python and SQL are the most used and recommended languages, and Matplotlib is the most used visualization library, for data science on a PC.
    3. Jupyter notebook IDE users in USA most frequently have 3-5 years of experience coding in Python.
    4. Companies with 10k or more employees have the most respondents on Kaggle, and these respondents most often work in groups of 20+ members.
    5. Machine Learning is a growing field in data science: most respondents have used ML methods for under 1 year.
    6. Scikit-learn is the most used ML framework in the United States of America.
    7. A local development environment such as RStudio or Jupyter is the primary tool used at work and school to analyze data.
    8. Linear or Logistic Regression and Decision Trees or Random Forests are the most frequently used ML algorithms.
    9. The most common compensation range for survey respondents in USA is 100k to 125k USD.
    10. There is a lot of scope to invest in data science courses in USA, as most of the respondents have not spent anything on data science in the last 5 years.
   
</div>


In [None]:
# Preview the India subset with short, readable column names.
# Keys are the full survey-question headers; values are the short aliases.
column_aliases = {
    'What is your age (# years)?': 'Age',
    'What is your gender? - Selected Choice': 'Gender',
    'In which country do you currently reside?': 'Country',
    'What is the highest level of formal education that you have attained or plan to attain within the next 2 years?': 'Highest Education',
    'Select the title most similar to your current role (or most recent title if retired): - Selected Choice': 'Job Title',
    'For how many years have you been writing code and/or programming?': 'Coding Experience',
    "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Python": 'Language Used',
    'What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice': 'Language Recommended',
    "Which of the following integrated development environments (IDE's) do you use on a regular basis? (Select all that apply) - Selected Choice - Jupyter (JupyterLab, Jupyter Notebooks, etc)": "IDE Used",
    "What type of computing platform do you use most often for your data science projects? - Selected Choice": "Platform Used",
    "Approximately how many times have you used a TPU (tensor processing unit)?": "TPU Used",
    "What data visualization libraries or tools do you use on a regular basis? (Select all that apply) - Selected Choice - Matplotlib": "Matplotlib",
    "What data visualization libraries or tools do you use on a regular basis? (Select all that apply) - Selected Choice - Seaborn": "Seaborn",
    "For how many years have you used machine learning methods?": "ML Usage",
    "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Selected Choice - Scikit-learn": "sklearn",
    "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice - Linear or Logistic Regression": "Regression",
    "What is the primary tool that you use at work or school to analyze data? (Include text response) - Selected Choice": "Primary Tool",
}

# rename() returns a new frame and the result is not assigned: df_Ind itself keeps the original
# question headers, which the plotting cells further down use when addressing columns.
df_Ind.rename(columns=column_aliases)

In [None]:
# import profiling in pandas
# NOTE(review): pandas_profiling is already imported in the setup cell near the top of the
# notebook, so this repeated import is redundant (but harmless).
import pandas_profiling

df_Ind.profile_report() # profile report for subset of India 

### The Profile Report of subset for India:

<div class="alert alert-block alert-info">
<b>Tip:</b> The profile report is self-explanatory for each variable in the subset.
</div>


In [None]:
df_USA.profile_report() # profile report for subset of USA

### The Profile Report of subset for USA:

<div class="alert alert-block alert-info">
<b>Tip:</b> The profile report is self-explanatory for each variable in the subset.
</div>


In [None]:
# List the column labels that remain in the India subset

df_Ind.columns

In [None]:
# List the column labels that remain in the USA subset

df_USA.columns

In [None]:
# Combine both the subsets to form a new data set for comparison
# India rows come first, then USA rows; ignore_index=True renumbers the combined index from 0.
# Columns present in only one subset are kept and filled with NaN for the other (concat's default outer join).

df1 = pd.concat([df_Ind, df_USA], ignore_index=True) # concatenate India, USA subset in a new Data Frame
df1.head()

In [None]:
# Check no. of rows and columns in the combined India + USA data frame

df1.shape

In [None]:
# Drop columns (not rows) of the combined frame that have more than 2000 missing values.
# Columns are selected once from the null counts and dropped with the `columns=` keyword:
# the positional axis form `drop(i, 1, ...)` is rejected by pandas >= 2.0, and dropping
# in place inside a loop mutated the frame while it was being scanned.

max_nulls_combined = 2000
null_counts_combined = df1.isna().sum() # missing values per column
df1 = df1.drop(columns=null_counts_combined[null_counts_combined > max_nulls_combined].index)

df1.head() # display top rows of newly formed data frame

In [None]:
# Check no. of rows and columns of the combined frame after dropping columns with high NaN counts

df1.shape


<div class="alert alert-block alert-info">
<b>Tip:</b> The number of columns has been reduced from 26 to 12. Of these, only 11 are useful for further analysis.
</div>



In [None]:
# Check basic information of the combined frame: column dtypes, non-null counts, number of rows and columns

df1.info()


<div class="alert alert-block alert-info">
<b>Tip:</b> As all the columns are of 'object' data type, only bar graphs or count plots can be built to compare and analyze the data.
</div>



In [None]:
# Count plot to compare age of respondents in India and USA.
# sns.catplot is a figure-level function that creates its own figure, so no plt.figure()
# call precedes it: the earlier plt.figure(figsize=(18,10)) was never drawn on and only
# added an extra empty figure to the cell output.

sns.set(style="whitegrid", color_codes=True) # set style

age_col = 'What is your age (# years)?'

# define count plot in seaborn
plot = sns.catplot(y=age_col, # age group on the y-axis (horizontal bars)
                   hue='In which country do you currently reside?', # one bar colour per country
                   kind='count', # count rows per category
                   palette="pastel", # define color
                   edgecolor=".6", # define color of edge
                   data=df1.sort_values(by=age_col)) # sorted so the age groups appear in label order

plt.ylabel('Age',
           fontdict=
           {'fontsize': 15,
            'color' : 'Blue'}) # rename y-axis

plot._legend.set_title('Country') # rename legend

plt.title('Age Distribution in Data Science Community',
          fontdict={'fontsize': 20, 'color' : 'Blue'}) # title name

plt.show()


<div class="alert alert-block alert-info">
<b>Tip:</b> India has the highest number of survey respondents in the 18-21 age group, whereas USA has the highest number in the 30-34 age group, closely followed by 25-29. This clearly indicates that data science enthusiasts in USA are, on average, older than those in India.
</div>



In [None]:
# Count plot to compare gender of respondents in India and USA

gender_col = 'What is your gender? - Selected Choice'
axis_label_font = {'fontsize': 15, 'color' : 'Blue'}
title_font = {'fontsize': 20, 'color' : 'Blue'}

sns.set(style="whitegrid", color_codes=True) # seaborn theme for this figure

# Horizontal count bars per gender answer, one colour per country
gender_sorted = df1.sort_values(by=gender_col)
plot = sns.catplot(data=gender_sorted,
                   y=gender_col,
                   hue='In which country do you currently reside?',
                   kind='count',
                   palette="pastel",
                   edgecolor=".6")

plt.ylabel('Gender', fontdict=axis_label_font) # y-axis label

plot._legend.set_title('Country') # legend title

plt.title('Gender Distribution in Data Science', fontdict=title_font) # figure title

plt.show()


<div class="alert alert-block alert-info">
<b>Tip:</b> The field of data science is male-dominated in India as well as in USA. There are a few respondents who identify as non-binary or who self-describe their gender, which is an encouraging sign for the future of any field.
</div>



In [None]:
# Categorical count plot of the highest formal education, split by country

education_col = 'What is the highest level of formal education that you have attained or plan to attain within the next 2 years?'
axis_label_font = {'fontsize': 15, 'color' : 'Blue'}
title_font = {'fontsize': 20, 'color' : 'Blue'}

sns.set(style="whitegrid", color_codes=True) # seaborn theme for this figure

# Horizontal count bars per education level, one colour per country
education_sorted = df1.sort_values(by=education_col)
plot = sns.catplot(data=education_sorted,
                   y=education_col,
                   hue='In which country do you currently reside?',
                   kind='count',
                   palette="pastel",
                   edgecolor=".6")

plt.ylabel('Education', fontdict=axis_label_font) # y-axis label

plot._legend.set_title('Country') # legend title

plt.title('Distribution of Education in Data Science', fontdict=title_font) # figure title

plt.show()


<div class="alert alert-block alert-info">
<b>Tip:</b> India has the highest proportion of Bachelor's degree holders in the survey, whereas USA has the highest proportion of Master's degree holders. The graph clearly indicates that data science enthusiasts in USA have a high proportion of Master's and Doctoral degrees, whereas India has a higher proportion of Bachelor's and Master's degrees in the field of data science.
</div>



In [None]:
# Count plot for current role in DS, split by country.
# The hue order is pinned and the legend text is derived from the labels seaborn drew,
# so a legend entry can never be attached to the wrong country's colour. (Previously the
# legend text was typed by hand while the hue order depended on row order after sorting.)

role_col = 'Select the title most similar to your current role (or most recent title if retired): - Selected Choice'
country_col = 'In which country do you currently reside?'
country_order = ['India', 'United States of America'] # fixed bar/colour order
short_country = {'United States of America': 'USA'} # compact legend text

plt.figure(figsize=(18,10)) # define figure size

sns.set(style="whitegrid", color_codes=True) # set style of graph on seaborn

# define count plot
ax = sns.countplot(y=role_col,
                   hue=country_col,
                   hue_order=country_order,
                   palette="pastel",
                   edgecolor=".6",
                   data=df1.sort_values(by=role_col))

# redefine y-axis
plt.ylabel('Job Title',
           fontdict=
           {'fontsize': 15,
            'color' : 'Blue'})

# redefine legend: keep seaborn's handles, shorten the country names
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, [short_country.get(label, label) for label in labels], bbox_to_anchor=(1,1))

# define title of the graph
plt.title('Distribution of Role in Data Science',
          fontdict={'fontsize': 20, 'color' : 'Blue'})

plt.show()


<div class="alert alert-block alert-info">
<b>Tip:</b> In India the largest group of survey respondents is students, whereas in USA the largest group is Data Scientists. This suggests that the data science field is more mature in USA, whereas it is still at an early stage in India.
</div>



In [None]:
# Count plot for experience of programming, split by country.
# The hue order is pinned and the legend text is derived from the labels seaborn drew,
# so a legend entry can never be attached to the wrong country's colour. (Previously the
# legend text was typed by hand while the hue order depended on row order after sorting.)

experience_col = 'For how many years have you been writing code and/or programming?'
country_col = 'In which country do you currently reside?'
country_order = ['India', 'United States of America'] # fixed bar/colour order
short_country = {'United States of America': 'USA'} # compact legend text

plt.figure(figsize=(18,10)) # define figure size

sns.set(style="whitegrid", color_codes=True) # define style of graph in seaborn

# define count plot
ax = sns.countplot(y=experience_col,
                   hue=country_col,
                   hue_order=country_order,
                   palette="pastel",
                   edgecolor=".6",
                   data=df1.sort_values(by=experience_col))

# Annotate each bar with its share of ALL rows in df1 (both countries, including rows
# without an answer) - i.e. a share of the combined sample, not a within-country rate.
total = len(df1[experience_col])
for p in ax.patches:
    width = p.get_width()
    if not width > 0: # skip empty or NaN-width patches instead of printing 'nan%'
        continue
    percentage = '{:.1f}%'.format(100 * width/total)
    x = p.get_x() + width + 0.02
    y = p.get_y() + p.get_height()/2
    ax.annotate(percentage, (x, y))

# redefine y-axis
plt.ylabel('Programming Experience',
           fontdict=
           {'fontsize': 15,
            'color' : 'Blue'})

# redefine legend: keep seaborn's handles, shorten the country names
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, [short_country.get(label, label) for label in labels], bbox_to_anchor=(1,1))

# define title
plt.title('Distribution of Coding Experience in Data Science',
          fontdict={'fontsize': 20, 'color' : 'Blue'})

plt.show()


<div class="alert alert-block alert-info">
<b>Tip:</b> In India the most frequent coding experience is 1-2 years, whereas in USA the 3-5 year bracket is proportionally the largest. This again indicates that the data science coding community is more mature in USA, although the coding community is larger in India.
</div>



In [None]:
# Count plot for usage of Python, split by country.
# The hue order is pinned and the legend text is derived from the labels seaborn drew,
# instead of hand-typed legend text that assumes a particular hue order.

python_col = 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Python'
country_col = 'In which country do you currently reside?'
country_order = ['India', 'United States of America'] # fixed bar/colour order
short_country = {'United States of America': 'USA'} # compact legend text

plt.figure(figsize=(12,6)) # define figure size

sns.set(style="whitegrid", color_codes=True) # define style of plot in seaborn

# define count plot
ax = sns.countplot(y=python_col,
                   hue=country_col,
                   hue_order=country_order,
                   palette="pastel",
                   edgecolor=".6",
                   data=df1)

# Annotate each bar with its share of ALL rows in df1 (both countries, including rows
# without an answer) - i.e. a share of the combined sample, not a within-country rate.
total = len(df1[python_col])
for p in ax.patches:
    width = p.get_width()
    if not width > 0: # skip empty or NaN-width patches instead of printing 'nan%'
        continue
    percentage = '{:.1f}%'.format(100 * width/total)
    x = p.get_x() + width + 0.02
    y = p.get_y() + p.get_height()/2
    ax.annotate(percentage, (x, y))

# redefine y-axis
plt.ylabel('Python User',
           fontdict=
           {'fontsize': 15,
            'color' : 'Blue'})

# redefine legend: keep seaborn's handles, shorten the country names
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, [short_country.get(label, label) for label in labels], bbox_to_anchor=(1,1))

# define title
plt.title('Python User Distribution in Data Science',
          fontdict={'fontsize': 20, 'color' : 'Blue'})

plt.show()




<div class="alert alert-block alert-info">
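<b>Tip:</b> Python users from India make up 58.3% of all respondents in the combined India and USA subset, compared with 21.1% for Python users from USA. Note that both percentages are shares of the combined total, so the gap also reflects the larger number of respondents from India and is not a within-country adoption rate.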
</div>



In [None]:
# Count plot for recommended coding language, split by country.
# The hue order is pinned and the legend text is derived from the labels seaborn drew,
# instead of hand-typed legend text that assumes a particular hue order.

recommended_col = 'What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice'
country_col = 'In which country do you currently reside?'
country_order = ['India', 'United States of America'] # fixed bar/colour order
short_country = {'United States of America': 'USA'} # compact legend text

plt.figure(figsize=(18,10)) # define figure size

sns.set(style="whitegrid", color_codes=True) # define style in seaborn

# define count plot
ax = sns.countplot(y=recommended_col,
                   hue=country_col,
                   hue_order=country_order,
                   palette="pastel",
                   edgecolor=".6",
                   data=df1)

# redefine y-axis
plt.ylabel('Programming Language',
           fontdict=
           {'fontsize': 15,
            'color' : 'Blue'})

# redefine legend: keep seaborn's handles, shorten the country names
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, [short_country.get(label, label) for label in labels], bbox_to_anchor=(1,1))

# define title
plt.title('Programming Language Distribution in Data Science',
          fontdict={'fontsize': 20, 'color' : 'Blue'})

plt.show()


<div class="alert alert-block alert-info">
<b>Tip:</b> The recommended programming languages for data science enthusiasts are the same in India and USA. The most popular coding language is Python, followed by R, SQL, C, C++ and Java.
</div>



In [None]:
# Count plot for computing platform, split by country.
# The hue order is pinned and the legend text is derived from the labels seaborn drew,
# so a legend entry can never be attached to the wrong country's colour. (Previously the
# legend text was typed by hand while the hue order depended on row order after sorting.)

platform_col = 'What type of computing platform do you use most often for your data science projects? - Selected Choice'
country_col = 'In which country do you currently reside?'
country_order = ['India', 'United States of America'] # fixed bar/colour order
short_country = {'United States of America': 'USA'} # compact legend text

plt.figure(figsize=(18,10)) # define figure size

sns.set(style="whitegrid", color_codes=True) # define style on seaborn

# define count plot
ax = sns.countplot(y=platform_col,
                   hue=country_col,
                   hue_order=country_order,
                   palette="pastel",
                   edgecolor=".6",
                   data=df1.sort_values(by=platform_col))

# Annotate each bar with its share of ALL rows in df1 (both countries, including rows
# without an answer) - i.e. a share of the combined sample, not a within-country rate.
total = len(df1[platform_col])
for p in ax.patches:
    width = p.get_width()
    if not width > 0: # skip empty or NaN-width patches instead of printing 'nan%'
        continue
    percentage = '{:.1f}%'.format(100 * width/total)
    x = p.get_x() + width + 0.02
    y = p.get_y() + p.get_height()/2
    ax.annotate(percentage, (x, y))

# redefine y-axis
plt.ylabel('Computing Platform',
           fontdict=
           {'fontsize': 15,
            'color' : 'Blue'})

# redefine legend: keep seaborn's handles, shorten the country names
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, [short_country.get(label, label) for label in labels], bbox_to_anchor=(1,1))

# define title
plt.title('Distribution of Computing Platform in Data Science',
          fontdict={'fontsize': 20, 'color' : 'Blue'})

plt.show()


<div class="alert alert-block alert-info">
<b>Tip:</b> India and USA show a similar trend in the distribution of computing platforms for data science projects: a personal computer or laptop comes first, followed by cloud computing platforms such as AWS, Azure and GCP. This clearly indicates that the PC is the most popular platform in the data science community. However, cloud computing platforms have good growth prospects in data science, followed by deep learning workstations such as NVIDIA GTX, LambdaLabs, etc.
</div>



In [None]:
# Count plot for TPU usage, split by country.
# The hue order is pinned and the legend text is derived from the labels seaborn drew,
# so a legend entry can never be attached to the wrong country's colour. (Previously the
# legend text was typed by hand while the hue order depended on row order after sorting.)

tpu_col = 'Approximately how many times have you used a TPU (tensor processing unit)?'
country_col = 'In which country do you currently reside?'
country_order = ['India', 'United States of America'] # fixed bar/colour order
short_country = {'United States of America': 'USA'} # compact legend text

plt.figure(figsize=(18,10)) # define figure size

sns.set(style="whitegrid", color_codes=True) # define style in seaborn

# define count plot
ax = sns.countplot(y=tpu_col,
                   hue=country_col,
                   hue_order=country_order,
                   palette="pastel",
                   edgecolor=".6",
                   data=df1.sort_values(by=tpu_col))

# Annotate each bar with its share of ALL rows in df1 (both countries, including rows
# without an answer) - i.e. a share of the combined sample, not a within-country rate.
total = len(df1[tpu_col])
for p in ax.patches:
    width = p.get_width()
    if not width > 0: # skip empty or NaN-width patches instead of printing 'nan%'
        continue
    percentage = '{:.1f}%'.format(100 * width/total)
    x = p.get_x() + width + 0.02
    y = p.get_y() + p.get_height()/2
    ax.annotate(percentage, (x, y))

# redefine y-axis
plt.ylabel('Tensor Processing Unit Usage',
           fontdict=
           {'fontsize': 15,
            'color' : 'Blue'})

# redefine legend: keep seaborn's handles, shorten the country names
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, [short_country.get(label, label) for label in labels], bbox_to_anchor=(1,1))

# define title
plt.title('TPU Usage Distribution in Data Science',
          fontdict={'fontsize': 20, 'color' : 'Blue'})

plt.show()


<div class="alert alert-block alert-info">
<b>Tip:</b> TPU usage follows the same trend proportionally in India and USA: 'Never' for most respondents, followed by 2-5 times, 'Once', 6-25 times and lastly more than 25 times. This also indicates vast scope for growth in the usage of Tensor Processing Units in the field of data science.
</div>



In [None]:
# Count plot for ML use, split by country.
# The hue order is pinned and the legend text is derived from the labels seaborn drew,
# so a legend entry can never be attached to the wrong country's colour. (Previously the
# legend text was typed by hand while the hue order depended on row order after sorting.)

ml_col = 'For how many years have you used machine learning methods?'
country_col = 'In which country do you currently reside?'
country_order = ['India', 'United States of America'] # fixed bar/colour order
short_country = {'United States of America': 'USA'} # compact legend text

plt.figure(figsize=(18,10)) # define figure size

sns.set(style="whitegrid", color_codes=True) # define style in seaborn

# define count plot
ax = sns.countplot(y=ml_col,
                   hue=country_col,
                   hue_order=country_order,
                   palette="pastel",
                   edgecolor=".6",
                   data=df1.sort_values(by=ml_col))

# Annotate each bar with its share of ALL rows in df1 (both countries, including rows
# without an answer) - i.e. a share of the combined sample, not a within-country rate.
total = len(df1[ml_col])
for p in ax.patches:
    width = p.get_width()
    if not width > 0: # skip empty or NaN-width patches instead of printing 'nan%'
        continue
    percentage = '{:.1f}%'.format(100 * width/total)
    x = p.get_x() + width + 0.02
    y = p.get_y() + p.get_height()/2
    ax.annotate(percentage, (x, y))

# redefine y-axis
plt.ylabel('Machine Learning Experience',
           fontdict=
           {'fontsize': 15,
            'color' : 'Blue'})

# redefine legend: keep seaborn's handles, shorten the country names
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, [short_country.get(label, label) for label in labels], bbox_to_anchor=(1,1))

# define title
plt.title('Distribution of ML Experience in Data Science',
          fontdict={'fontsize': 20, 'color' : 'Blue'})

plt.show()


<div class="alert alert-block alert-info">
<b>Tip:</b> Usage of Machine Learning methods follows a similar trend proportionally in India as well as USA: under 1 year is the most common, followed by 1-2 years. This clearly indicates that ML methods have high scope to grow in the future of data science.
</div>

