In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

To make this analysis a bit easier to read, I have divided it into a series of notebooks:
Part 1 : Data Science Community’s Favourite media source?
Part 2 : Most Popular Machine Learning Tools and Algorithms
Part 3 : What's more popular Google Colab , Kaggle Kernels or Jupyter Notebooks?
Part 4 : Current State of Machine Learning in Industry

Since Kaggle is a platform where we share our ideas and latest discoveries, it will be great if we start by uncovering some of the most popular sources of information and learning within the community. For the purpose of this article, as discussed above, I have used Kaggle 2020 Survey Data.

So without further ado, let’s dive in and find out who is using what to keep themselves up to date on ML/DS news.


> Kaggle, YouTube and Blogs are the most popular sources of regular information and learning.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the survey responses CSV into df and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv'
)
df.head()

In [None]:
# Use the first data row (the question text) as the column headers instead of
# the question numbers, and keep only the remaining rows as responses.
# NOTE: re-running this cell on its own promotes and drops another row.
header = df.iloc[0]
df = df.iloc[1:]
df.columns = header

In [None]:
# Positional column span 244..255 of df; the following cell relabels these
# twelve columns with media-source names.
media_column_span = slice(244, 256)
media_df = df.iloc[:, media_column_span]
media_df.columns

In [None]:
# Short labels for the twelve media-source columns, in positional order.
columns = [
    'Twitter',
    'Email newsletters',
    'Reddit',
    'Kaggle',
    'Course Forums',
    'YouTube',
    'Podcasts',
    'Blogs',
    'Journal Publications',
    'Slack Communities',
    'None',
    'Other',
]
media_df.columns = columns
media_df.head()

# POPULAR MEDIA SOURCES USED BY DATA SCIENCE COMMUNITY

In [None]:
# Number of respondents who selected each media source.
# Each media column is assumed to hold the choice text when selected and NaN
# otherwise (TODO confirm against the survey schema), so the non-null count per
# column is the number of selections.
#
# The previous value_counts()-based construction only worked when every column
# produced the same number of distinct values; an empty column or a column with
# several distinct answers made the DataFrame constructor fail. count() gives
# the same numbers in the working case and has no such restriction.
#
# Resulting layout (relied on by the plotting cell): column 'index' holds the
# media-source label and column 0 holds the count.
A = media_df[columns].count().reset_index()
A

In [None]:
# Horizontal bar chart: number of respondents per media source.

# Style settings come first. rcParams such as the axes edge colour are applied
# when the figure/axes are created, so setting them afterwards (as before) only
# affected figures drawn later or on a re-run of the cell.
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Helvetica'
plt.rcParams['axes.edgecolor'] = '#333F4B'
plt.rcParams['axes.linewidth'] = 0.8
plt.rcParams['xtick.color'] = '#333F4B'
plt.rcParams['ytick.color'] = '#333F4B'

figure, axis = plt.subplots()

plt.yticks(range(0, len(A['index'])), A['index'], rotation=0)

plot = axis.barh(A['index'], A[0], color='darkblue', alpha=0.2)

# Write the count next to the end of each bar (offsets are in data units).
for rectangle in plot:
    width = rectangle.get_width()
    axis.text(width + 220, rectangle.get_y() + 0.25, '%d' % int(width), ha='center', va='bottom')

# Hide the top/right frame lines.
axis.spines['top'].set_color('none')
axis.spines['right'].set_color('none')

# Spine.set_smart_bounds was deprecated in Matplotlib 3.2 and removed in 3.4;
# only call it where it still exists so the cell runs on every version.
for side in ('left', 'bottom'):
    spine = axis.spines[side]
    if hasattr(spine, 'set_smart_bounds'):
        spine.set_smart_bounds(True)

axis.set_title('Media Sources used by Data Scientists', loc='center')

plt.show()

Kaggle Notebooks and its Forums have been the most popular source of information and entertainment for data scientists the world over, followed by YouTube and Blogs such as Towards Data Science and Analytics Vidhya.

Now that we know the most popular sources, let’s dig a bit further and explore whether these preferences change with Age, Gender, Region, Education level, Years of experience and Role.

In [None]:
# Copy the columns at positions 1..6 of df onto media_df (presumably the
# demographic questions used by the later sections - TODO confirm), then
# display the combined frame.
x = df.iloc[:, 1:7]
media_df = media_df.assign(**{label: values for label, values in x.items()})
media_df

In [None]:
def counthueplot(column_1, column_2, data):
    """Draw a seaborn count plot of ``column_1`` with bars split by ``column_2``.

    A new matplotlib figure is created on every call. The top and right frame
    lines are hidden and the legend is anchored outside the axes on the right.

    Parameters
    ----------
    column_1 : column name or vector
        Forwarded to ``sns.countplot`` as ``x`` (the values being counted).
    column_2 : column name or vector
        Forwarded to ``sns.countplot`` as ``hue`` (the grouping variable).
    data : DataFrame
        Forwarded to ``sns.countplot`` as ``data``.

    Returns
    -------
    None
    """
    # Theme and rcParams are set before the figure is created so that the very
    # first figure is styled like the following ones.
    sns.set(style="darkgrid")
    plt.rcParams['axes.edgecolor'] = '#333F4B'
    plt.rcParams['axes.linewidth'] = 0.8
    plt.rcParams['xtick.color'] = '#333F4B'
    plt.rcParams['ytick.color'] = '#333F4B'

    fig, axs = plt.subplots()

    # `x` must be passed by keyword: newer seaborn releases accept only `data`
    # positionally, so a positional column would clash with `data=`.
    sns.countplot(x=column_1, hue=column_2, data=data, palette="Set3", ax=axs)

    axs.spines['top'].set_color('none')
    axs.spines['right'].set_color('none')

    # Spine.set_smart_bounds was deprecated in Matplotlib 3.2 and removed in
    # 3.4; call it only where it is still available.
    for side in ('left', 'bottom'):
        spine = axs.spines[side]
        if hasattr(spine, 'set_smart_bounds'):
            spine.set_smart_bounds(True)

    # Put the legend outside the plotting area, to the right of the axes.
    axs.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

# POPULAR MEDIA SOURCES FOR ALL AGES

In [None]:
# One count plot per media source, bars split by the respondent's age bracket.
age_column = 'What is your age (# years)?'
for media_source in ('Blogs', 'Kaggle', 'YouTube'):
    counthueplot(media_df[media_source], media_df[age_column], media_df)

Blogs are most popular as a source of information among late vicenarians, a.k.a. data scientists in their late twenties (25–29 years), followed by early vicenarians (22–24 years). The popularity of blogs seems to decrease as people age. This can be attributed to the fact that most aspiring and serving data scientists are in the 18–40 age group.

YouTube has been the second most popular source of information among Data Scientists. But let’s see which age groups are more attracted to it. The age distribution is more or less similar to that of the Blogs audience, the only difference being that slightly more people in the 30-and-above age groups prefer reading blogs to watching YouTube.

Kaggle, the winner of everyone’s attention, does well for all age groups. Data Scientists of all ages trust Kaggle forums as their ultimate source of information and learning.

# DO WOMEN CHOOSE THEIR MEDIA SOURCE DIFFERENTLY?


In [None]:
# Compare media-source usage between respondents who answered 'Man' or 'Woman'.
#
# The filtered rows are kept in their own frame. Previously media_df itself was
# overwritten here, which silently removed every other respondent from all the
# later sections (education, country maps, role, programming experience) even
# though those sections are not about gender.
gender_column = 'What is your gender? - Selected Choice'
gender_media_df = media_df[media_df[gender_column].isin(['Man', 'Woman'])]
for media_source in ('Blogs', 'Kaggle', 'YouTube'):
    counthueplot(gender_media_df[media_source], gender_media_df[gender_column], gender_media_df)

History offers plenty of evidence of a battle of the sexes. But after analyzing the Kaggle dataset, we find that there is no such battle here. Women and men alike prefer Kaggle, YouTube and Blogs to keep themselves updated.

You might observe a stark difference in the counts of Men and Women respondents. This may be due to the underrepresentation of Women in Data Science.

# DOES EDUCATION LEVEL LEVEL UP THE CHOICE OF MEDIA SOURCE?

In [None]:
# One count plot per media source, bars split by highest level of formal education.
education_column = 'What is the highest level of formal education that you have attained or plan to attain within the next 2 years?'
for media_source in ('Blogs', 'Kaggle', 'YouTube'):
    counthueplot(media_df[media_source], media_df[education_column], media_df)

All three media sources, be it Kaggle, YouTube or Blogs, find a great audience in people with a Master’s Degree, followed by those with a Bachelor’s Degree. This can also be attributed to the fact that most Data Scientists hold either a Master’s or a Bachelor’s Degree.

A very interesting observation is that Data Science Burghers with no formal education after High School, college dropouts and professional degree holders find YouTube to be their favourite source of information and learning.


# NATIONALITIES AND PREFERENCE FOR MEDIA SOURCE

In [None]:
# Number of respondents per country, ordered alphabetically by country name.
#
# The output columns are named explicitly via rename_axis / reset_index(name=...).
# The earlier reset_index().rename(columns={'index': ...}) relied on the column
# names that value_counts() produced before pandas 2.0; on pandas >= 2.0 those
# names differ and no 'Country' column was created.
countries_data = (
    df['In which country do you currently reside?']
    .value_counts()
    .sort_index()
    .rename_axis('Country')
    .reset_index(name='Count')
)
countries_data

In [None]:
import plotly.express as px

# World map shaded by the number of respondents per country
# (colour range fixed to 0-500).
fig = px.choropleth(
    data_frame=countries_data,
    locations='Country',
    locationmode='country names',
    color='Count',
    range_color=[0, 500],
    color_continuous_scale=px.colors.sequential.BuPu,
    hover_data={'Country': True, 'Count': True},
    labels={'Count': '# of Data Scientists'},
    title='Data Science Geographical Spread',
)
# Centre the title horizontally.
fig.update(layout={'title': {'x': 0.5}})
fig.show()

In [None]:
# Per-country count of respondents who selected Kaggle as a media source,
# ordered alphabetically by country name.
#
# The throwaway pd.DataFrame() initialisation was dropped (it was overwritten
# on the next line), and the output columns are named explicitly so the result
# has 'Country' / 'Count' columns on pandas 1.x and 2.x alike.
countries_data_kaggle = (
    media_df.loc[
        media_df['Kaggle'] == 'Kaggle (notebooks, forums, etc)',
        'In which country do you currently reside?',
    ]
    .value_counts()
    .sort_index()
    .rename_axis('Country')
    .reset_index(name='Count')
)

In [None]:
# World map shaded by the number of respondents per country who use Kaggle
# as a media source (colour range fixed to 0-500).
fig = px.choropleth(
    data_frame=countries_data_kaggle,
    locations='Country',
    locationmode='country names',
    color='Count',
    range_color=[0, 500],
    color_continuous_scale=px.colors.sequential.BuPu,
    hover_data={'Country': True, 'Count': True},
    labels={'Count': '# of Kaggle Users'},
    title='Kaggle Forums as Media Source',
)
# Centre the title horizontally.
fig.update(layout={'title': {'x': 0.5}})
fig.show()

In [None]:
# Per-country count of respondents who selected Blogs as a media source,
# ordered alphabetically by country name.
#
# The throwaway pd.DataFrame() initialisation was dropped (it was overwritten
# on the next line), and the output columns are named explicitly so the result
# has 'Country' / 'Count' columns on pandas 1.x and 2.x alike.
countries_data_blogs = (
    media_df.loc[
        media_df['Blogs'] == 'Blogs (Towards Data Science, Analytics Vidhya, etc)',
        'In which country do you currently reside?',
    ]
    .value_counts()
    .sort_index()
    .rename_axis('Country')
    .reset_index(name='Count')
)

In [None]:
# World map shaded by the number of respondents per country who use Blogs
# as a media source (colour range fixed to 0-500).
fig = px.choropleth(
    data_frame=countries_data_blogs,
    locations='Country',
    locationmode='country names',
    color='Count',
    range_color=[0, 500],
    color_continuous_scale=px.colors.sequential.BuPu,
    hover_data={'Country': True, 'Count': True},
    labels={'Count': '# of Blog Users'},
    title='Blogs as Media Source',
)
# Centre the title horizontally.
fig.update(layout={'title': {'x': 0.5}})
fig.show()


In [None]:

# Per-country count of respondents who selected YouTube as a media source,
# ordered alphabetically by country name.
#
# The throwaway pd.DataFrame() initialisation was dropped (it was overwritten
# on the next line), and the output columns are named explicitly so the result
# has 'Country' / 'Count' columns on pandas 1.x and 2.x alike.
countries_data_youtube = (
    media_df.loc[
        media_df['YouTube'] == 'YouTube (Kaggle YouTube, Cloud AI Adventures, etc)',
        'In which country do you currently reside?',
    ]
    .value_counts()
    .sort_index()
    .rename_axis('Country')
    .reset_index(name='Count')
)

In [None]:
# World map shaded by the number of respondents per country who use YouTube
# as a media source (colour range fixed to 0-500).
fig = px.choropleth(
    data_frame=countries_data_youtube,
    locations='Country',
    locationmode='country names',
    color='Count',
    range_color=[0, 500],
    color_continuous_scale=px.colors.sequential.BuPu,
    hover_data={'Country': True, 'Count': True},
    labels={'Count': '# of YouTube Users'},
    title='YouTube as Media Source',
)
# Centre the title horizontally.
fig.update(layout={'title': {'x': 0.5}})
fig.show()

When we look at the popularity of media sources across geographies, we conclude that Kaggle is the most popular source. But there were certain interesting observations; for example, Blogs are a little less popular in certain regions, particularly in Brazil, compared to YouTube and Kaggle.

On the other hand, blogs are the most popular source of information in the USA. Nearly 37% of fellow data scientists rely on blogs. This could be attributed to the popularity of Medium in the USA.

YouTube is the second most popular source of information across all countries except the USA. Other countries such as India, Brazil, Russia, Japan, etc. rely on YouTube after Kaggle.

Kaggle is the indisputable king of media sources in the data science community. This holds true across geographies, except for the USA.

# SHOULD MY CURRENT ROLE DEFINE MY CHOICE OF MEDIA SOURCE?

In [None]:
# One count plot per media source, bars split by the respondent's current role.
role_column = 'Select the title most similar to your current role (or most recent title if retired): - Selected Choice'
for media_source in ('Blogs', 'Kaggle', 'YouTube'):
    counthueplot(media_df[media_source], media_df[role_column], media_df)

Yet again, Kaggle is the most popular media source across all roles. However, we can observe an inclination among people in Data Scientist and Statistician roles towards blogs rather than YouTube as their regular source of information.

Kaggle is the most popular media source for everyone, be it a Data Scientist, Research Scientist, Machine Learning Engineer or any other role.

# DOES PREFERENCE FOR MEDIA SOURCE CHANGE WITH PROGRAMMING EXPERIENCE?

In [None]:
# One count plot per media source, bars split by years of programming experience.
experience_column = 'For how many years have you been writing code and/or programming?'
for media_source in ('Blogs', 'Kaggle', 'YouTube'):
    counthueplot(media_df[media_source], media_df[experience_column], media_df)

Kaggle is the first choice for most data scientists. But we see a general trend across all platforms: engagement increases progressively while people have 0 to 5 years of programming experience. After that, their interaction seems to decrease, maybe due to their shift towards leadership roles.

YouTube is the go-to media source for people with zero to one year of programming experience.

Kaggle is the favourite media source for data scientists with 1+ years of programming experience. This can be attributed to the fact that once you know programming you might want to practice on challenges, and what could be better than Kaggle?