# What skills, tools and credentials are essential to break into data science

### Imports

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Functions I use over and over

In [None]:
def plot_value_counts(val_counts, title, x_label, y_label="Number of Occurences", pct=False, save=False):
    """
    Draw a bar chart with one bar per item and write each bar's value above it.

    inputs
    val_counts -- a pandas series; its index supplies the bar labels and its values the bar heights
    title -- the title of the plot you are making (also used as the file name when save is True)
    x_label -- the label of the horizontal axis
    y_label -- the label of the vertical axis
    pct -- False if displaying a count (bar labels are truncated to whole numbers),
           True if displaying a percentage (bar labels keep their decimals)
    save -- False if not saving plot, True if you would like to save the plot

    returns
    None -- the figure is displayed with plt.show()
    """
    plt.figure(figsize=(8, 6))
    # x and y must be passed by keyword: seaborn deprecated positional data
    # arguments in 0.11 and rejects them from 0.12 onwards.
    ax = sns.barplot(x=val_counts.index, y=val_counts.values, alpha=0.8)
    plt.title(title)
    plt.ylabel(y_label, fontsize=12)
    plt.xlabel(x_label, fontsize=12)

    plt.xticks(
        rotation=45,
        horizontalalignment='right',
        fontweight='light',
        fontsize='x-large',
    )

    for bar in ax.patches:
        height = bar.get_height()
        # Anchor the text at the true horizontal centre of the bar, just above its top edge.
        x_center = bar.get_x() + bar.get_width() / 2
        label = str(height) if pct else str(int(height))
        ax.annotate(label, (x_center, height), ha='center', va='bottom')

    if save:
        # NOTE(review): the title is used verbatim as the file name; titles containing
        # path separators or a '.' may not save as intended -- confirm before reuse.
        plt.savefig(title, bbox_inches="tight")

    plt.show()
    
    
def pull_columns_of_interest(question, *, responses=None, survey_df=None):
    """
    Several questions have multiple columns associated with them. This function finds all of the columns
    associated with a particular question and returns the number of non-NaN answers in each column.

    inputs
    question -- the question asked by the survey (matched as a substring of the text in row 0)
    responses -- optional data frame of answers to count; defaults to the notebook-level `data_scientists`
    survey_df -- optional data frame whose row 0 holds the question text of every column;
                 defaults to the notebook-level `survey`

    returns
    val_counts -- a series containing the number of not NaN values per column, indexed by the most
                  common answer in that column (or the column name when the column has no answers),
                  sorted from most to least selected

    side effects
    prints one "<column> = <label>" line per matching column
    """
    # Fall back to the notebook-level frames so existing one-argument calls keep working.
    if survey_df is None:
        survey_df = survey
    if responses is None:
        responses = data_scientists

    # str() keeps the substring test from raising if a header cell is missing (NaN).
    columns_of_interest = [
        col for col in survey_df.columns if question in str(survey_df.loc[0, col])
    ]

    df = responses[columns_of_interest]

    new_index_map = {}
    for col in df.columns:
        modes = df[col].mode()
        # mode() ignores NaN, so a choice nobody selected yields an empty result;
        # keep the raw column name as its label instead of failing on modes[0].
        label = modes.iloc[0] if not modes.empty else col
        print(f'{col} = {label}')
        new_index_map[col] = label

    # The question was a select-all-that-apply question, so NaN indicates that the survey taker
    # did not check a particular choice. The number of non-null objects in each column is the
    # number of respondents who selected that choice.
    val_counts = df.notna().sum()

    return val_counts.rename(new_index_map).sort_values(ascending=False)

In [None]:
# Print the full path of every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the survey responses. Row 0 of this frame holds the question text for each column
# (other cells read it via survey.loc[0, ...]), so the actual answers start at row 1.
survey = pd.read_csv('/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')

In [None]:
# Preview the first rows of the raw survey frame.
survey.head()

In [None]:
#how many respondents?
# Row 0 holds the question text (it is what survey.loc[0, ...] returns in other cells),
# so it is excluded from the respondent count.
print('Total number of respondents: ', survey.shape[0] - 1)

In [None]:
#In which country do you reside?
# Number of distinct values in Q3. NOTE: this still includes the question text from
# the header row as one of the values.
countries = survey.Q3.value_counts()
len(countries)

In [None]:
# Number of distinct countries once the question-text entry is removed.
len(countries.drop('In which country do you currently reside?'))

In [None]:
question = survey.loc[0, 'Q5']

# Row 0 holds the question text rather than an answer. Drop it by label instead of
# assuming it is the last (least frequent) entry returned by value_counts().
role_counts = survey['Q5'].drop(index=0).value_counts()

plot_value_counts(role_counts, question, 'Role', save=True)


We are interested in breaking into data science. I will take a subset of the data that only includes individuals who claim their current role is data scientist. 

In [None]:
# Keep only the respondents whose current role (Q5) is 'Data Scientist'.
is_data_scientist = survey['Q5'] == 'Data Scientist'
data_scientists = survey[is_data_scientist]
print('number of data scientists in survey: ', len(data_scientists))
data_scientists.head()

I want to break into the field... I'm about to graduate from my undergraduate program. Do I need to get a Master's?

In [None]:
# Tally the Q4 answers of data scientists and plot them under the question's own text.
question = survey.loc[0, 'Q4']
education = data_scientists['Q4'].value_counts()

plot_value_counts(
    val_counts=education,
    title=question,
    x_label='Highest Education',
    save=True,
)

In [None]:
# Print each education level's share of all data scientists, rounded to two decimals.
n_data_scientists = data_scientists.shape[0]
print('pct of data scientists with: ')
for degree, count in education.items():
    share = round(count / n_data_scientists, 2)
    print(f'\t{degree} : {share}')

What tools are being used?

<br>

What programming languages do data scientists use on a regular basis?

In [None]:
# Count how many data scientists selected each option of the programming-languages question
# (the call also prints the column -> label mapping it found).
language_counts = pull_columns_of_interest('What programming languages do you use on a regular basis?')

In [None]:
# Display the per-language counts.
language_counts

In [None]:
# Use only the text before the '- Selected Choice -' marker of the header as the plot title.
question, _, _ = survey.loc[0, 'Q7_Part_1'].partition('- Selected Choice -')

plot_value_counts(
    val_counts=language_counts,
    title=question,
    x_label='Language Used on Regular Basis',
    save=True,
)

In [None]:
# average number of languages used by data scientists on a regular basis
# (the 'None' answer option is left out of the total)
languages_selected = language_counts.drop('None').sum()
print('average number of languages used per data scientist: ',
      languages_selected / data_scientists.shape[0])

In [None]:
# Share of data scientists selecting each language, rounded to two decimals.
n_data_scientists = data_scientists.shape[0]
pct = language_counts.div(n_data_scientists).round(2)

plot_value_counts(
    val_counts=pct,
    title=question,
    x_label='Language Used on Regular Basis',
    y_label='Percentage',
    pct=True,
)

92% of data scientists use Python on a regular basis! If you're in a hurry to break into the field, focus most of your efforts on learning Python. But don't stop there! On average, data scientists use 2.6 programming languages on a regular basis.

What programming language would you recommend an aspiring data scientist to learn first?

In [None]:
# Tally the Q8 answers given by data scientists.
recommened_languages = data_scientists['Q8'].value_counts()

In [None]:
# Bar chart of the recommended first language.
plot_value_counts(
    val_counts=recommened_languages,
    title='What programming language would you recommend an aspiring data scientist to learn first?',
    x_label='Recommended Language',
)

Python it is!!

<br>

Speaking of learning: on which platforms have you begun or completed data science courses?

In [None]:
# Count how many data scientists selected each learning-platform option.
platform_counts = pull_columns_of_interest(
    'On which platforms have you begun or completed data science courses?'
    )

In [None]:
# Display the per-platform counts.
platform_counts

In [None]:
# Bar chart of learning platforms, saved to disk.
plot_value_counts(
    val_counts=platform_counts,
    title='On which platforms have you begun or completed data science courses?',
    x_label='Learning Platform',
    save=True,
)

In [None]:
#average number of platforms...
# Total platform selections, with the 'None' answer option left out.
n_data_scientists = data_scientists.shape[0]
platforms_selected = platform_counts.drop('None').sum()
print('number of platforms used: ', platforms_selected)
print('average number of platforms used: ', platforms_selected / n_data_scientists)
print('pct of data scientist who do not use any online learning platform: ',
      platform_counts['None'] / n_data_scientists)

Interesting. Coursera seems to be the clear favorite. I thought Kaggle Learn courses would be higher, given that the 
survey was conducted through Kaggle. 


We know a Bachelor's degree is important. An even more advanced degree seems to be helpful, but does not seem to be required.

We know we need to learn Python and SQL. We know that more data scientists use Coursera than any other platform... 

But going beyond language, what activities and tasks should we get good at...

Thankfully, Kaggle asked:

Select any activities that make up an important part of your role at work


We can use the answers to this question to get an even better understanding of what we should be practicing.

In [None]:
# Count how many data scientists selected each work-activity option.
activities_count = pull_columns_of_interest(
    'Select any activities that make up an important part of your role at work'
    )

In [None]:
# Display the per-activity counts.
activities_count

In [None]:
# Bar chart of the activities data scientists report as important to their role.
plot_value_counts(
    val_counts=activities_count,
    title='Select any activities that make up an important part of your role at work',
    x_label='Activity',
)

Based on the results above, it seems that for most of the data scientists who completed the survey, two large portions of their work are:
* Analysis
* Machine Learning

For analysis, I'm interested in looking at the following questions:

* Which of the following integrated development environments (IDE's) do you use on a regular basis?
* Which of the following hosted notebook products do you use on a regular basis?
* What is the primary tool that you use at work or school to analyze data?
* What data visualization libraries or tools do you use on a regular basis?


For ML, I'm interested in looking at the following questions:
* Which of the following machine learning frameworks do you use on a regular basis?
* Which of the following ML algorithms do you use on a regular basis?
* Does your current employer incorporate machine learning methods into their business?
* Do you use any of the following machine learning products on a regular basis?

In [None]:
#What IDE's do you use on a regular basis?
# Count how many data scientists selected each IDE option.
ide_counts = pull_columns_of_interest(
    "Which of the following integrated development environments (IDE's) do you use on a regular basis?")

In [None]:
# Display the per-IDE counts.
ide_counts

In [None]:
# Bar chart of IDE usage among data scientists.
plot_value_counts(
    val_counts=ide_counts,
    title="Which of the following integrated development environments (IDE's) do you use on a regular basis?",
    x_label='IDE',
)

In [None]:
#What notebooks do you use?
# Count how many data scientists selected each hosted-notebook option.
notebook_counts = pull_columns_of_interest(
    "Which of the following hosted notebook products do you use on a regular basis?")

In [None]:
# Bar chart of hosted notebook usage among data scientists.
plot_value_counts(
    val_counts=notebook_counts,
    title="Which of the following hosted notebook products do you use on a regular basis?",
    x_label="Notebook",
)

In [None]:
#Primary analysis tool
# NOTE(review): pull_columns_of_interest counts non-null answers per column, which suits
# select-all questions; the following cells tally Q38 directly with value_counts() instead,
# and analysis_tools_count is not used again in the visible cells.
analysis_tools_count = pull_columns_of_interest(
    "What is the primary tool that you use at work or school to analyze data?")

In [None]:
# Print the name of every column whose question text (row 0) contains the primary-tool question.
primary_tool_question = "What is the primary tool that you use at work or school to analyze data?"
for column_name in survey.columns:
    if primary_tool_question not in survey.loc[0, column_name]:
        continue
    print(column_name)

In [None]:
# Tally the Q38 answers given by data scientists and display the result.
tool_counts = data_scientists['Q38'].value_counts()
tool_counts

In [None]:
# Bar chart of the primary analysis tool reported by data scientists.
plot_value_counts(
    val_counts=tool_counts,
    title="What is the primary tool that you use at work or school to analyze data?",
    x_label="Tool",
)

In [None]:
#What visualization tools do you use on a regular basis?
# Count how many data scientists selected each visualization library/tool option.
viz_tools_count = pull_columns_of_interest(
    "What data visualization libraries or tools do you use on a regular basis?")

In [None]:
# Bar chart of visualization library/tool usage among data scientists.
plot_value_counts(
    val_counts=viz_tools_count,
    title="What data visualization libraries or tools do you use on a regular basis?",
    x_label="Tool/Library",
)

In [None]:
#Which of the following machine learning frameworks do you use on a regular basis?
# Count how many data scientists selected each machine learning framework option.
ml_frameworks_count = pull_columns_of_interest(
    "Which of the following machine learning frameworks do you use on a regular basis?")

In [None]:
# Bar chart of machine learning framework usage among data scientists.
plot_value_counts(
    val_counts=ml_frameworks_count,
    title="Which of the following machine learning frameworks do you use on a regular basis?",
    x_label="Machine Learning Framework",
)

In [None]:
# Which of the following ML algorithms do you use on a regular basis?
# Count how many data scientists selected each ML algorithm option.
ml_algo_count = pull_columns_of_interest(
    "Which of the following ML algorithms do you use on a regular basis?")

In [None]:
# Bar chart of ML algorithm usage among data scientists.
plot_value_counts(
    val_counts=ml_algo_count,
    title="Which of the following ML algorithms do you use on a regular basis?",
    x_label="ML Algorithm",
)

In [None]:
#Does your current employer incorporate machine learning methods into their business?
# NOTE(review): employer_ml_count is not used again in the visible cells; the next cell
# tallies Q22 directly with value_counts() into employer_ml_counts.
employer_ml_count = pull_columns_of_interest(
    "Does your current employer incorporate machine learning methods into their business?")

In [None]:
# Tally the Q22 answers given by data scientists and display the result.
employer_ml_counts = data_scientists['Q22'].value_counts()
employer_ml_counts

In [None]:
# Bar chart of how employers of data scientists use machine learning.
plot_value_counts(
    val_counts=employer_ml_counts,
    title="Does your current employer incorporate machine learning methods into their business?",
    x_label="Employer stance on ML",
)

In [None]:
#Do you use any of the following machine learning products on a regular basis?
# Count how many data scientists selected each machine learning product option.
ml_product_count = pull_columns_of_interest(
    "Do you use any of the following machine learning products on a regular basis?")

In [None]:
# Bar chart of machine learning product usage among data scientists.
plot_value_counts(
    val_counts=ml_product_count,
    title="Do you use any of the following machine learning products on a regular basis?",
    x_label="ML Tool",
)

### If you made it this far, thank you!!!

## Any feedback is greatly appreciated!!