In [1]:
import pandas as pd
import numpy as np

In [2]:
## Load the raw survey responses from the CSV file into a dataframe
df = pd.read_csv(filepath_or_buffer='survey_results_public.csv')

FileNotFoundError: File b'survey_results_public.csv' does not exist

In [None]:
df.shape
## (rows, columns) of the loaded data; an earlier run reported 88883 rows and 85 columns -- TODO confirm on re-run

In [None]:
## Regex support, used to filter data professionals with less than 1 year of professional coding experience
import re

In [None]:
## Junior data professionals: respondents with less than 1 year of professional coding
## experience whose DevType mentions "data" as a standalone word (case-insensitive).
## - The pattern is a raw string so "\s" reaches the regex engine unchanged instead of
##   being an invalid Python string escape.
## - na=False makes a missing DevType count as "no match" and yields a boolean mask directly.
## - .copy() makes the result an independent frame, so later column assignments on it
##   do not write into a slice of df.
junior_data = df[
    (df['YearsCodePro'] == 'Less than 1 year')
    & df['DevType'].str.contains(r"data\s|\sdata$|\sdata\s", case=False, regex=True, na=False)
].copy()

In [None]:
## Comparison group: respondents who are NOT in their first professional coding year
## and whose DevType does NOT mention "data" as a standalone word (case-insensitive).
## A missing DevType counts as "no data mention" (na=False), so those rows are kept.
## The pattern is a raw string so "\s" is not treated as a Python string escape, and the
## mask is boolean before "~" is applied.
## NOTE(review): despite the name, this filter excludes data professionals -- confirm
## whether the second condition is meant to be negated.
senior_data = df[
    ~(df['YearsCodePro'] == 'Less than 1 year')
    & ~df['DevType'].str.contains(r"data\s|\sdata$|\sdata\s", case=False, regex=True, na=False)
]

In [None]:
## List of possible values of 'DevType' (developer type); one entry per option
possible_vals = [
    'Academic researcher',
    'Data or business analyst',
    'Data scientist or machine learning specialist',
    'Database administrator',
    'Designer',
    'Developer, back-end',
    'Developer, desktop or enterprise applications',
    'Developer, embedded applications or devices',
    'Developer, front-end',
    'Developer, full-stack',
    # these two were previously fused into a single string by a missing comma
    'Developer, game or graphics',
    'Developer, mobile',
    'Developer, QA or test',
    'DevOps specialist',
    'Educator',
    'Engineer, data',
    'Engineer, site reliability',
    'Engineering manager',
    'Marketing or sales professional',
    'Product manager',
    'Scientist',
    'Senior Executive (C-Suite, VP, etc.)',
    'Student',
    'System administrator',
    'Other',
]

In [5]:

def total_count(df,col1,col2,lookfor):
    '''
    Sum the counts of every response option listed in ``lookfor``.

    Each entry of ``df[col1]`` may hold several options separated by a
    semi-colon (e.g. ``'C;Java'``). The entry is split on ';' and every
    resulting option that appears in ``lookfor`` is credited with that row's
    count from ``df[col2]``. Options are compared as whole tokens, so 'C' is
    not counted for a response that only contains 'C++'.

    Parameters:
        df(dataframe): Input dataframe with one row per raw response string.
        col1(str): Name of the column in ``df`` holding the raw response strings.
        col2(str): Name of the column in ``df`` holding the number of times that
        response string occurred.
        lookfor(list): The response options to tally.

    Returns: A dataframe with the columns ``[col1, col2]`` containing one row for
    each option of ``lookfor`` that occurred at least once, sorted by ``col2`` in
    descending order.

    '''

    from collections import defaultdict

    wanted = set(lookfor)
    totals = defaultdict(int)
    # Iterate over the values directly rather than looking rows up by 0..n-1,
    # so frames whose index is not a default RangeIndex work as well.
    for response, count in zip(df[col1], df[col2]):
        if not isinstance(response, str):
            # missing responses (NaN/None) carry no options
            continue
        # a set, so an option repeated inside one response is only credited once
        options = {part.strip() for part in response.split(';')}
        for option in options & wanted:
            totals[option] += int(count)

    # Keep the options in the order given by lookfor before sorting by count.
    found = [val for val in dict.fromkeys(lookfor) if val in totals]
    new_df = pd.DataFrame({col1: found, col2: [totals[val] for val in found]})
    # Sort on the caller-supplied count column, not a hard-coded 'Count'.
    return new_df.sort_values(col2, ascending=False)

In [6]:
## Print the docstring of total_count (the function itself is not called here)
print(total_count.__doc__)

TypeError: total_count() missing 4 required positional arguments: 'df', 'col1', 'col2', and 'lookfor'

In [None]:
## Count how often each raw DevType response string occurs among junior data professionals.
## rename_axis / reset_index(name=...) label both columns explicitly ('DevType', 'Count'),
## so the result does not depend on the default column labels produced by
## value_counts().reset_index(), which differ between pandas versions.
datatype = (
    junior_data['DevType']
    .value_counts()
    .rename_axis('DevType')
    .reset_index(name='Count')
)

In [None]:
## Tally, per developer type in possible_vals, the summed 'Count' of the responses mentioning it
datatype_df = total_count(df=datatype, col1='DevType', col2='Count', lookfor=possible_vals)

In [None]:
## Share (as a fraction between 0 and 1) of junior data professionals who selected each role.
## The denominator is read from junior_data itself instead of a hard-coded row count,
## so it stays correct when the data or the junior filter changes.
datatype_df['perc'] = datatype_df['Count']/len(junior_data)

### Who are these Junior Data Professionals?

In [None]:
import seaborn as sns

In [None]:
## Horizontal bar chart: one bar per developer type, bar length = Count
sns.barplot(data=datatype_df, x='Count', y='DevType')

#### The majority of them identify themselves as a Data Scientist or Machine Learning Specialist (about 54 percent), and about 51 percent as a Data or Business Analyst. About 2 percent of the respondents also identified themselves as a Marketing or Sales professional, which was the lowest share among the responses.

### Does the Junior Data Professional work longer on average compared to the total average number of working hours per week?

In [None]:
## Mean of the WorkWeekHrs column across the junior data professionals
junior_data['WorkWeekHrs'].mean()

In [None]:
## Mean of the WorkWeekHrs column across the comparison group senior_data
## NOTE(review): senior_data is built with a negated "data" DevType filter, so it holds
## non-junior, non-data respondents rather than senior data professionals -- confirm the intended group
senior_data['WorkWeekHrs'].mean()

On average, a Junior Data Professional works 3.25 hours less per week than the general average.

### What is the average salary of a Junior Data Professional?

In [None]:
## Box plot of the CompTotal column for junior data professionals, drawn on a tall (5 x 20) figure
## to inspect skew and outliers
junior_data.boxplot(column ='CompTotal',figsize=(5,20))

In [None]:
## Mean value of the Total Compensation (CompTotal) of junior data professionals
junior_data['CompTotal'].mean()

In [None]:
## Median value of the Total Compensation (CompTotal) of junior data professionals
junior_data['CompTotal'].median()

In [None]:
## Maximum value of the Total Compensation (CompTotal) of junior data professionals
junior_data['CompTotal'].max()

In [None]:
## Fill missing CompTotal values with the column median.
## assign() builds a new frame instead of writing through .loc into junior_data, which is
## a filtered selection of df; this avoids pandas' SettingWithCopyWarning and leaves df untouched.
## Re-running the cell is harmless: once filled, there is nothing left to replace.
junior_data = junior_data.assign(
    CompTotal=junior_data['CompTotal'].fillna(junior_data['CompTotal'].median())
)

In [None]:
## Keep only the rows whose CompTotal lies between the 15th and the 75th percentile
## (both bounds inclusive) to drop outliers in Total Compensation
junior_data_CompTotal_out = junior_data.loc[
    lambda frame: frame['CompTotal'].between(
        frame['CompTotal'].quantile(.15),
        frame['CompTotal'].quantile(.75),
    )
]

In [None]:
## Histogram showing the distribution of the total compensation variable
junior_data_CompTotal_out['CompTotal'].plot.hist()

In [None]:
## List of all possible values for the LanguageWorkedWith variable; one entry per language
possible_vals_tech = [
    'Assembly', 'Bash/Shell/PowerShell', 'C', 'C++', 'C#',
    # these four were previously fused into a single string by missing commas
    'Clojure', 'Dart', 'Elixir', 'Erlang',
    'F#', 'Go', 'HTML/CSS', 'Java', 'JavaScript', 'Kotlin', 'Objective-C', 'PHP',
    'Python', 'R', 'Ruby', 'Rust', 'Scala', 'SQL', 'Swift', 'TypeScript', 'VBA', 'WebAssembly',
]

In [None]:
## Count how often each raw LanguageWorkedWith response string occurs in the outlier-free subset.
## rename_axis / reset_index(name=...) label both columns explicitly ('skill', 'Count'),
## so the result does not depend on the default column labels produced by
## value_counts().reset_index(), which differ between pandas versions.
techtype = (
    junior_data_CompTotal_out['LanguageWorkedWith']
    .value_counts()
    .rename_axis('skill')
    .reset_index(name='Count')
)

In [None]:
## Tally, per language in possible_vals_tech, the summed 'Count' of the responses mentioning it
tech_df = total_count(df=techtype, col1='skill', col2='Count', lookfor=possible_vals_tech)

In [None]:
## Bar chart to represent the skill sets: one bar per language, bar length = Count.
## Both axes come from tech_df; new_df only exists as a local inside total_count and is
## not defined at notebook level.
sns.barplot(x= tech_df['Count'], y= tech_df['skill'])

In [None]:
## Subset of the outlier-free junior data with a CompTotal strictly above 35000
junior_data_CompTotal_out_high = junior_data_CompTotal_out.loc[
    junior_data_CompTotal_out['CompTotal'].gt(35000)
]

In [None]:
## Vertical bar chart of the language counts.
## Plots tech_df with its 'skill' column on the x axis: new_df is not defined at notebook
## level, and the tallied frame has no 'type' column.
tech_df.plot.bar(x='skill',y='Count')

In [None]:
## Median CompTotal within the outlier-free subset of junior data professionals
junior_data_CompTotal_out['CompTotal'].median()

In [None]:
## Tally the language counts from techtype.
## techtype stores the raw responses in its 'skill' column (it has no 'type' column),
## so 'skill' is the column total_count has to read.
techtype_df = total_count(techtype,'skill','Count',possible_vals_tech)

In [None]:
## Languages reported by respondents in the outlier-free junior subset whose CompTotal exceeds 60000.
## Uses junior_data_CompTotal_out: no frame named df_CompTotal_out is defined in this notebook.
junior_data_CompTotal_out.loc[junior_data_CompTotal_out['CompTotal']>60000, 'LanguageWorkedWith']

In [None]:
## Check the dtype of the CompTotal column
junior_data['CompTotal'].dtype

### How many years of coding experience do Junior Data Professionals have?

In [None]:
## Occurrences of each YearsCode value divided by the number of rows in the outlier-free subset
junior_data_CompTotal_out['YearsCode'].value_counts().div(len(junior_data_CompTotal_out))

In [None]:
## Sum of six hard-coded shares -- presumably copied from the YearsCode shares output of the
## previous cell; TODO confirm which YearsCode values they belong to and recompute after re-running
0.165179+0.120536+0.120536+0.107143+0.102679+0.093750

#### About 71 percent of the respondents had 1-6 years of coding experience