In [1]:
import re

# NOTE(review): this binds the top-level matplotlib package to `plt`;
# `matplotlib.pyplot` was probably intended — confirm before calling plt.plot() etc.
import matplotlib as plt
import numpy as np
import pandas as pd

In [2]:
# Raise pandas' display limits so long question text and long listings are
# truncated less aggressively when inspected in the notebook.
# Each option is set exactly once (max_rows was previously set to 500 and then
# immediately overridden to 1000).
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_seq_items', 2000)

In [None]:
# TODO: get more years of survey data, then look at how the data roles have evolved over time


In [3]:
# Load the schema (column name -> survey question text) and the raw survey
# responses. Both paths are relative to the notebook's working directory.
schema = pd.read_csv('survey_results_schema.csv')
df = pd.read_csv('survey_results_public.csv')

In [4]:
# List every column name in the survey responses to see which fields are available.
df.columns

Index(['Respondent', 'MainBranch', 'Hobbyist', 'Age', 'Age1stCode', 'CompFreq',
       'CompTotal', 'ConvertedComp', 'Country', 'CurrencyDesc',
       'CurrencySymbol', 'DatabaseDesireNextYear', 'DatabaseWorkedWith',
       'DevType', 'EdLevel', 'Employment', 'Ethnicity', 'Gender', 'JobFactors',
       'JobSat', 'JobSeek', 'LanguageDesireNextYear', 'LanguageWorkedWith',
       'MiscTechDesireNextYear', 'MiscTechWorkedWith',
       'NEWCollabToolsDesireNextYear', 'NEWCollabToolsWorkedWith', 'NEWDevOps',
       'NEWDevOpsImpt', 'NEWEdImpt', 'NEWJobHunt', 'NEWJobHuntResearch',
       'NEWLearn', 'NEWOffTopic', 'NEWOnboardGood', 'NEWOtherComms',
       'NEWOvertime', 'NEWPurchaseResearch', 'NEWPurpleLink', 'NEWSOSites',
       'NEWStuck', 'OpSys', 'OrgSize', 'PlatformDesireNextYear',
       'PlatformWorkedWith', 'PurchaseWhat', 'Sexuality', 'SOAccount',
       'SOComm', 'SOPartFreq', 'SOVisitFreq', 'SurveyEase', 'SurveyLength',
       'Trans', 'UndergradMajor', 'WebframeDesireNextYear',
  

In [5]:
# Display the schema table, which pairs each column name with the text of the
# survey question behind it.
schema

Unnamed: 0,Column,QuestionText
0,Respondent,Randomized respondent ID number (not in order of survey response time)
1,MainBranch,"Which of the following options best describes you today? Here, by ""developer"" we mean ""someone who writes code."""
2,Hobbyist,Do you code as a hobby?
3,Age,"What is your age (in years)? If you prefer not to answer, you may leave this question blank."
4,Age1stCode,"At what age did you write your first line of code or program? (e.g., webpage, Hello World, Scratch project)"
5,CompFreq,"Is that compensation weekly, monthly, or yearly?"
6,CompTotal,"What is your current total compensation (salary, bonuses, and perks, before taxes and deductions), in `CurrencySymbol`? Please enter a whole number in the box below, without any punctuation. If you are paid hourly, please estimate an equivalent weekly, monthly, or yearly salary. If you prefer not to answer, please leave the box empty."
7,ConvertedComp,"Salary converted to annual USD salaries using the exchange rate on 2020-02-19, assuming 12 working months and 50 working weeks."
8,Country,Where do you live?
9,CurrencyDesc,"Which currency do you use day-to-day? If your answer is complicated, please pick the one you're most comfortable estimating in."


In [6]:
# Look up the survey question behind the DatabaseDesireNextYear column.
schema[schema['Column'].eq('DatabaseDesireNextYear')]

Unnamed: 0,Column,QuestionText
11,DatabaseDesireNextYear,"Which database environments have you done extensive development work in over the past year, and which do you want to work in over the next year? (If you both worked with the database and want to continue to do so, please check both boxes in that row.)"


In [24]:
# Count how often each distinct DatabaseDesireNextYear answer occurs.
# NOTE(review): answers appear to be multi-select values joined with ';', so
# every distinct combination is tallied separately (3193 distinct values in the
# saved output). Consider .str.split(';').explode() before counting if
# per-database totals are wanted.
df['DatabaseDesireNextYear'].value_counts()

PostgreSQL                                              2310
Microsoft SQL Server                                    1947
MySQL                                                   1823
MongoDB                                                 1624
SQLite                                                   966
                                                        ... 
Couchbase;Elasticsearch;Firebase;MongoDB;SQLite            1
DynamoDB;Elasticsearch;Oracle;SQLite                       1
DynamoDB;Elasticsearch;MongoDB;Oracle;PostgreSQL           1
Cassandra;DynamoDB;Firebase;MongoDB;MySQL;Redis            1
Cassandra;Elasticsearch;MariaDB;Microsoft SQL Server       1
Name: DatabaseDesireNextYear, Length: 3193, dtype: int64

In [7]:
# Collect the distinct DevType answer strings as a plain list, ordered as
# value_counts() returns them. Each entry is a respondent's full answer, which
# may be several roles joined with ';'.
dev_types = list(df['DevType'].value_counts().index)

In [8]:
# DevType labels for the data-focused roles this analysis currently targets.
# Open question: include database administrators as well? Deferred for now.
data_roles = [
    'Data scientist or machine learning specialist',  # DS / ML
    'Data or business analyst',                       # analysts
    'Engineer, data',                                 # data engineers
]

In [26]:
# Show the distinct DevType answer strings collected above, to check the exact
# spelling of the role labels.
dev_types

['Developer, full-stack',
 'Developer, back-end',
 'Developer, back-end;Developer, front-end;Developer, full-stack',
 'Developer, back-end;Developer, full-stack',
 'Developer, front-end',
 'Developer, mobile',
 'Developer, front-end;Developer, full-stack',
 'Developer, back-end;Developer, desktop or enterprise applications;Developer, front-end;Developer, full-stack',
 'Developer, back-end;Developer, desktop or enterprise applications',
 'Developer, desktop or enterprise applications',
 'Developer, back-end;Developer, front-end;Developer, full-stack;Developer, mobile',
 'Developer, back-end;Developer, desktop or enterprise applications;Developer, full-stack',
 'Developer, embedded applications or devices',
 'Data scientist or machine learning specialist',
 'Developer, full-stack;Developer, mobile',
 'Developer, back-end;Developer, front-end',
 'Developer, desktop or enterprise applications;Developer, full-stack',
 'Data or business analyst',
 'Developer, QA or test',
 'Developer, front-

In [7]:
# Keep every respondent whose DevType answer mentions at least one data role.
# DevType holds ';'-joined multi-select answers, so a single vectorised
# substring match over all role labels is used (no per-role loop/concat needed).
# The labels are escaped so they are always matched literally, never
# interpreted as regex syntax. Respondents with no DevType answer (NaN) are
# excluded via na=False.
role_pattern = '|'.join(re.escape(role) for role in data_roles)
is_data_role = df['DevType'].str.contains(role_pattern, na=False)
ds_df = df.loc[is_data_role]
print(len(ds_df))

8726


In [8]:
# Tally the DevType answer strings among the data-role respondents. A
# respondent who selected several roles is counted under the full combined
# string (e.g. 'Developer, back-end;Engineer, data'), not once per role.
ds_df['DevType'].value_counts()

Data scientist or machine learning specialist                                                                                                                                                                                                                                           369
Data or business analyst                                                                                                                                                                                                                                                                285
Developer, back-end;Engineer, data                                                                                                                                                                                                                                                      227
Engineer, data                                                                                                                                      

## Columns to Investigate

- Age
- ConvertedComp
- Country
- DatabaseWorkedWith
- EdLevel
- Employment (as a filter)
- JobSat
- LanguageWorkedWith
- LanguageDesireNextYear
- MiscTechWorkedWith
- NEWLearn
- NEWOvertime
- NEWStuck
- OpSys
- OrgSize
- UndergradMajor
- WorkWeekHrs
- YearsCode
- YearsCodePro

In [19]:
# Narrow the data-role respondents down to the columns under investigation.
# NOTE: this rebinds `df`, which until now held the full survey. Re-run the
# loading cell before re-running any earlier cell that needs a column not
# listed here (e.g. DatabaseDesireNextYear).
# .copy() makes the result an independent frame, so later column assignments
# do not raise SettingWithCopyWarning and cannot affect ds_df.
df = ds_df[[
    'DevType',
    'Age',
    'CompTotal',
    'ConvertedComp',
    'CompFreq',
    'Country',
    'DatabaseWorkedWith',
    'EdLevel',
    'Employment',
    'JobSat',
    'LanguageWorkedWith',
    'LanguageDesireNextYear',
    'MiscTechWorkedWith',
    'NEWLearn',
    'NEWOvertime',
    'NEWStuck',
    'OpSys',
    'OrgSize',
    'UndergradMajor',
    'WorkWeekHrs',
    'YearsCode',
    'YearsCodePro'
]].copy()

In [17]:
# Preview the first rows of the trimmed frame.
# NOTE(review): the saved output still shows a `CompTotal.1` column where the
# current column selection has `CompFreq`, so it looks like it was produced
# before the selection was last edited — re-run this cell to refresh it.
df.head()

Unnamed: 0,DevType,Age,CompTotal,ConvertedComp,CompTotal.1,Country,DatabaseWorkedWith,EdLevel,Employment,JobSat,...,MiscTechWorkedWith,NEWLearn,NEWOvertime,NEWStuck,OpSys,OrgSize,UndergradMajor,WorkWeekHrs,YearsCode,YearsCodePro
21,"Developer, full-stack;Engineer, data",,,,,India,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time,Slightly dissatisfied,...,,Every few months,Often: 1-2 days per week or more,,Windows,500 to 999 employees,"Computer science, computer engineering, or software engineering",50.0,10,2
24,"Developer, back-end;Developer, full-stack;DevOps specialist;Engineer, data;System administrator",,,,,Portugal,Oracle,"Associate degree (A.A., A.S., etc.)",Employed full-time,Neither satisfied nor dissatisfied,...,,Once every few years,Often: 1-2 days per week or more,Visit Stack Overflow,Windows,100 to 499 employees,"Computer science, computer engineering, or software engineering",40.0,23,18
29,"Data or business analyst;Database administrator;Developer, back-end;Developer, front-end;Developer, full-stack;System administrator",,30000.0,38778.0,30000.0,United Kingdom,Microsoft SQL Server;Redis;SQLite,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time,Slightly dissatisfied,...,Node.js,,Occasionally: 1-2 days per quarter but less than monthly,Play games;Call a coworker or friend;Visit Stack Overflow;Watch help / tutorial videos,Windows,,"Information systems, information technology, or system administration",37.0,4,2
35,Data or business analyst;Data scientist or machine learning specialist,34.0,60000.0,77556.0,60000.0,United Kingdom,IBM DB2;Microsoft SQL Server;MongoDB;SQLite,Some college/university study without earning a degree,Employed full-time,Slightly satisfied,...,Node.js;Pandas,Every few months,Sometimes: 1-2 days per month but less than weekly,Visit Stack Overflow;Go for a walk or other physical activity;Watch help / tutorial videos;Do other work and come back later,Windows,"1,000 to 4,999 employees","Computer science, computer engineering, or software engineering",40.0,4,3
43,"Data or business analyst;Developer, back-end;Product manager",32.0,244000.0,55893.0,244000.0,Brazil,Microsoft SQL Server,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Employed full-time,Neither satisfied nor dissatisfied,...,,Once every few years,Often: 1-2 days per week or more,Call a coworker or friend;Visit Stack Overflow;Go for a walk or other physical activity;Watch help / tutorial videos,Windows,10 to 19 employees,"Another engineering discipline (such as civil, electrical, mechanical, etc.)",45.0,10,6


In [13]:
# Print the largest ConvertedComp value among the data-role respondents.
print(df['ConvertedComp'].max())

2000000.0


In [20]:
# Inspect the respondents whose ConvertedComp equals 2,000,000 (the maximum
# printed in the previous cell).
# NOTE(review): the saved rows mostly pair a CompTotal of roughly 70k-120k with
# a Weekly/Monthly CompFreq, which suggests annual figures entered under the
# wrong frequency and a cap at 2,000,000 — confirm before treating them as real.
df.loc[df['ConvertedComp'].eq(2000000.0)]

Unnamed: 0,DevType,Age,CompTotal,ConvertedComp,CompFreq,Country,DatabaseWorkedWith,EdLevel,Employment,JobSat,...,MiscTechWorkedWith,NEWLearn,NEWOvertime,NEWStuck,OpSys,OrgSize,UndergradMajor,WorkWeekHrs,YearsCode,YearsCodePro
123,Data scientist or machine learning specialist;Scientist,41.0,200000.0,2000000.0,Monthly,United States,PostgreSQL,"Other doctoral degree (Ph.D., Ed.D., etc.)",Employed full-time,Very satisfied,...,Keras;Pandas;TensorFlow,Once every few years,Occasionally: 1-2 days per quarter but less than monthly,Visit Stack Overflow;Go for a walk or other physical activity;Watch help / tutorial videos;Visit another developer community (please name):,Windows,"5,000 to 9,999 employees",,40.0,11,11
722,"Developer, back-end;Engineer, data;System administrator",,103000.0,2000000.0,Weekly,United States,MongoDB,Some college/university study without earning a degree,Employed full-time,Very dissatisfied,...,Node.js,Once every few years,Often: 1-2 days per week or more,Visit Stack Overflow;Go for a walk or other physical activity;Watch help / tutorial videos,MacOS,100 to 499 employees,"Computer science, computer engineering, or software engineering",40.0,3,3
816,Data or business analyst,40.0,70000.0,2000000.0,Weekly,United States,IBM DB2;MySQL;PostgreSQL;SQLite,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time,Very dissatisfied,...,,Once a year,Occasionally: 1-2 days per quarter but less than monthly,Play games;Call a coworker or friend;Visit Stack Overflow;Panic;Watch help / tutorial videos;Do other work and come back later,Linux-based,100 to 499 employees,"A social science (such as anthropology, psychology, political science, etc.)",40.0,25,2
1581,"Data or business analyst;Data scientist or machine learning specialist;Engineer, data;Marketing or sales professional",59.0,94000.0,2000000.0,Weekly,United States,Microsoft SQL Server,,Employed full-time,Slightly dissatisfied,...,,Once every few years,Occasionally: 1-2 days per quarter but less than monthly,Visit Stack Overflow;Watch help / tutorial videos,Windows,500 to 999 employees,,,14,14
1616,"Data or business analyst;Database administrator;Designer;Developer, back-end;Developer, desktop or enterprise applications;Developer, front-end;Developer, full-stack;System administrator",48.0,72000.0,2000000.0,Weekly,United States,MySQL;PostgreSQL,Some college/university study without earning a degree,Employed full-time,Slightly satisfied,...,,Once every few years,Sometimes: 1-2 days per month but less than weekly,Visit Stack Overflow;Watch help / tutorial videos,Linux-based,10 to 19 employees,"Another engineering discipline (such as civil, electrical, mechanical, etc.)",40.0,40,25
2556,"Data or business analyst;Developer, full-stack",40.0,117000.0,2000000.0,Weekly,United States,Microsoft SQL Server;Oracle,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time,Very satisfied,...,Pandas,Once a year,Sometimes: 1-2 days per month but less than weekly,Call a coworker or friend;Visit Stack Overflow;Go for a walk or other physical activity;Do other work and come back later,Windows,"5,000 to 9,999 employees",Mathematics or statistics,40.0,20,13
2670,Data or business analyst;Data scientist or machine learning specialist,23.0,100000.0,2000000.0,Weekly,United States,MySQL;PostgreSQL,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time,Slightly dissatisfied,...,Keras,Once every few years,Occasionally: 1-2 days per quarter but less than monthly,Call a coworker or friend;Visit Stack Overflow,MacOS,"10,000 or more employees","Computer science, computer engineering, or software engineering",40.0,6,2
2761,Data or business analyst;Data scientist or machine learning specialist;DevOps specialist;Scientist;System administrator,45.0,115000.0,2000000.0,Weekly,United States,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time,Very satisfied,...,,Once every few years,Sometimes: 1-2 days per month but less than weekly,Meditate;Call a coworker or friend;Visit Stack Overflow;Go for a walk or other physical activity;Panic;Watch help / tutorial videos;Do other work and come back later,Windows,"10,000 or more employees","A natural science (such as biology, chemistry, physics, etc.)",50.0,25,23
3058,"Data scientist or machine learning specialist;Developer, full-stack;DevOps specialist;Engineer, site reliability;System administrator",32.0,102000.0,2000000.0,Weekly,United States,Microsoft SQL Server;Redis,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Employed full-time,Very satisfied,...,.NET Core;Pandas,Once a year,Sometimes: 1-2 days per month but less than weekly,Meditate;Play games;Call a coworker or friend;Visit Stack Overflow;Go for a walk or other physical activity;Panic;Watch help / tutorial videos;Do other work and come back later,Windows,"10,000 or more employees","Computer science, computer engineering, or software engineering",40.0,10,7
3170,"Data scientist or machine learning specialist;Database administrator;Developer, back-end;Developer, desktop or enterprise applications;Developer, front-end;Developer, full-stack;Developer, mobile;DevOps specialist;Product manager",25.0,78000.0,2000000.0,Weekly,United States,Microsoft SQL Server;Oracle;PostgreSQL;SQLite,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time,Very satisfied,...,.NET;.NET Core;Node.js;Unity 3D;Xamarin,Once a year,Occasionally: 1-2 days per quarter but less than monthly,Play games;Call a coworker or friend;Visit Stack Overflow;Go for a walk or other physical activity;Watch help / tutorial videos;Do other work and come back later,Windows,"10,000 or more employees","Computer science, computer engineering, or software engineering",40.0,7,3
