In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Open the file `so_2021_survey_results.csv`, and read it into a data frame
filename = '../data/so_2021_survey_results.csv'
df = pd.read_csv(filename, usecols=['LanguageHaveWorkedWith', 
                                    'LanguageWantToWorkWith',
                                    'Country', 'CompTotal'])
df.head()

Unnamed: 0,Country,CompTotal,LanguageHaveWorkedWith,LanguageWantToWorkWith
0,Slovakia,4800.0,C++;HTML/CSS;JavaScript;Objective-C;PHP;Swift,Swift
1,Netherlands,,JavaScript;Python,
2,Russian Federation,,Assembly;C;Python;R;Rust,Julia;Python;Rust
3,Austria,,JavaScript;TypeScript,JavaScript;TypeScript
4,United Kingdom of Great Britain and Northern I...,,Bash/Shell;HTML/CSS;Python;SQL,Bash/Shell;HTML/CSS;Python;SQL


In [3]:
# What are the different programming languages that developers currently use?
(
    df['LanguageHaveWorkedWith']
    .str.split(';')
    .explode()
    .value_counts()
    .index
)

Index(['JavaScript', 'HTML/CSS', 'Python', 'SQL', 'Java', 'Node.js',
       'TypeScript', 'C#', 'Bash/Shell', 'C++', 'PHP', 'C', 'PowerShell', 'Go',
       'Kotlin', 'Rust', 'Ruby', 'Dart', 'Assembly', 'Swift', 'R', 'VBA',
       'Matlab', 'Groovy', 'Objective-C', 'Scala', 'Perl', 'Haskell', 'Delphi',
       'Clojure', 'Elixir', 'LISP', 'Julia', 'F#', 'Erlang', 'APL', 'Crystal',
       'COBOL'],
      dtype='object', name='LanguageHaveWorkedWith')

In [4]:
# What are the 10 programming languages most commonly used today?
have_worked_with = (
    df['LanguageHaveWorkedWith']
    .str.split(';')
    .explode()
    .value_counts()
    .head(10)
    .index
)

have_worked_with

Index(['JavaScript', 'HTML/CSS', 'Python', 'SQL', 'Java', 'Node.js',
       'TypeScript', 'C#', 'Bash/Shell', 'C++'],
      dtype='object', name='LanguageHaveWorkedWith')

In [5]:
# What are the 10 programming languages people most want to use?
want_to_work_with = (
    df['LanguageWantToWorkWith']
    .str.split(';')
    .explode()
    .value_counts()
    .head(10)
    .index
)
want_to_work_with

Index(['JavaScript', 'Python', 'HTML/CSS', 'TypeScript', 'SQL', 'Node.js',
       'C#', 'Java', 'Rust', 'Go'],
      dtype='object', name='LanguageWantToWorkWith')

In [6]:
# What languages are on both top-10 lists?

want_to_work_with.intersection(have_worked_with)

Index(['JavaScript', 'Python', 'HTML/CSS', 'TypeScript', 'SQL', 'Node.js',
       'C#', 'Java'],
      dtype='object')

In [7]:
# What languages in the top 10 have people worked with, but *don't* want to work with in the future?
(
    have_worked_with
    [~have_worked_with.isin(want_to_work_with)]
)


Index(['Bash/Shell', 'C++'], dtype='object', name='LanguageHaveWorkedWith')

In [8]:
# What is the most popular (current) language used by people in each country?

all_languages = (
    df
    ['LanguageHaveWorkedWith']
    .str.split(';')
    .explode()
)

(
    df[['Country']]
    .join(all_languages)
    .groupby('Country')
    .agg(pd.Series.mode)
)


Unnamed: 0_level_0,LanguageHaveWorkedWith
Country,Unnamed: 1_level_1
Afghanistan,JavaScript
Albania,JavaScript
Algeria,JavaScript
Andorra,JavaScript
Angola,"[HTML/CSS, JavaScript]"
...,...
"Venezuela, Bolivarian Republic of...",JavaScript
Viet Nam,JavaScript
Yemen,"[C#, HTML/CSS]"
Zambia,HTML/CSS


In [9]:
# What is the mean number of languages used in the last year?
df['LanguageHaveWorkedWith'].str.split(';').str.len().mean()

5.373678011583714

In [10]:
# What is the greatest number of languages people listed as having used in the last year?
df['LanguageHaveWorkedWith'].str.split(';').str.len().max()

38.0

In [11]:
# How many people chose that largest number?

(
    df
    .loc[df['LanguageHaveWorkedWith']
         .str.split(';')
         .str.len() == 38,
        'LanguageHaveWorkedWith']
    .count()
)

32

In [12]:
# How many people in the survey claim salaries of $2m or above?

(
    df
    .loc[df['CompTotal'] >= 2_000_000]
    ['CompTotal']
    .count()
)

2369

In [13]:
# Remove rows in which salaries are >= $2m
df = (
    df
    .loc[df['CompTotal'] < 2_000_000]
)

In [14]:
# Turn the 'LanguageHaveWorkedWith' column into "dummy" columns in df, such that
# each language is its own column.

df = (
    pd.concat([df, 
               df['LanguageHaveWorkedWith']
               .str.get_dummies(sep=';')],
              axis='columns')
)

df

Unnamed: 0,Country,CompTotal,LanguageHaveWorkedWith,LanguageWantToWorkWith,APL,Assembly,Bash/Shell,C,C#,C++,...,PowerShell,Python,R,Ruby,Rust,SQL,Scala,Swift,TypeScript,VBA
0,Slovakia,4800.0,C++;HTML/CSS;JavaScript;Objective-C;PHP;Swift,Swift,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
9,Sweden,42000.0,C++;Python,Haskell;Python,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
11,Spain,43000.0,Bash/Shell;HTML/CSS;JavaScript;Node.js;SQL;Typ...,C++;Clojure;JavaScript;Node.js;Rust;SQL;TypeSc...,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
12,Germany,71500.0,C;C++;Java;Perl;Ruby,Rust,0,0,0,1,0,1,...,0,0,0,1,0,0,0,0,0,0
16,Turkey,9000.0,C#;HTML/CSS;Java;JavaScript;Node.js,C#;Java;JavaScript;Node.js,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83434,United States of America,160500.0,Clojure;Kotlin;SQL,Clojure,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
83435,Benin,200000.0,,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
83436,United States of America,1800.0,Groovy;Java;Python,Java;Python,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
83437,Canada,90000.0,Bash/Shell;JavaScript;Node.js;Python,Go;Rust,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [15]:
# If you want to maximize your salary, and have to choose two languages from
# Python, JavaScript, and Java, then what combination would be best?

# Python + JavaScript, not Java
df['CompTotal'][(df['Python'] == 1) &
                (df['JavaScript'] == 1) &
                (df['Java'] == 0)].mean()

126817.99470235605

In [16]:
# Python + Java, not JavaScript
df['CompTotal'][(df['Python'] == 1) &
                (df['JavaScript'] == 0) &
                (df['Java'] == 1)].mean()

162737.10379596677

In [17]:
# Java and Javascript, not Python
df['CompTotal'][(df['Python'] == 0) &
                (df['JavaScript'] == 1) &
                (df['Java'] == 1)].mean()

140867.65981559738

In [19]:
s = Series('th|is i|s a bu|nch o|f te|xt'.split())
s

0     th|is
1       i|s
2         a
3    bu|nch
4       o|f
5     te|xt
dtype: object

In [21]:
s.str.split('|').len()

AttributeError: 'Series' object has no attribute 'len'