# Pandas use a list of values to select rows from a column

* filter pandas rows by exact match from a list
* filter pandas rows by partial match from a list

Bonus

* execute value counts on multiple columns
* vectorized operations

> Vectorization is the process of executing operations on entire arrays. 

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)

In [2]:
# read the data frame and see the data insight
df = pd.read_csv("../csv/stackoverflow/developer_survey_2018/survey_results_public.csv", low_memory=False)
print(df.shape)

(98855, 129)


In [3]:
df.head(2)

Unnamed: 0,Respondent,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,...,Exercise,Gender,SexualOrientation,EducationParents,RaceEthnicity,Age,Dependents,MilitaryUS,SurveyTooLong,SurveyEasy
0,1,Yes,No,Kenya,No,Employed part-time,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Mathematics or statistics,20 to 99 employees,Full-stack developer,...,3 - 4 times per week,Male,Straight or heterosexual,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Black or of African descent,25 - 34 years old,Yes,,The survey was an appropriate length,Very easy
1,3,Yes,Yes,United Kingdom,No,Employed full-time,"Bachelor’s degree (BA, BS, B.Eng., etc.)","A natural science (ex. biology, chemistry, physics)","10,000 or more employees",Database administrator;DevOps specialist;Full-stack developer;System administrator,...,Daily or almost every day,Male,Straight or heterosexual,"Bachelor’s degree (BA, BS, B.Eng., etc.)",White or of European descent,35 - 44 years old,Yes,,The survey was an appropriate length,Somewhat easy


In [4]:
df.UndergradMajor.value_counts()

Computer science, computer engineering, or software engineering          50336
Another engineering discipline (ex. civil, electrical, mechanical)       6945 
Information systems, information technology, or system administration    6507 
A natural science (ex. biology, chemistry, physics)                      3050 
Mathematics or statistics                                                2818 
Web development or web design                                            2418 
A business discipline (ex. accounting, finance, marketing)               1921 
A humanities discipline (ex. literature, history, philosophy)            1590 
A social science (ex. anthropology, psychology, political science)       1377 
Fine arts or performing arts (ex. graphic design, music, studio art)     1135 
I never declared a major                                                 693  
A health science (ex. nursing, pharmacy, radiology)                      246  
Name: UndergradMajor, dtype: int64

In [5]:
df[df['UndergradMajor'].isin(['Mathematics or statistics', 
                              'Web development or web design'])].head()

Unnamed: 0,Respondent,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,...,Exercise,Gender,SexualOrientation,EducationParents,RaceEthnicity,Age,Dependents,MilitaryUS,SurveyTooLong,SurveyEasy
0,1,Yes,No,Kenya,No,Employed part-time,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Mathematics or statistics,20 to 99 employees,Full-stack developer,...,3 - 4 times per week,Male,Straight or heterosexual,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Black or of African descent,25 - 34 years old,Yes,,The survey was an appropriate length,Very easy
32,51,Yes,No,United States,No,Employed full-time,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,500 to 999 employees,Back-end developer;Designer;Front-end developer;Full-stack developer;Marketing or sales professional;Mobile developer,...,Daily or almost every day,Female,Straight or heterosexual,Associate degree,White or of European descent,18 - 24 years old,No,No,The survey was an appropriate length,Very easy
82,124,Yes,Yes,United Kingdom,No,Employed full-time,"Master’s degree (MA, MS, M.Eng., MBA, etc.)",Mathematics or statistics,"10,000 or more employees",Back-end developer;DevOps specialist;Front-end developer;Full-stack developer;Mobile developer,...,1 - 2 times per week,Male,Straight or heterosexual,"Bachelor’s degree (BA, BS, B.Eng., etc.)",White or of European descent,25 - 34 years old,Yes,,The survey was an appropriate length,Very easy
84,126,Yes,Yes,Argentina,"Yes, part-time",Employed full-time,Some college/university study without earning a degree,Web development or web design,Fewer than 10 employees,Mobile developer,...,1 - 2 times per week,Male,Straight or heterosexual,Some college/university study without earning a degree,,25 - 34 years old,No,,The survey was an appropriate length,Very easy
148,230,Yes,Yes,United States,No,Employed full-time,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Mathematics or statistics,"1,000 to 4,999 employees",Data scientist or machine learning specialist,...,,,,,,,,,,


In [7]:
area_list = ['biology',  'physics', 'Computer', 'enginnering', 'pharmacy', 'psychology', 'graphic design',
             'music', 'art', 'studio art', 'accounting', 'finance', 'chemistry',]

In [8]:
import re
area_df = pd.DataFrame(dict((area, df.UndergradMajor.str.contains(area))
                             for area in area_list))
area_df.head(30)

Unnamed: 0,biology,physics,Computer,enginnering,pharmacy,psychology,graphic design,music,art,studio art,accounting,finance,chemistry
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,True,True,False,False,False,False,False,False,False,False,False,False,True
2,False,False,True,False,False,False,False,False,False,False,False,False,False
3,False,False,True,False,False,False,False,False,False,False,False,False,False
4,False,False,True,False,False,False,False,False,False,False,False,False,False
5,False,False,True,False,False,False,False,False,False,False,False,False,False
6,False,False,True,False,False,False,False,False,False,False,False,False,False
7,False,False,True,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,True,True,True,True,False,False,False
9,False,False,True,False,False,False,False,False,False,False,False,False,False


In [9]:
df.DevType.value_counts().head()

Back-end developer                                             6417
Full-stack developer                                           6104
Back-end developer;Front-end developer;Full-stack developer    4460
Mobile developer                                               3518
Student                                                        3222
Name: DevType, dtype: int64

In [10]:
dev_list = ['Mobile', 'Data', 'QA']

In [11]:
import re
dev_df = pd.DataFrame(dict((dev, df.DevType.str.contains(dev, re.IGNORECASE))
                             for dev in dev_list))
dev_df.head(20).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Mobile,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
Data,False,True,False,False,True,True,False,False,True,False,True,False,False,False,False,False,False,False,True,False
QA,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True


In [12]:
dev_df.QA.value_counts()

False    85904
True     6194 
Name: QA, dtype: int64

In [13]:
dev_df.apply(pd.Series.value_counts)

Unnamed: 0,Mobile,Data,QA
False,73294,70209,85904
True,18804,21889,6194


In [14]:
dev_df[['Mobile','QA']].apply(pd.Series.value_counts)

Unnamed: 0,Mobile,QA
False,73294,85904
True,18804,6194
