## Grouping, Aggregating

In [2]:
import pandas as pd

In [11]:
df = pd.read_csv('survey_results_public.csv')
df_schema = pd.read_csv('survey_results_schema.csv', index_col='Column')

In [12]:
df_schema.head()

Unnamed: 0_level_0,QuestionText
Column,Unnamed: 1_level_1
Respondent,Randomized respondent ID number (not in order ...
MainBranch,Which of the following options best describes ...
Hobbyist,Do you code as a hobby?
OpenSourcer,How often do you contribute to open source?
OpenSource,How do you feel about the quality of open sour...


In [13]:
df.shape

(88883, 85)

In [21]:
sorted(df.isnull().sum(), reverse=True)

[40708,
 39093,
 33060,
 32938,
 29297,
 28718,
 27775,
 27726,
 27724,
 27651,
 26854,
 26215,
 25939,
 25615,
 24380,
 24372,
 23861,
 21728,
 20742,
 20505,
 19969,
 19736,
 19323,
 18828,
 18599,
 18493,
 17895,
 17539,
 17491,
 17491,
 17104,
 17092,
 16036,
 14552,
 14191,
 13269,
 12857,
 12736,
 12215,
 11440,
 11006,
 9673,
 9512,
 9029,
 8397,
 8328,
 8169,
 7548,
 5824,
 5276,
 5006,
 4795,
 4623,
 4446,
 3517,
 3477,
 3028,
 2614,
 2539,
 2493,
 2220,
 2041,
 1899,
 1869,
 1802,
 1742,
 1702,
 1578,
 1566,
 1314,
 1249,
 1067,
 1055,
 1042,
 1032,
 945,
 817,
 797,
 752,
 620,
 552,
 132,
 0,
 0,
 0]

In [22]:
df.count()

Respondent      88883
MainBranch      88331
Hobbyist        88883
OpenSourcer     88883
OpenSource      86842
                ...  
Sexuality       76147
Ethnicity       76668
Dependents      83059
SurveyLength    86984
SurveyEase      87081
Length: 85, dtype: int64

In [23]:
df['ConvertedComp'].mean()

127110.73842323056

In [24]:
df['ConvertedComp'].median()

57287.0

In [25]:
df.Hobbyist.value_counts()

Yes    71257
No     17626
Name: Hobbyist, dtype: int64

In [26]:
df.SocialMedia

0          Twitter
1        Instagram
2           Reddit
3           Reddit
4         Facebook
           ...    
88878      YouTube
88879          NaN
88880          NaN
88881          NaN
88882     WhatsApp
Name: SocialMedia, Length: 88883, dtype: object

In [27]:
df.loc[:, 'SocialMedia']

0          Twitter
1        Instagram
2           Reddit
3           Reddit
4         Facebook
           ...    
88878      YouTube
88879          NaN
88880          NaN
88881          NaN
88882     WhatsApp
Name: SocialMedia, Length: 88883, dtype: object

In [29]:
df.loc[df.SocialMedia == 'Twitter', 'Respondent'].count()

11398

In [30]:
df_schema.loc['Hobbyist']

QuestionText    Do you code as a hobby?
Name: Hobbyist, dtype: object

In [31]:
df.SocialMedia.value_counts()

Reddit                      14374
YouTube                     13830
WhatsApp                    13347
Facebook                    13178
Twitter                     11398
Instagram                    6261
I don't use social media     5554
LinkedIn                     4501
WeChat 微信                     667
Snapchat                      628
VK ВКонта́кте                 603
Weibo 新浪微博                     56
Youku Tudou 优酷                 21
Hello                          19
Name: SocialMedia, dtype: int64

In [32]:
df.SocialMedia.value_counts(normalize=True)

Reddit                      0.170233
YouTube                     0.163791
WhatsApp                    0.158071
Facebook                    0.156069
Twitter                     0.134988
Instagram                   0.074150
I don't use social media    0.065777
LinkedIn                    0.053306
WeChat 微信                   0.007899
Snapchat                    0.007437
VK ВКонта́кте               0.007141
Weibo 新浪微博                  0.000663
Youku Tudou 优酷              0.000249
Hello                       0.000225
Name: SocialMedia, dtype: float64

In [33]:
x = df.SocialMedia.value_counts(normalize=True)

In [35]:
x.shape

(14,)

In [42]:
import numpy as np
c = np.random.randint(0,100,[14,])
c = pd.Series(c)

In [43]:
pd.concat([x, c], axis='columns')

Unnamed: 0,SocialMedia,0
0,,28.0
1,,25.0
2,,97.0
3,,79.0
4,,63.0
5,,7.0
6,,76.0
7,,35.0
8,,52.0
9,,14.0


In [44]:
df['Country'].value_counts()

United States       20949
India                9061
Germany              5866
United Kingdom       5737
Canada               3395
                    ...  
Chad                    1
Timor-Leste             1
Tonga                   1
Papua New Guinea        1
Dominica                1
Name: Country, Length: 179, dtype: int64

#### grouping has three steps to it --- Splitting Object, Applying Function, Combining Results

In [45]:
df.groupby(['Country'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7efcec397bb0>

In [49]:
df.groupby(['Country']) # creates a groupby object -- can be considered a group of splitted dataframes - This is step 1 -- splitting the object

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7efcf6dcfeb0>

In [50]:
country_grp = df.groupby(['Country'])

In [53]:
country_grp.get_group('United States').head(3) #here we access the united stated mini df from the group of dfs

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
3,4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
12,13,I am a developer by profession,Yes,Less than once a month but more than once per ...,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,United States,No,"Master’s degree (MA, MS, M.Eng., MBA, etc.)","Computer science, computer engineering, or sof...",...,Somewhat more welcome now than last year,Tech articles written by other developers;Cour...,28.0,Man,No,Straight / Heterosexual,White or of European descent,Yes,Appropriate in length,Easy
21,22,I am a developer by profession,Yes,Less than once per year,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,United States,No,Some college/university study without earning ...,,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,47.0,Man,No,Straight / Heterosexual,White or of European descent,Yes,Appropriate in length,Easy


In [56]:
df.groupby(['Country']).get_group('India').head(3)

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
7,8,I code primarily as a hobby,Yes,Less than once per year,"OSS is, on average, of HIGHER quality than pro...","Not employed, but looking for work",India,,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,A lot more welcome now than last year,Tech articles written by other developers;Indu...,24.0,Man,No,Straight / Heterosexual,,,Appropriate in length,Neither easy nor difficult
9,10,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,India,No,"Master’s degree (MA, MS, M.Eng., MBA, etc.)",,...,Somewhat less welcome now than last year,Tech articles written by other developers;Tech...,,,,,,Yes,Too long,Difficult
14,15,I am a student who is learning to code,Yes,Never,"OSS is, on average, of HIGHER quality than pro...","Not employed, but looking for work",India,"Yes, full-time","Secondary school (e.g. American high school, G...",,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,20.0,Man,No,,,Yes,Too long,Neither easy nor difficult


In [59]:
df.loc[df.Country == "India"]['SocialMedia'].value_counts()

WhatsApp                    2990
YouTube                     1820
LinkedIn                     955
Facebook                     841
Instagram                    822
Twitter                      542
Reddit                       473
I don't use social media     250
Snapchat                      23
WeChat 微信                      5
Hello                          5
VK ВКонта́кте                  4
Youku Tudou 优酷                 2
Weibo 新浪微博                     1
Name: SocialMedia, dtype: int64

In [65]:
df[df.Country == 'India'].nlargest(2,'ConvertedComp')['ConvertedComp']

21895    2000000.0
28080    2000000.0
Name: ConvertedComp, dtype: float64

In [68]:
df.groupby('Country').get_group('United States')['SocialMedia'].value_counts()

Reddit                      5700
Twitter                     3468
Facebook                    2844
YouTube                     2463
I don't use social media    1851
Instagram                   1652
LinkedIn                    1020
WhatsApp                     609
Snapchat                     326
WeChat 微信                     93
VK ВКонта́кте                  9
Weibo 新浪微博                     8
Hello                          2
Youku Tudou 优酷                 1
Name: SocialMedia, dtype: int64

In [69]:
df.groupby(['Country', 'Gender'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7efcf2a33940>

In [74]:
df.groupby(['Country', 'Gender']).get_group(('India', 'Man')).head()

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
7,8,I code primarily as a hobby,Yes,Less than once per year,"OSS is, on average, of HIGHER quality than pro...","Not employed, but looking for work",India,,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,A lot more welcome now than last year,Tech articles written by other developers;Indu...,24.0,Man,No,Straight / Heterosexual,,,Appropriate in length,Neither easy nor difficult
14,15,I am a student who is learning to code,Yes,Never,"OSS is, on average, of HIGHER quality than pro...","Not employed, but looking for work",India,"Yes, full-time","Secondary school (e.g. American high school, G...",,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,20.0,Man,No,,,Yes,Too long,Neither easy nor difficult
49,50,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of LOWER quality than prop...",Employed full-time,India,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Another engineering discipline (ex. civil, ele...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Tech...,23.0,Man,No,,South Asian,No,Too long,Easy
64,65,I am a developer by profession,Yes,Never,,Employed full-time,India,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Information systems, information technology, o...",...,A lot more welcome now than last year,,21.0,Man,No,,,Yes,Appropriate in length,Neither easy nor difficult
67,68,I am a developer by profession,Yes,Less than once a month but more than once per ...,"OSS is, on average, of LOWER quality than prop...",Employed full-time,India,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,...,Somewhat more welcome now than last year,Tech articles written by other developers;Indu...,29.0,Man,No,Straight / Heterosexual,South Asian,No,Appropriate in length,Easy


In [75]:
df.groupby('Country')['Gender'].value_counts().head(10)

Country      Gender                                                     
Afghanistan  Man                                                             26
             Woman                                                            2
             Non-binary, genderqueer, or gender non-conforming                1
             Woman;Man;Non-binary, genderqueer, or gender non-conforming      1
Albania      Man                                                             70
             Woman                                                           12
             Non-binary, genderqueer, or gender non-conforming                1
Algeria      Man                                                            107
             Woman                                                           12
             Man;Non-binary, genderqueer, or gender non-conforming            1
Name: Gender, dtype: int64

In [79]:
df.groupby('Country')['Gender'].value_counts(normalize=True) # here a series with multiple indexes is being returned

Country      Gender                                                     
Afghanistan  Man                                                            0.866667
             Woman                                                          0.066667
             Non-binary, genderqueer, or gender non-conforming              0.033333
             Woman;Man;Non-binary, genderqueer, or gender non-conforming    0.033333
Albania      Man                                                            0.843373
                                                                              ...   
Yemen        Woman;Man                                                      0.052632
Zambia       Man                                                            0.916667
             Woman                                                          0.083333
Zimbabwe     Man                                                            0.948718
             Woman                                                          0

In [80]:
s = df.groupby('Country')['Gender'].value_counts(normalize=True)
type(s)

pandas.core.series.Series

In [82]:
#here we can use .loc for series to get to a desired value

s.loc['India']

Gender
Man                                                            0.924846
Woman                                                          0.067568
Woman;Man                                                      0.003082
Non-binary, genderqueer, or gender non-conforming              0.002252
Woman;Man;Non-binary, genderqueer, or gender non-conforming    0.001304
Man;Non-binary, genderqueer, or gender non-conforming          0.000711
Woman;Non-binary, genderqueer, or gender non-conforming        0.000237
Name: Gender, dtype: float64

In [85]:
df.groupby('Country')['Gender'].value_counts(normalize = True).loc['India']

Gender
Man                                                            0.924846
Woman                                                          0.067568
Woman;Man                                                      0.003082
Non-binary, genderqueer, or gender non-conforming              0.002252
Woman;Man;Non-binary, genderqueer, or gender non-conforming    0.001304
Man;Non-binary, genderqueer, or gender non-conforming          0.000711
Woman;Non-binary, genderqueer, or gender non-conforming        0.000237
Name: Gender, dtype: float64

In [90]:
df.groupby('Country')['Gender'].value_counts(normalize=True)

Country      Gender                                                     
Afghanistan  Man                                                            0.866667
             Woman                                                          0.066667
             Non-binary, genderqueer, or gender non-conforming              0.033333
             Woman;Man;Non-binary, genderqueer, or gender non-conforming    0.033333
Albania      Man                                                            0.843373
                                                                              ...   
Yemen        Woman;Man                                                      0.052632
Zambia       Man                                                            0.916667
             Woman                                                          0.083333
Zimbabwe     Man                                                            0.948718
             Woman                                                          0

In [95]:
df.groupby('Country')['ConvertedComp'].apply(np.median).sort_values()

Country
Brunei Darussalam                         6096.0
Chad                                      6288.0
Saint Vincent and the Grenadines         16281.0
Timor-Leste                             229500.0
San Marino                              301788.0
                                          ...   
Venezuela, Bolivarian Republic of...         NaN
Viet Nam                                     NaN
Yemen                                        NaN
Zambia                                       NaN
Zimbabwe                                     NaN
Name: ConvertedComp, Length: 179, dtype: float64

In [96]:
df[['Country', 'Gender']].value_counts()

Country         Gender                                                     
United States   Man                                                            17700
India           Man                                                             7802
Germany         Man                                                             5289
United Kingdom  Man                                                             5060
Canada          Man                                                             2916
                                                                               ...  
Slovakia        Non-binary, genderqueer, or gender non-conforming                  1
Morocco         Woman;Man;Non-binary, genderqueer, or gender non-conforming        1
                Woman;Man                                                          1
Bulgaria        Man;Non-binary, genderqueer, or gender non-conforming              1
Portugal        Woman;Man                                                 

In [97]:
df.groupby('Country').mean()

Unnamed: 0_level_0,Respondent,CompTotal,ConvertedComp,WorkWeekHrs,CodeRevHrs,Age
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,47441.431818,5.418654e+07,101953.333333,55.352941,14.062500,27.913043
Albania,41695.546512,1.707424e+05,21833.700000,40.121212,5.781818,25.145833
Algeria,43010.126866,7.544398e+04,34924.047619,38.220930,6.413793,27.284404
Andorra,50079.857143,7.650000e+04,160931.000000,42.000000,2.500000,28.000000
Angola,50569.400000,2.020000e+05,7764.000000,45.000000,10.000000,26.666667
...,...,...,...,...,...,...
"Venezuela, Bolivarian Republic of...",43461.170455,1.174443e+05,14581.627907,40.444444,6.717949,28.301205
Viet Nam,43895.337662,2.613438e+07,17233.436782,44.602679,8.407767,26.739884
Yemen,45331.157895,1.375000e+05,16909.166667,41.250000,12.800000,28.117647
Zambia,42096.833333,2.774375e+04,10075.375000,48.125000,18.400000,28.000000


In [98]:
df.groupby('Country')['ConvertedComp'].median()

Country
Afghanistan                               6222.0
Albania                                  10818.0
Algeria                                   7878.0
Andorra                                 160931.0
Angola                                    7764.0
                                          ...   
Venezuela, Bolivarian Republic of...      6384.0
Viet Nam                                 11892.0
Yemen                                    11940.0
Zambia                                    5040.0
Zimbabwe                                 19200.0
Name: ConvertedComp, Length: 179, dtype: float64

In [100]:
df.groupby('Country')[['ConvertedComp', 'CompTotal']].median()

Unnamed: 0_level_0,ConvertedComp,CompTotal
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,6222.0,39000.0
Albania,10818.0,65000.0
Algeria,7878.0,55000.0
Andorra,160931.0,76500.0
Angola,7764.0,202000.0
...,...,...
"Venezuela, Bolivarian Republic of...",6384.0,1000.0
Viet Nam,11892.0,15000000.0
Yemen,11940.0,125000.0
Zambia,5040.0,5000.0


In [101]:
df.groupby('Country')[['ConvertedComp', 'CompTotal']].median().loc['Germany']

ConvertedComp    63016.0
CompTotal        49000.0
Name: Germany, dtype: float64

In [102]:
type(df.groupby('Country')[['ConvertedComp', 'CompTotal']].median().loc['Germany'])

pandas.core.series.Series

In [103]:
x = {'Name': 'Sumi', 'Age' : '34'}
srs = pd.Series(x)

In [104]:
srs

Name    Sumi
Age       34
dtype: object

In [108]:
tdf = pd.DataFrame(x, index=[np.random.randint(0, 1+len(x.values()))])
tdf

Unnamed: 0,Name,Age
2,Sumi,34


In [109]:
df.groupby('Country')['ConvertedComp'].agg(['median', 'mean'])

Unnamed: 0_level_0,median,mean
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,6222.0,101953.333333
Albania,10818.0,21833.700000
Algeria,7878.0,34924.047619
Andorra,160931.0,160931.000000
Angola,7764.0,7764.000000
...,...,...
"Venezuela, Bolivarian Republic of...",6384.0,14581.627907
Viet Nam,11892.0,17233.436782
Yemen,11940.0,16909.166667
Zambia,5040.0,10075.375000


In [110]:
df.groupby('Country')[['ConvertedComp', 'CompTotal']].agg(['median', 'mean'])

Unnamed: 0_level_0,ConvertedComp,ConvertedComp,CompTotal,CompTotal
Unnamed: 0_level_1,median,mean,median,mean
Country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Afghanistan,6222.0,101953.333333,39000.0,5.418654e+07
Albania,10818.0,21833.700000,65000.0,1.707424e+05
Algeria,7878.0,34924.047619,55000.0,7.544398e+04
Andorra,160931.0,160931.000000,76500.0,7.650000e+04
Angola,7764.0,7764.000000,202000.0,2.020000e+05
...,...,...,...,...
"Venezuela, Bolivarian Republic of...",6384.0,14581.627907,1000.0,1.174443e+05
Viet Nam,11892.0,17233.436782,15000000.0,2.613438e+07
Yemen,11940.0,16909.166667,125000.0,1.375000e+05
Zambia,5040.0,10075.375000,5000.0,2.774375e+04


In [111]:
df.groupby('Country')[['ConvertedComp', 'CompTotal']].agg('median')

Unnamed: 0_level_0,ConvertedComp,CompTotal
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,6222.0,39000.0
Albania,10818.0,65000.0
Algeria,7878.0,55000.0
Andorra,160931.0,76500.0
Angola,7764.0,202000.0
...,...,...
"Venezuela, Bolivarian Republic of...",6384.0,1000.0
Viet Nam,11892.0,15000000.0
Yemen,11940.0,125000.0
Zambia,5040.0,5000.0


In [117]:
df.groupby('Country')[['ConvertedComp', 'CompTotal']].agg(['mean', 'median']).loc['India'].loc['ConvertedComp'].loc['mean']

28057.664916229056

In [126]:
df.groupby('Country')['Respondent'].count().nlargest(3)

Country
United States    20949
India             9061
Germany           5866
Name: Respondent, dtype: int64

In [127]:
df.groupby('Country').count().nlargest(3, 'Respondent')

Unnamed: 0_level_0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Student,EdLevel,UndergradMajor,EduOther,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
United States,20949,20877,20949,20949,20543,20663,20755,20605,18753,20461,...,20543,15259,18864,20177,20018,18944,19238,20088,20627,20684
India,9061,9035,9061,9061,8772,8808,8705,8699,7967,7930,...,8580,8012,7307,8436,7961,6228,5449,7843,8703,8688
Germany,5866,5838,5866,5866,5756,5723,5768,5674,4465,5649,...,5629,4317,5370,5667,5608,5045,5252,5465,5766,5777


In [139]:
df.groupby(['Country', 'Gender'])[['ConvertedComp', 'CompTotal']].agg(['mean', 'median']).loc['Afghanistan', 'CompTotal'].loc['Man', 'median']

39000.0

In [142]:
df.loc[(df.Country == 'India') & (df.LanguageWorkedWith.str.contains('Python'))].head(3)

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
7,8,I code primarily as a hobby,Yes,Less than once per year,"OSS is, on average, of HIGHER quality than pro...","Not employed, but looking for work",India,,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,A lot more welcome now than last year,Tech articles written by other developers;Indu...,24.0,Man,No,Straight / Heterosexual,,,Appropriate in length,Neither easy nor difficult
9,10,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,India,No,"Master’s degree (MA, MS, M.Eng., MBA, etc.)",,...,Somewhat less welcome now than last year,Tech articles written by other developers;Tech...,,,,,,Yes,Too long,Difficult
49,50,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of LOWER quality than prop...",Employed full-time,India,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Another engineering discipline (ex. civil, ele...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Tech...,23.0,Man,No,,South Asian,No,Too long,Easy


In [144]:
df.loc[df.Country == 'India']['LanguageWorkedWith'].str.contains('Python').sum()

3105

In [157]:
df.groupby('Country')['LanguageWorkedWith'].apply(lambda x: x.str.contains('Python').sum())
#here the object is broken into a group of series, hence the need to use apply

Country
Afghanistan                              8
Albania                                 23
Algeria                                 40
Andorra                                  0
Angola                                   2
                                        ..
Venezuela, Bolivarian Republic of...    28
Viet Nam                                78
Yemen                                    3
Zambia                                   4
Zimbabwe                                14
Name: LanguageWorkedWith, Length: 179, dtype: int64

In [155]:
df.groupby('Country').get_group('India')['LanguageWorkedWith'].str.contains('Python').sum() 
#here getting the group returns a single series when asked for 1 col

3105

#### Q> Calculate the % of developers in a country that have worked with Python?

In [160]:
#MySoln::

df.groupby('Country')['LanguageWorkedWith'].apply(lambda x: x.str.contains('Python').value_counts(normalize = True))

Country           
Afghanistan  False    0.794872
             True     0.205128
Albania      False    0.722892
             True     0.277108
Algeria      False    0.682540
                        ...   
Yemen        True     0.176471
Zambia       False    0.666667
             True     0.333333
Zimbabwe     False    0.641026
             True     0.358974
Name: LanguageWorkedWith, Length: 335, dtype: float64

In [165]:
#CoreysSoln::

country_respondents = df['Country'].value_counts()
country_uses_python = df.groupby('Country')['LanguageWorkedWith'].apply(lambda x: x.str.contains('Python').sum())


In [167]:
pd.concat([country_respondents, country_uses_python],  axis = 'columns', sort = False)

Unnamed: 0,Country,LanguageWorkedWith
United States,20949,10083
India,9061,3105
Germany,5866,2451
United Kingdom,5737,2384
Canada,3395,1558
...,...,...
Chad,1,0
Timor-Leste,1,1
Tonga,1,0
Papua New Guinea,1,0


In [171]:
df_t = pd.concat([country_respondents, country_uses_python],  axis = 'columns', sort = False)
df_t.rename(columns = {'Country': 'num_respondents', 'LanguageWorkedWith': 'num_pythons'}, inplace=True)

In [172]:
df_t.head()

Unnamed: 0,num_respondents,num_pythons
United States,20949,10083
India,9061,3105
Germany,5866,2451
United Kingdom,5737,2384
Canada,3395,1558


In [173]:
df_t['percentage_ppl_python_country'] = (df_t.num_pythons/df_t.num_respondents) * 100

In [174]:
df_t.head()

Unnamed: 0,num_respondents,num_pythons,percentage_ppl_python_country
United States,20949,10083,48.131176
India,9061,3105,34.267741
Germany,5866,2451,41.783157
United Kingdom,5737,2384,41.55482
Canada,3395,1558,45.891016


In [175]:
df_t.sort_values(by='percentage_ppl_python_country', ascending=False)

Unnamed: 0,num_respondents,num_pythons,percentage_ppl_python_country
Dominica,1,1,100.000000
Timor-Leste,1,1,100.000000
Niger,1,1,100.000000
Sao Tome and Principe,1,1,100.000000
Turkmenistan,7,6,85.714286
...,...,...,...
Lao People's Democratic Republic,3,0,0.000000
Liberia,2,0,0.000000
Cape Verde,3,0,0.000000
Gabon,2,0,0.000000


In [184]:
df_t[df_t.num_respondents > 100].sort_values(by=['percentage_ppl_python_country', 'num_respondents'], ascending=[False, False])

Unnamed: 0,num_respondents,num_pythons,percentage_ppl_python_country
South Korea,160,80,50.000000
Chile,206,102,49.514563
Finland,546,266,48.717949
Kenya,249,120,48.192771
United States,20949,10083,48.131176
...,...,...,...
Ukraine,868,246,28.341014
Bulgaria,659,181,27.465857
Other Country (Not Listed Above),136,37,27.205882
Pakistan,923,251,27.193933


In [181]:
df_t[df_t.num_respondents > 100].sort_values(by=['percentage_ppl_python_country', 'num_respondents'], ascending=[False, False]).loc['Germany']

num_respondents                  5866.000000
num_pythons                      2451.000000
percentage_ppl_python_country      41.783157
Name: Germany, dtype: float64