# Basic Operations on the Stack Overflow Survey Dataset

In [1]:
import pandas as pd

In [2]:
# Load the survey answers and the schema that describes each column.
# Forward slashes are used so the paths work on every OS; the previous
# 'data\s...' literals depended on '\s' being an unrecognised (invalid)
# escape sequence and only resolved on Windows.
df = pd.read_csv('data/survey_results_public.csv')
df_schema = pd.read_csv('data/survey_results_schema.csv')

In [3]:
print(df.shape)
print(df_schema.shape)

(64461, 61)
(61, 2)


In [4]:
df.head()

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
0,1,I am a developer by profession,Yes,,13,Monthly,,,Germany,European Euro,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core,ASP.NET;ASP.NET Core,Just as welcome now as I felt last year,50.0,36,27.0
1,2,I am a developer by profession,No,,19,,,,United Kingdom,Pound sterling,...,,,,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,,7,4.0
2,3,I code primarily as a hobby,Yes,,15,,,,Russian Federation,,...,Neither easy nor difficult,Appropriate in length,,,,,Somewhat more welcome now than last year,,4,
3,4,I am a developer by profession,Yes,25.0,18,,,,Albania,Albanian lek,...,,,No,"Computer science, computer engineering, or sof...",,,Somewhat less welcome now than last year,40.0,7,4.0
4,5,"I used to be a developer by profession, but no...",Yes,31.0,16,,,,United States,,...,Easy,Too short,No,"Computer science, computer engineering, or sof...",Django;Ruby on Rails,Ruby on Rails,Just as welcome now as I felt last year,,15,8.0


In [5]:
df_schema.head()

Unnamed: 0,Column,QuestionText
0,Respondent,Randomized respondent ID number (not in order ...
1,MainBranch,Which of the following options best describes ...
2,Hobbyist,Do you code as a hobby?
3,Age,What is your age (in years)? If you prefer not...
4,Age1stCode,At what age did you write your first line of c...


## Selecting Column(s) 

In [6]:
#Single Selection returns a Series
df['Age']

0         NaN
1         NaN
2         NaN
3        25.0
4        31.0
         ... 
64456     NaN
64457     NaN
64458     NaN
64459     NaN
64460     NaN
Name: Age, Length: 64461, dtype: float64

In [7]:
# Multi Selection returns new DF
df[['Age','Hobbyist']]


Unnamed: 0,Age,Hobbyist
0,,Yes
1,,No
2,,Yes
3,25.0,Yes
4,31.0,Yes
...,...,...
64456,,Yes
64457,,Yes
64458,,Yes
64459,,Yes


In [8]:
# df.columns is an Index of column labels; slicing it ([1:3]) yields the labels
# at positions 1 and 2, which are then used to select those columns
df[df.columns[1:3]]


Unnamed: 0,MainBranch,Hobbyist
0,I am a developer by profession,Yes
1,I am a developer by profession,No
2,I code primarily as a hobby,Yes
3,I am a developer by profession,Yes
4,"I used to be a developer by profession, but no...",Yes
...,...,...
64456,,Yes
64457,,Yes
64458,,Yes
64459,,Yes


## Selecting Row(s) using `range`

In [9]:
df[1:3]

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
1,2,I am a developer by profession,No,,19,,,,United Kingdom,Pound sterling,...,,,,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,,7,4.0
2,3,I code primarily as a hobby,Yes,,15,,,,Russian Federation,,...,Neither easy nor difficult,Appropriate in length,,,,,Somewhat more welcome now than last year,,4,


## Selecting Row And Column using `loc`

`When slicing with loc, the end label is *inclusive*`

In [10]:
#Selection of row 0
df.loc[0]

Respondent                                                  1
MainBranch                     I am a developer by profession
Hobbyist                                                  Yes
Age                                                       NaN
Age1stCode                                                 13
                                       ...                   
WebframeWorkedWith                       ASP.NET;ASP.NET Core
WelcomeChange         Just as welcome now as I felt last year
WorkWeekHrs                                                50
YearsCode                                                  36
YearsCodePro                                               27
Name: 0, Length: 61, dtype: object

In [11]:
#Selection of row 0,1,2 and with Age and COuntry Column
df.loc[[0,1,2],['Age','Country']]

Unnamed: 0,Age,Country
0,,Germany
1,,United Kingdom
2,,Russian Federation


In [12]:
# Select the rows labelled 0 through 10 (a loc slice includes its end label) with the Age and Country columns
# When passing a slice, write it bare (0:10) rather than wrapping it in []
df.loc[0:10,['Age','Country']]

Unnamed: 0,Age,Country
0,,Germany
1,,United Kingdom
2,,Russian Federation
3,25.0,Albania
4,31.0,United States
5,,Germany
6,,India
7,36.0,United States
8,30.0,Tunisia
9,22.0,United Kingdom


In [13]:
df.loc[ 0:10, 'Hobbyist' : 'Country' ]

Unnamed: 0,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country
0,Yes,,13,Monthly,,,Germany
1,No,,19,,,,United Kingdom
2,Yes,,15,,,,Russian Federation
3,Yes,25.0,18,,,,Albania
4,Yes,31.0,16,,,,United States
5,No,,14,,,,Germany
6,Yes,,18,Monthly,,,India
7,Yes,36.0,12,Yearly,116000.0,116000.0,United States
8,No,30.0,20,,,,Tunisia
9,Yes,22.0,14,Yearly,25000.0,32315.0,United Kingdom


In [14]:
df.loc[ 0:5, df.columns[2:6] ]

Unnamed: 0,Hobbyist,Age,Age1stCode,CompFreq
0,Yes,,13,Monthly
1,No,,19,
2,Yes,,15,
3,Yes,25.0,18,
4,Yes,31.0,16,
5,No,,14,


### Experiments

In [15]:
#Getting only hobbyist column
df['Hobbyist']

0        Yes
1         No
2        Yes
3        Yes
4        Yes
        ... 
64456    Yes
64457    Yes
64458    Yes
64459    Yes
64460    Yes
Name: Hobbyist, Length: 64461, dtype: object

In [16]:
df.loc[0:2,'Hobbyist':'Employment']

Unnamed: 0,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,CurrencySymbol,DatabaseDesireNextYear,DatabaseWorkedWith,DevType,EdLevel,Employment
0,Yes,,13,Monthly,,,Germany,European Euro,EUR,Microsoft SQL Server,Elasticsearch;Microsoft SQL Server;Oracle,"Developer, desktop or enterprise applications;...","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","Independent contractor, freelancer, or self-em..."
1,No,,19,,,,United Kingdom,Pound sterling,GBP,,,"Developer, full-stack;Developer, mobile","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time
2,Yes,,15,,,,Russian Federation,,,,,,,


In [17]:
df.columns

Index(['Respondent', 'MainBranch', 'Hobbyist', 'Age', 'Age1stCode', 'CompFreq',
       'CompTotal', 'ConvertedComp', 'Country', 'CurrencyDesc',
       'CurrencySymbol', 'DatabaseDesireNextYear', 'DatabaseWorkedWith',
       'DevType', 'EdLevel', 'Employment', 'Ethnicity', 'Gender', 'JobFactors',
       'JobSat', 'JobSeek', 'LanguageDesireNextYear', 'LanguageWorkedWith',
       'MiscTechDesireNextYear', 'MiscTechWorkedWith',
       'NEWCollabToolsDesireNextYear', 'NEWCollabToolsWorkedWith', 'NEWDevOps',
       'NEWDevOpsImpt', 'NEWEdImpt', 'NEWJobHunt', 'NEWJobHuntResearch',
       'NEWLearn', 'NEWOffTopic', 'NEWOnboardGood', 'NEWOtherComms',
       'NEWOvertime', 'NEWPurchaseResearch', 'NEWPurpleLink', 'NEWSOSites',
       'NEWStuck', 'OpSys', 'OrgSize', 'PlatformDesireNextYear',
       'PlatformWorkedWith', 'PurchaseWhat', 'Sexuality', 'SOAccount',
       'SOComm', 'SOPartFreq', 'SOVisitFreq', 'SurveyEase', 'SurveyLength',
       'Trans', 'UndergradMajor', 'WebframeDesireNextYear',
  

In [18]:
# Use the 'Respondent' column as the row index of df.

# Alternative: set it while loading, e.g.
#   df = pd.read_csv('data/survey_results_public.csv', index_col='Respondent')
# NOTE: inplace=True mutates df, so running this cell a second time fails
# because 'Respondent' is no longer a column.

df.set_index('Respondent' , inplace=True)

In [19]:
#Reset Index
#df.reset_index(inplace=True)

In [20]:
#For looking up of that each column meant
df_schema.set_index('Column',inplace=True)


In [21]:
# getting what a column meant

df_schema.loc['MainBranch','QuestionText']

'Which of the following options best describes you today? Here, by "developer" we mean "someone who writes code."'

In [22]:
#sorting the index alphabetically

df_schema.sort_index(ascending=True,inplace=True)

## Filtering DataFrames

In [23]:
# Boolean mask: True for every row whose Country is 'India'
indiaFilter :pd.core.series.Series =  (df['Country'] == 'India')
# Use df.loc[mask, columns] rather than df[mask] so the rows and the columns
# of interest are selected in one step
indiansDf :pd.core.frame.DataFrame = df.loc[indiaFilter,['Country','Age']]
indiansDf


Unnamed: 0_level_0,Country,Age
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1
7,India,
22,India,
58,India,
63,India,21.0
149,India,36.0
...,...,...
54757,India,
55407,India,
62464,India,
62954,India,


`combining multiple filters`

In [24]:
indianAndOldFilter = (df['Country'] == 'India') & (df['Age'] > 40)

In [25]:
df.loc[indianAndOldFilter,['Country','Age']]

Unnamed: 0_level_0,Country,Age
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1
1158,India,45.0
1816,India,43.0
4437,India,42.0
4553,India,43.0
5604,India,59.0
...,...,...
55221,India,48.0
58030,India,99.0
61003,India,44.0
61133,India,43.0


`use '~' before filter to negate the full filter!`

In [26]:
df_schema.loc['ConvertedComp','QuestionText']

'Salary converted to annual USD salaries using the exchange rate on 2020-02-19, assuming 12 working months and 50 working weeks.'

In [27]:
highSalaryFilter = (df['ConvertedComp'] > 50000)

In [28]:
highPaidDF = df.loc[highSalaryFilter,['ConvertedComp','Country','LanguageWorkedWith']].sort_values(by='ConvertedComp',ascending=False)
highPaidDF

Unnamed: 0_level_0,ConvertedComp,Country,LanguageWorkedWith
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
53716,2000000.0,United States,Bash/Shell/PowerShell;JavaScript;Python;Swift
818,2000000.0,United States,Bash/Shell/PowerShell;HTML/CSS;Java;Python;SQL...
53874,2000000.0,United States,JavaScript;PHP;Ruby
60832,2000000.0,United States,HTML/CSS;JavaScript
53854,2000000.0,United States,Bash/Shell/PowerShell;C#;HTML/CSS;Java;JavaScr...
...,...,...,...
14760,50028.0,Hungary,Bash/Shell/PowerShell;C;C++;HTML/CSS;Java;Java...
57240,50016.0,Russian Federation,Bash/Shell/PowerShell;C#;HTML/CSS;JavaScript;P...
45123,50016.0,Russian Federation,Bash/Shell/PowerShell;Python;SQL
48393,50016.0,Russian Federation,Python;SQL


In [29]:
#highest paid in these countries
countries = ['India','Germany']
highSalaryFilter = (df['ConvertedComp'] > 50000)  & (df['Country'].isin(countries))
    
highPaidDF = df.loc[highSalaryFilter,['ConvertedComp','Country','LanguageWorkedWith']].sort_values(by='ConvertedComp',ascending=False)
highPaidDF

Unnamed: 0_level_0,ConvertedComp,Country,LanguageWorkedWith
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
37655,1800000.0,India,C#;Dart;Java;Objective-C;Python;SQL;Swift
36976,1600000.0,India,
44359,1200000.0,Germany,
27358,1000000.0,Germany,
8457,1000000.0,India,Assembly;C;C++;HTML/CSS;Java;Python;SQL
...,...,...,...
7299,50262.0,India,Bash/Shell/PowerShell;C++;Go;Python;Ruby
27840,50262.0,India,JavaScript
62113,50262.0,India,Bash/Shell/PowerShell;Go;HTML/CSS;JavaScript;P...
13979,50157.0,Germany,Bash/Shell/PowerShell;Java;Kotlin;Objective-C;...


In [30]:
# LanguageWorkedWith holds a semicolon-separated list of languages per respondent.
# regex=False makes 'C++' a literal substring ('+' would otherwise be a regex
# quantifier); na=False treats respondents with no answer as non-matches.
# The predicate was previously named after Java although it matches C++.
devsWorkedWithCppPredicate = df['LanguageWorkedWith'].str.contains('C++', na=False, regex=False)
df.loc[devsWorkedWithCppPredicate, 'LanguageWorkedWith']


Respondent
23                            Bash/Shell/PowerShell;C#;C++
27             Bash/Shell/PowerShell;C;C++;Java;Python;SQL
29                                  C#;C++;HTML/CSS;Python
31                                                   C;C++
32                               C;C++;Java;Python;SQL;VBA
                               ...                        
63077              C++;HTML/CSS;Java;JavaScript;Python;SQL
63452    Bash/Shell/PowerShell;C#;C++;HTML/CSS;JavaScri...
63640    Bash/Shell/PowerShell;C;C#;C++;HTML/CSS;Java;J...
64330                                         C++;HTML/CSS
64867    Assembly;Bash/Shell/PowerShell;C;C#;C++;Dart;G...
Name: LanguageWorkedWith, Length: 13707, dtype: object

## Altering Rows / Columns

`Dropping Rows`

In [31]:
# Three-row toy frame used by the row-dropping examples
demoDF = pd.DataFrame(
    dict(
        full_name=['Abhishek Sengupta', 'Shredder Wayne', 'Queen Wayne'],
        occupation=['Student', 'Soldier', 'Student'],
    )
)

#demoDF.drop(index=1) #single items


In [32]:
# Dropping rows selected by a boolean filter
dropFilter= (demoDF['occupation'] == 'Student')
print(demoDF[dropFilter]) # rows that matched the predicate
print('\n')
print(demoDF[dropFilter].index) # index labels of the matching rows

# drop() returns a new frame without those labels; demoDF itself is not modified
demoDF.drop(index= demoDF[dropFilter].index)


           full_name occupation
0  Abhishek Sengupta    Student
2        Queen Wayne    Student


Int64Index([0, 2], dtype='int64')


Unnamed: 0,full_name,occupation
1,Shredder Wayne,Soldier


`Altering Column Headers`

In [33]:
# Upper-case every column label through the vectorised string accessor
df.columns = df.columns.str.upper()

In [34]:
# df.columns = df.columns.str.replace('_','__')

`renaming selective columns using **dictionary**`

In [35]:
df.rename(columns={'AGE' : 'AGE_IN_YEAR','HOBBYIST':'IS_HOBBYIST'},inplace=True)

`Update Row`

In [36]:
#single row 

df.loc[1,'AGE_IN_YEAR'] = 50

# OR

df.at[1,'AGE_IN_YEAR'] = 50

In [37]:
# multiple columns of a single row (the row labelled 1)

df.loc[1,['AGE_IN_YEAR','COUNTRY']] = [60,'AFGANISTAN']




In [38]:
indianFilter = (df['COUNTRY'] == 'India')

In [39]:
# Use the mask built in the previous cell from the renamed COUNTRY column;
# indiaFilter is an older mask created before the columns were upper-cased.
df.loc[indianFilter,'AGE_IN_YEAR']


Respondent
7         NaN
22        NaN
58        NaN
63       21.0
149      36.0
         ... 
54757     NaN
55407     NaN
62464     NaN
62954     NaN
64236     NaN
Name: AGE_IN_YEAR, Length: 8403, dtype: float64

In [40]:
# Increase AGE_IN_YEAR by one for Indian respondents only, using the mask
# computed from the current COUNTRY column (indianFilter) rather than the
# stale indiaFilter from an earlier cell.
# Note: this is cumulative - every re-run of the cell adds another year.
df.loc[indianFilter,'AGE_IN_YEAR'] += 1

`Lowercasing all WelcomeChange Fields`

In [41]:
df_schema.loc['WelcomeChange','QuestionText']

'Compared to last year, how welcome do you feel on Stack Overflow?'

In [42]:
df['WELCOMECHANGE'] = df['WELCOMECHANGE'].str.lower()

`Splitting a column to form new columns`

In [43]:
# Two-row toy frame used by the column-splitting examples
demoDF = pd.DataFrame(
    dict(
        full_name=['Abhishek Sengupta', 'Shredder Wayne'],
        occupation=['Student', 'Soldier'],
    )
)


In [44]:
demoDF['full_name'].str.split(' ') # returns a Series whose values are lists of the parts (result is discarded here)
demoDF['full_name'].str.split(' ',expand = True) # expand=True returns a DataFrame with one column per part

Unnamed: 0,0,1
0,Abhishek,Sengupta
1,Shredder,Wayne


In [45]:
#adding new column to existing demoDF
demoDF[['firstName','lastName']] = demoDF['full_name'].str.split(' ',expand = True)
demoDF

Unnamed: 0,full_name,occupation,firstName,lastName
0,Abhishek Sengupta,Student,Abhishek,Sengupta
1,Shredder Wayne,Soldier,Shredder,Wayne


## `Important Functions`


`Apply`

In [46]:
# Using apply() for updating specific row contents

def capFirstLetter(word : str) -> str :
    """Return ``word`` in title case: the first letter of each word upper-cased, the rest lower-cased."""
    title_cased = word.title()
    return title_cased

df_schema['QuestionText'].apply(capFirstLetter) 

Column
Age                   What Is Your Age (In Years)? If You Prefer Not...
Age1stCode            At What Age Did You Write Your First Line Of C...
CompFreq               Is That Compensation Weekly, Monthly, Or Yearly?
CompTotal             What Is Your Current Total Compensation (Salar...
ConvertedComp         Salary Converted To Annual Usd Salaries Using ...
                                            ...                        
WebframeWorkedWith    Which Web Frameworks Have You Done Extensive D...
WelcomeChange         Compared To Last Year, How Welcome Do You Feel...
WorkWeekHrs           On Average, How Many Hours Per Week Do You Wor...
YearsCode             Including Any Education, How Many Years Have Y...
YearsCodePro          Not Including Education, How Many Years Have Y...
Name: QuestionText, Length: 61, dtype: object

In [47]:
#Using Lambda

df_schema['QuestionText'].apply(lambda word: word.title()) 


Column
Age                   What Is Your Age (In Years)? If You Prefer Not...
Age1stCode            At What Age Did You Write Your First Line Of C...
CompFreq               Is That Compensation Weekly, Monthly, Or Yearly?
CompTotal             What Is Your Current Total Compensation (Salar...
ConvertedComp         Salary Converted To Annual Usd Salaries Using ...
                                            ...                        
WebframeWorkedWith    Which Web Frameworks Have You Done Extensive D...
WelcomeChange         Compared To Last Year, How Welcome Do You Feel...
WorkWeekHrs           On Average, How Many Hours Per Week Do You Wor...
YearsCode             Including Any Education, How Many Years Have Y...
YearsCodePro          Not Including Education, How Many Years Have Y...
Name: QuestionText, Length: 61, dtype: object

In [48]:
# Create a new column holding the character length of each question text.
# The vectorised .str.len() replaces apply(lambda ...: len(...)); it yields
# NaN instead of raising if a question text is missing.
df_schema['QuestionText Length'] = df_schema['QuestionText'].str.len()

##### Using apply on DF

In [49]:
df.apply(len,axis='rows')

MAINBRANCH                      64461
IS_HOBBYIST                     64461
AGE_IN_YEAR                     64461
AGE1STCODE                      64461
COMPFREQ                        64461
COMPTOTAL                       64461
CONVERTEDCOMP                   64461
COUNTRY                         64461
CURRENCYDESC                    64461
CURRENCYSYMBOL                  64461
DATABASEDESIRENEXTYEAR          64461
DATABASEWORKEDWITH              64461
DEVTYPE                         64461
EDLEVEL                         64461
EMPLOYMENT                      64461
ETHNICITY                       64461
GENDER                          64461
JOBFACTORS                      64461
JOBSAT                          64461
JOBSEEK                         64461
LANGUAGEDESIRENEXTYEAR          64461
LANGUAGEWORKEDWITH              64461
MISCTECHDESIRENEXTYEAR          64461
MISCTECHWORKEDWITH              64461
NEWCOLLABTOOLSDESIRENEXTYEAR    64461
NEWCOLLABTOOLSWORKEDWITH        64461
NEWDEVOPS   

In [50]:
df.apply(len,axis='rows')

MAINBRANCH                      64461
IS_HOBBYIST                     64461
AGE_IN_YEAR                     64461
AGE1STCODE                      64461
COMPFREQ                        64461
COMPTOTAL                       64461
CONVERTEDCOMP                   64461
COUNTRY                         64461
CURRENCYDESC                    64461
CURRENCYSYMBOL                  64461
DATABASEDESIRENEXTYEAR          64461
DATABASEWORKEDWITH              64461
DEVTYPE                         64461
EDLEVEL                         64461
EMPLOYMENT                      64461
ETHNICITY                       64461
GENDER                          64461
JOBFACTORS                      64461
JOBSAT                          64461
JOBSEEK                         64461
LANGUAGEDESIRENEXTYEAR          64461
LANGUAGEWORKEDWITH              64461
MISCTECHDESIRENEXTYEAR          64461
MISCTECHWORKEDWITH              64461
NEWCOLLABTOOLSDESIRENEXTYEAR    64461
NEWCOLLABTOOLSWORKEDWITH        64461
NEWDEVOPS   

In [51]:
df.apply(len,axis='columns')

Respondent
1        60
2        60
3        60
4        60
5        60
         ..
64858    60
64867    60
64898    60
64925    60
65112    60
Length: 64461, dtype: int64

`Map`

In [52]:
# applymap is a DataFrame method and is not available on a Series, so use map here

# With map, the dict must cover every value present: anything without a matching key becomes NaN
df['IS_HOBBYIST'].map({'Yes':True,'No' :False}) 

# Use replace to update only the listed values and leave everything else untouched.
# Neither result is assigned back, so df is unchanged; only this last expression is displayed.
df['IS_HOBBYIST'].replace({'Yes':True,'No' :False}) 



Respondent
1         True
2        False
3         True
4         True
5         True
         ...  
64858     True
64867     True
64898     True
64925     True
65112     True
Name: IS_HOBBYIST, Length: 64461, dtype: object

## Sorting

In [53]:
df.sort_values(by=['AGE1STCODE','AGE_IN_YEAR'],ascending=[True,False])


Unnamed: 0_level_0,MAINBRANCH,IS_HOBBYIST,AGE_IN_YEAR,AGE1STCODE,COMPFREQ,COMPTOTAL,CONVERTEDCOMP,COUNTRY,CURRENCYDESC,CURRENCYSYMBOL,...,SURVEYEASE,SURVEYLENGTH,TRANS,UNDERGRADMAJOR,WEBFRAMEDESIRENEXTYEAR,WEBFRAMEWORKEDWITH,WELCOMECHANGE,WORKWEEKHRS,YEARSCODE,YEARSCODEPRO
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
58030,I am a student who is learning to code,Yes,100.0,10,,,,India,,,...,Difficult,Too long,No,"Computer science, computer engineering, or sof...",,jQuery,a lot more welcome now than last year,,6,
5950,I am a developer by profession,Yes,99.0,10,Monthly,2500.0,30000.0,South Korea,United States dollar,USD,...,Easy,Appropriate in length,,"Computer science, computer engineering, or sof...",React.js;Ruby on Rails,React.js;Ruby on Rails,just as welcome now as i felt last year,45.0,15,8
30043,I am a student who is learning to code,Yes,69.0,10,,,,Poland,,,...,Easy,Appropriate in length,Yes,,Django;Flask,ASP.NET;Django;Express;Flask,just as welcome now as i felt last year,,2,
34677,I am a developer by profession,Yes,65.0,10,Yearly,120000.0,120000.0,United States,United States dollar,USD,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",,,a lot less welcome now than last year,60.0,More than 50 years,More than 50 years
61238,I am a developer by profession,No,60.0,10,Yearly,175000.0,175000.0,United States,United States dollar,USD,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",Laravel;Vue.js,jQuery;Laravel;Spring;Vue.js,just as welcome now as i felt last year,55.0,45,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64567,,Yes,,,,,,Netherlands,,,...,,,,,,ASP.NET Core,,,,
64867,,Yes,,,,,,Morocco,,,...,,,,,,,,,,
64898,,Yes,,,,,,Viet Nam,,,...,,,,,,,,,,
64925,,Yes,,,,,,Poland,,,...,,,,,Angular;Angular.js;React.js,,,,,


In [54]:
df.sort_values(by=['COUNTRY','CONVERTEDCOMP'],ascending=[True,False])[['COUNTRY','CONVERTEDCOMP']].head(100)

Unnamed: 0_level_0,COUNTRY,CONVERTEDCOMP
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1
1,AFGANISTAN,
62788,Afghanistan,1000000.0
65381,Afghanistan,1000000.0
65456,Afghanistan,1000000.0
38672,Afghanistan,231192.0
...,...,...
28900,Albania,15900.0
51520,Albania,15900.0
23498,Albania,13776.0
37526,Albania,12972.0


In [55]:
# nlargest salary
df['CONVERTEDCOMP'].nlargest(150)

Respondent
123      2000000.0
125      2000000.0
193      2000000.0
665      2000000.0
699      2000000.0
           ...    
33745    2000000.0
33787    2000000.0
33865    2000000.0
33886    2000000.0
34610    2000000.0
Name: CONVERTEDCOMP, Length: 150, dtype: float64

## Basic Analysis

In [56]:
df.columns

Index(['MAINBRANCH', 'IS_HOBBYIST', 'AGE_IN_YEAR', 'AGE1STCODE', 'COMPFREQ',
       'COMPTOTAL', 'CONVERTEDCOMP', 'COUNTRY', 'CURRENCYDESC',
       'CURRENCYSYMBOL', 'DATABASEDESIRENEXTYEAR', 'DATABASEWORKEDWITH',
       'DEVTYPE', 'EDLEVEL', 'EMPLOYMENT', 'ETHNICITY', 'GENDER', 'JOBFACTORS',
       'JOBSAT', 'JOBSEEK', 'LANGUAGEDESIRENEXTYEAR', 'LANGUAGEWORKEDWITH',
       'MISCTECHDESIRENEXTYEAR', 'MISCTECHWORKEDWITH',
       'NEWCOLLABTOOLSDESIRENEXTYEAR', 'NEWCOLLABTOOLSWORKEDWITH', 'NEWDEVOPS',
       'NEWDEVOPSIMPT', 'NEWEDIMPT', 'NEWJOBHUNT', 'NEWJOBHUNTRESEARCH',
       'NEWLEARN', 'NEWOFFTOPIC', 'NEWONBOARDGOOD', 'NEWOTHERCOMMS',
       'NEWOVERTIME', 'NEWPURCHASERESEARCH', 'NEWPURPLELINK', 'NEWSOSITES',
       'NEWSTUCK', 'OPSYS', 'ORGSIZE', 'PLATFORMDESIRENEXTYEAR',
       'PLATFORMWORKEDWITH', 'PURCHASEWHAT', 'SEXUALITY', 'SOACCOUNT',
       'SOCOMM', 'SOPARTFREQ', 'SOVISITFREQ', 'SURVEYEASE', 'SURVEYLENGTH',
       'TRANS', 'UNDERGRADMAJOR', 'WEBFRAMEDESIRENEXTYEAR',
     

In [57]:
df.rename(columns={'CONVERTEDCOMP' : 'salary'},inplace=True)

In [58]:
#Median Salary, Ignores NaN
df['salary'].median()

54049.0

In [59]:
# Median of every numeric column in df.
# numeric_only=True is passed explicitly so the text columns are skipped
# instead of raising a TypeError on pandas >= 2.0.
df.median(numeric_only=True)

AGE_IN_YEAR       29.0
COMPTOTAL      63000.0
salary         54049.0
WORKWEEKHRS       40.0
dtype: float64

In [60]:
df.describe()

Unnamed: 0,AGE_IN_YEAR,COMPTOTAL,salary,WORKWEEKHRS
count,45447.0,34826.0,34756.0,41151.0
mean,30.931855,3.190464e+242,103756.1,40.782174
std,9.541354,inf,226885.3,17.816383
min,1.0,0.0,0.0,1.0
25%,24.0,20000.0,24648.0,40.0
50%,29.0,63000.0,54049.0,40.0
75%,35.0,125000.0,95000.0,44.0
max,279.0,1.1111110000000001e+247,2000000.0,475.0


In [61]:
df['GENDER'].value_counts()

Man                                                            46013
Woman                                                           3844
Non-binary, genderqueer, or gender non-conforming                385
Man;Non-binary, genderqueer, or gender non-conforming            121
Woman;Non-binary, genderqueer, or gender non-conforming           92
Woman;Man                                                         76
Woman;Man;Non-binary, genderqueer, or gender non-conforming       26
Name: GENDER, dtype: int64

In [62]:
#in % form
df['GENDER'].value_counts(normalize=True) * 100

Man                                                            91.012125
Woman                                                           7.603299
Non-binary, genderqueer, or gender non-conforming               0.761517
Man;Non-binary, genderqueer, or gender non-conforming           0.239334
Woman;Non-binary, genderqueer, or gender non-conforming         0.181973
Woman;Man                                                       0.150325
Woman;Man;Non-binary, genderqueer, or gender non-conforming     0.051427
Name: GENDER, dtype: float64

In [63]:
df['COUNTRY'].value_counts(normalize=True) * 100

United States       19.460919
India               13.114933
United Kingdom       6.080659
Germany              6.069734
Canada               3.419590
                      ...    
Marshall Islands     0.001561
North Korea          0.001561
Saint Lucia          0.001561
Gabon                0.001561
Mali                 0.001561
Name: COUNTRY, Length: 184, dtype: float64

`GROUP BY`

In [64]:
byCountry : pd.core.groupby.generic.DataFrameGroupBy = df.groupby(['COUNTRY'])

In [65]:
byCountry['IS_HOBBYIST'].value_counts()

COUNTRY      IS_HOBBYIST
AFGANISTAN   Yes             1
Afghanistan  Yes            64
             No             20
Albania      Yes            44
             No             10
                            ..
Yemen        No              1
Zambia       Yes            14
             No              7
Zimbabwe     Yes            23
             No              8
Name: IS_HOBBYIST, Length: 334, dtype: int64

In [66]:
byCountry['IS_HOBBYIST'].value_counts().loc['India']

IS_HOBBYIST
Yes    6645
No     1758
Name: IS_HOBBYIST, dtype: int64

In [67]:
byCountry['IS_HOBBYIST'].value_counts().loc[['India','United States']]

COUNTRY        IS_HOBBYIST
India          Yes            6645
               No             1758
United States  Yes            9718
               No             2751
Name: IS_HOBBYIST, dtype: int64

In [68]:
byCountry['salary'].median().loc[['India','United States','Germany']]

COUNTRY
India             10056.0
United States    115000.0
Germany           62697.0
Name: salary, dtype: float64

In [69]:
byCountry['salary'].agg(['mean','median','std']).loc[['India','United States','Germany']]

Unnamed: 0_level_0,mean,median,std
COUNTRY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
India,28751.271364,10056.0,93017.527192
United States,208826.496443,115000.0,372924.964876
Germany,92508.647339,62697.0,144504.523953


#### `KNOWS PYTHON AND COUNTRY WISE PERCENTAGE`

In [70]:
country_uses_python = byCountry['LANGUAGEWORKEDWITH'].apply(lambda series: series.str.contains('Python').sum())

In [71]:
num_res = df['COUNTRY'].value_counts()

In [72]:
num_res


United States       12469
India                8403
United Kingdom       3896
Germany              3889
Canada               2191
                    ...  
Marshall Islands        1
North Korea             1
Saint Lucia             1
Gabon                   1
Mali                    1
Name: COUNTRY, Length: 184, dtype: int64

In [73]:
knowsPython = pd.concat([num_res,country_uses_python],axis='columns',sort=False)

In [74]:
knowsPython

Unnamed: 0,COUNTRY,LANGUAGEWORKEDWITH
United States,12469,5964
India,8403,2670
United Kingdom,3896,1621
Germany,3889,1712
Canada,2191,1011
...,...,...
Marshall Islands,1,0
North Korea,1,0
Saint Lucia,1,1
Gabon,1,1


In [75]:
# Share of each country's respondents whose LANGUAGEWORKEDWITH answer contains 'Python'.
# NOTE: despite the column name this is a fraction between 0 and 1 - it is not multiplied by 100.
knowsPython['percentage'] = knowsPython['LANGUAGEWORKEDWITH']/knowsPython['COUNTRY']

In [76]:
knowsPython.rename(columns={'COUNTRY' : 'total_respondents',
                           'LANGUAGEWORKEDWITH':'knows_python'
                           }
                  )

Unnamed: 0,total_respondents,knows_python,percentage
United States,12469,5964,0.478306
India,8403,2670,0.317744
United Kingdom,3896,1621,0.416068
Germany,3889,1712,0.440216
Canada,2191,1011,0.461433
...,...,...,...
Marshall Islands,1,0,0.000000
North Korea,1,0,0.000000
Saint Lucia,1,1,1.000000
Gabon,1,1,1.000000


`Shortcut`

In [77]:
countery_python_percentage = byCountry['LANGUAGEWORKEDWITH'].apply(lambda series: (series.str.contains('Python').sum()/len(series)) *100 ).sort_values(ascending=False)

In [78]:
countery_python_percentage.head(50)

COUNTRY
Saint Lucia                             100.000000
Gabon                                   100.000000
Micronesia, Federated States of...      100.000000
Montenegro                               69.230769
Brunei Darussalam                        66.666667
Guyana                                   60.000000
Swaziland                                57.142857
Iceland                                  52.830189
Finland                                  52.435530
Uganda                                   51.851852
Mauritania                               50.000000
Timor-Leste                              50.000000
Burkina Faso                             50.000000
Belize                                   50.000000
Jamaica                                  50.000000
Solomon Islands                          50.000000
Papua New Guinea                         50.000000
Cameroon                                 48.000000
United States                            47.830620
Israel                 

In [79]:
countery_python_percentage.loc[['India','China','Germany']]

COUNTRY
India      31.774366
China      42.587601
Germany    44.021599
Name: LANGUAGEWORKEDWITH, dtype: float64

In [81]:

# Goal: the mean number of years respondents have been coding. mean() needs a numeric
# dtype, but YEARSCODE is currently stored as object (strings), so check the dtype first
df['YEARSCODE'].dtype

dtype('O')

In [88]:
df['YEARSCODE'].unique()

array([36. ,  7. ,  4. , 15. ,  6. , 17. ,  8. , 10. , 35. ,  5. , 37. ,
       19. ,  9. , 22. , 30. , 23. , 20. ,  2. ,  0.5,  3. , 13. , 25. ,
       16. , 43. , 11. , 38. , 33. ,  nan, 24. , 21. , 12. , 40. , 27. ,
       50. , 46. , 14. , 18. , 28. , 32. , 44. , 26. , 42. , 31. , 34. ,
       29. ,  1. , 39. , 41. , 45. , 55. , 47. , 49. , 48. ])

In [86]:
# YEARSCODE mixes numeric strings with two text buckets; map the buckets to
# representative numbers and convert the column to float.
# float rather than int because the column contains NaN, which is a float.
# The result is assigned back instead of calling replace(..., inplace=True)
# on df['YEARSCODE']: that form operates on an intermediate Series and leaves
# df untouched when pandas copy-on-write is enabled.
df['YEARSCODE'] = (
    df['YEARSCODE']
    .replace({'Less than 1 year': 0.5, 'More than 50 years': 55})
    .astype(float)
)

In [87]:
df['YEARSCODE'].mean()

12.7255304763886