# Basic Operations on the Stack Overflow Survey Dataset

In [1]:
import pandas as pd

In [48]:
# Load the survey responses and the schema that describes each column.
# Forward slashes work on every OS and avoid the invalid '\s' escape sequence
# that a backslash inside a normal (non-raw) string literal produces.
df = pd.read_csv('data/survey_results_public.csv')
df_schema = pd.read_csv('data/survey_results_schema.csv')

In [50]:
# Show (rows, columns) of the responses frame, then of the schema frame
print(df.shape, df_schema.shape, sep='\n')

(64461, 61)
(61, 2)


In [52]:
# Preview the first five survey responses
df.head(n=5)

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
0,1,I am a developer by profession,Yes,,13,Monthly,,,Germany,European Euro,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core,ASP.NET;ASP.NET Core,Just as welcome now as I felt last year,50.0,36,27.0
1,2,I am a developer by profession,No,,19,,,,United Kingdom,Pound sterling,...,,,,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,,7,4.0
2,3,I code primarily as a hobby,Yes,,15,,,,Russian Federation,,...,Neither easy nor difficult,Appropriate in length,,,,,Somewhat more welcome now than last year,,4,
3,4,I am a developer by profession,Yes,25.0,18,,,,Albania,Albanian lek,...,,,No,"Computer science, computer engineering, or sof...",,,Somewhat less welcome now than last year,40.0,7,4.0
4,5,"I used to be a developer by profession, but no...",Yes,31.0,16,,,,United States,,...,Easy,Too short,No,"Computer science, computer engineering, or sof...",Django;Ruby on Rails,Ruby on Rails,Just as welcome now as I felt last year,,15,8.0


In [53]:
# Preview the first five schema entries (column name and its question text)
df_schema.head(n=5)

Unnamed: 0,Column,QuestionText
0,Respondent,Randomized respondent ID number (not in order ...
1,MainBranch,Which of the following options best describes ...
2,Hobbyist,Do you code as a hobby?
3,Age,What is your age (in years)? If you prefer not...
4,Age1stCode,At what age did you write your first line of c...


## Selecting Column(s) 

In [54]:
# Selecting a single column label with [] returns a Series
df['Age']

0         NaN
1         NaN
2         NaN
3        25.0
4        31.0
         ... 
64456     NaN
64457     NaN
64458     NaN
64459     NaN
64460     NaN
Name: Age, Length: 64461, dtype: float64

In [55]:
# Selecting a list of column labels with [[...]] returns a new DataFrame
df[['Age','Hobbyist']]


Unnamed: 0,Age,Hobbyist
0,,Yes
1,,No
2,,Yes
3,25.0,Yes
4,31.0,Yes
...,...,...
64456,,Yes
64457,,Yes
64458,,Yes
64459,,Yes


In [56]:
# df.columns is an Index of column labels (not a plain list); slicing it by position
# ([1:3], end exclusive) yields 'MainBranch' and 'Hobbyist', which then select those columns
df[df.columns[1:3]]


Unnamed: 0,MainBranch,Hobbyist
0,I am a developer by profession,Yes
1,I am a developer by profession,No
2,I code primarily as a hobby,Yes
3,I am a developer by profession,Yes
4,"I used to be a developer by profession, but no...",Yes
...,...,...
64456,,Yes
64457,,Yes
64458,,Yes
64459,,Yes


## Selecting Row(s) using a slice

In [57]:
# A bare slice inside [] selects rows, not columns; the end (3) is excluded here,
# so only rows 1 and 2 come back
df[1:3]

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
1,2,I am a developer by profession,No,,19,,,,United Kingdom,Pound sterling,...,,,,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,,7,4.0
2,3,I code primarily as a hobby,Yes,,15,,,,Russian Federation,,...,Neither easy nor difficult,Appropriate in length,,,,,Somewhat more welcome now than last year,,4,


## Selecting Row And Column using `loc`

`When slicing with loc, the end label is *inclusive*`

In [58]:
# Select the row labelled 0; a single row comes back as a Series indexed by column name
df.loc[0]

Respondent                                                  1
MainBranch                     I am a developer by profession
Hobbyist                                                  Yes
Age                                                       NaN
Age1stCode                                                 13
                                       ...                   
WebframeWorkedWith                       ASP.NET;ASP.NET Core
WelcomeChange         Just as welcome now as I felt last year
WorkWeekHrs                                                50
YearsCode                                                  36
YearsCodePro                                               27
Name: 0, Length: 61, dtype: object

In [59]:
# Select rows 0, 1 and 2, restricted to the Age and Country columns
df.loc[[0,1,2],['Age','Country']]

Unnamed: 0,Age,Country
0,,Germany
1,,United Kingdom
2,,Russian Federation


In [60]:
# Select the row range 0 to 10, restricted to the Age and Country columns
# A slice is written bare (0:10); do not wrap it in a list like [0:10]
df.loc[0:10,['Age','Country']]

Unnamed: 0,Age,Country
0,,Germany
1,,United Kingdom
2,,Russian Federation
3,25.0,Albania
4,31.0,United States
5,,Germany
6,,India
7,36.0,United States
8,30.0,Tunisia
9,22.0,United Kingdom


In [61]:
# Rows and columns can both be label slices: every column from 'Hobbyist' through 'Country'
df.loc[ 0:10, 'Hobbyist' : 'Country' ]

Unnamed: 0,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country
0,Yes,,13,Monthly,,,Germany
1,No,,19,,,,United Kingdom
2,Yes,,15,,,,Russian Federation
3,Yes,25.0,18,,,,Albania
4,Yes,31.0,16,,,,United States
5,No,,14,,,,Germany
6,Yes,,18,Monthly,,,India
7,Yes,36.0,12,Yearly,116000.0,116000.0,United States
8,No,30.0,20,,,,Tunisia
9,Yes,22.0,14,Yearly,25000.0,32315.0,United Kingdom


In [62]:
# Column labels can also be picked by position from df.columns (positions 2 up to, not including, 6)
df.loc[ 0:5, df.columns[2:6] ]

Unnamed: 0,Hobbyist,Age,Age1stCode,CompFreq
0,Yes,,13,Monthly
1,No,,19,
2,Yes,,15,
3,Yes,25.0,18,
4,Yes,31.0,16,
5,No,,14,


### Experiments

In [67]:
# Getting only the Hobbyist column (returned as a Series)
df['Hobbyist']

0        Yes
1         No
2        Yes
3        Yes
4        Yes
        ... 
64456    Yes
64457    Yes
64458    Yes
64459    Yes
64460    Yes
Name: Hobbyist, Length: 64461, dtype: object

In [71]:
# Label slice across columns: 'Hobbyist' through 'Employment', both ends included
df.loc[0:2,'Hobbyist':'Employment']

Unnamed: 0,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,CurrencySymbol,DatabaseDesireNextYear,DatabaseWorkedWith,DevType,EdLevel,Employment
0,Yes,,13,Monthly,,,Germany,European Euro,EUR,Microsoft SQL Server,Elasticsearch;Microsoft SQL Server;Oracle,"Developer, desktop or enterprise applications;...","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","Independent contractor, freelancer, or self-em..."
1,No,,19,,,,United Kingdom,Pound sterling,GBP,,,"Developer, full-stack;Developer, mobile","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time
2,Yes,,15,,,,Russian Federation,,,,,,,


In [74]:
# List every column label in the responses frame
df.columns

Index(['Respondent', 'MainBranch', 'Hobbyist', 'Age', 'Age1stCode', 'CompFreq',
       'CompTotal', 'ConvertedComp', 'Country', 'CurrencyDesc',
       'CurrencySymbol', 'DatabaseDesireNextYear', 'DatabaseWorkedWith',
       'DevType', 'EdLevel', 'Employment', 'Ethnicity', 'Gender', 'JobFactors',
       'JobSat', 'JobSeek', 'LanguageDesireNextYear', 'LanguageWorkedWith',
       'MiscTechDesireNextYear', 'MiscTechWorkedWith',
       'NEWCollabToolsDesireNextYear', 'NEWCollabToolsWorkedWith', 'NEWDevOps',
       'NEWDevOpsImpt', 'NEWEdImpt', 'NEWJobHunt', 'NEWJobHuntResearch',
       'NEWLearn', 'NEWOffTopic', 'NEWOnboardGood', 'NEWOtherComms',
       'NEWOvertime', 'NEWPurchaseResearch', 'NEWPurpleLink', 'NEWSOSites',
       'NEWStuck', 'OpSys', 'OrgSize', 'PlatformDesireNextYear',
       'PlatformWorkedWith', 'PurchaseWhat', 'Sexuality', 'SOAccount',
       'SOComm', 'SOPartFreq', 'SOVisitFreq', 'SurveyEase', 'SurveyLength',
       'Trans', 'UndergradMajor', 'WebframeDesireNextYear',
  

In [75]:
# Use the respondent ID as the row index instead of the default integer index.
# The same result can be had at load time:
#   df = pd.read_csv('data/survey_results_public.csv', index_col='Respondent')
# Assigning the result (rather than inplace=True) keeps the step explicit and chainable.
# NOTE: re-running this cell raises KeyError because 'Respondent' is no longer a column.
df = df.set_index('Respondent')

In [None]:
# Reset Index: uncomment to undo set_index and turn the index back into a regular column
#df.reset_index(inplace=True)

In [81]:
# Index the schema by column name so a column's question text can be looked up by label.
# Assigning the result avoids inplace=True.
df_schema = df_schema.set_index('Column')


In [86]:
# Look up the survey question behind a column name

df_schema.loc['MainBranch','QuestionText']

'Which of the following options best describes you today? Here, by "developer" we mean "someone who writes code."'

In [88]:
# Sort the schema alphabetically by its index; assigning the result avoids inplace=True
df_schema = df_schema.sort_index(ascending=True)

## Filtering DataFrames

In [120]:
# Boolean mask: True for every respondent whose country is India
indiaFilter: pd.Series = df['Country'].eq('India')
# .loc takes the mask and a column list together, which is cleaner than df[indiaFilter][...]
indiansDf: pd.DataFrame = df.loc[indiaFilter, ['Country', 'Age']]
indiansDf


Unnamed: 0_level_0,Country,Age
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1
7,India,
22,India,
58,India,
63,India,21.0
149,India,36.0
...,...,...
54757,India,
55407,India,
62464,India,
62954,India,


`combining multiple filters`

In [121]:
# Combine element-wise conditions with &: respondents from India who are older than 40
indianAndOldFilter = df['Country'].eq('India') & df['Age'].gt(40)

In [124]:
# Apply the combined mask and keep only the Country and Age columns
df.loc[indianAndOldFilter,['Country','Age']]

Unnamed: 0_level_0,Country,Age
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1
1158,India,45.0
1816,India,43.0
4437,India,42.0
4553,India,43.0
5604,India,59.0
...,...,...
55221,India,48.0
58030,India,99.0
61003,India,44.0
61133,India,43.0


`use '~' before filter to negate the full filter!`

In [125]:
# Look up what the ConvertedComp column means
df_schema.loc['ConvertedComp','QuestionText']

'Salary converted to annual USD salaries using the exchange rate on 2020-02-19, assuming 12 working months and 50 working weeks.'

In [127]:
# Mask of respondents whose converted annual compensation exceeds 50,000
highSalaryFilter = df['ConvertedComp'].gt(50000)

In [137]:
# High earners with their country and languages, highest compensation first
highPaidDF = (
    df.loc[highSalaryFilter, ['ConvertedComp', 'Country', 'LanguageWorkedWith']]
    .sort_values(by='ConvertedComp', ascending=False)
)
highPaidDF

Unnamed: 0_level_0,ConvertedComp,Country,LanguageWorkedWith
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
53716,2000000.0,United States,Bash/Shell/PowerShell;JavaScript;Python;Swift
818,2000000.0,United States,Bash/Shell/PowerShell;HTML/CSS;Java;Python;SQL...
53874,2000000.0,United States,JavaScript;PHP;Ruby
60832,2000000.0,United States,HTML/CSS;JavaScript
53854,2000000.0,United States,Bash/Shell/PowerShell;C#;HTML/CSS;Java;JavaScr...
...,...,...,...
14760,50028.0,Hungary,Bash/Shell/PowerShell;C;C++;HTML/CSS;Java;Java...
57240,50016.0,Russian Federation,Bash/Shell/PowerShell;C#;HTML/CSS;JavaScript;P...
45123,50016.0,Russian Federation,Bash/Shell/PowerShell;Python;SQL
48393,50016.0,Russian Federation,Python;SQL


In [139]:
# Highest paid respondents restricted to the listed countries
countries = ['India','Germany']
# Each condition sits in its own balanced parentheses because & binds tighter than
# comparison operators; isin() keeps only rows whose country is in the list
highSalaryFilter = (df['ConvertedComp'] > 50000) & (df['Country'].isin(countries))

highPaidDF = df.loc[highSalaryFilter,['ConvertedComp','Country','LanguageWorkedWith']].sort_values(by='ConvertedComp',ascending=False)
highPaidDF

Unnamed: 0_level_0,ConvertedComp,Country,LanguageWorkedWith
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
37655,1800000.0,India,C#;Dart;Java;Objective-C;Python;SQL;Swift
36976,1600000.0,India,
44359,1200000.0,Germany,
27358,1000000.0,Germany,
8457,1000000.0,India,Assembly;C;C++;HTML/CSS;Java;Python;SQL
...,...,...,...
7299,50262.0,India,Bash/Shell/PowerShell;C++;Go;Python;Ruby
27840,50262.0,India,JavaScript
62113,50262.0,India,Bash/Shell/PowerShell;Go;HTML/CSS;JavaScript;P...
13979,50157.0,Germany,Bash/Shell/PowerShell;Java;Kotlin;Objective-C;...


In [149]:
# LanguageWorkedWith holds a semicolon-separated list of languages per respondent.
# regex=False makes 'C++' a literal substring ('+' would otherwise be a regex quantifier);
# na=False treats respondents with no answer as non-matches.
devsWorkedWithCppPredicate = df['LanguageWorkedWith'].str.contains('C++', na=False, regex=False)
# Alias kept so any code still using the earlier (misleading) name continues to work
devsWorkedWithJavaPredicate = devsWorkedWithCppPredicate
df.loc[devsWorkedWithCppPredicate, 'LanguageWorkedWith']


Respondent
23                            Bash/Shell/PowerShell;C#;C++
27             Bash/Shell/PowerShell;C;C++;Java;Python;SQL
29                                  C#;C++;HTML/CSS;Python
31                                                   C;C++
32                               C;C++;Java;Python;SQL;VBA
                               ...                        
63077              C++;HTML/CSS;Java;JavaScript;Python;SQL
63452    Bash/Shell/PowerShell;C#;C++;HTML/CSS;JavaScri...
63640    Bash/Shell/PowerShell;C;C#;C++;HTML/CSS;Java;J...
64330                                         C++;HTML/CSS
64867    Assembly;Bash/Shell/PowerShell;C;C#;C++;Dart;G...
Name: LanguageWorkedWith, Length: 13707, dtype: object

## Altering Rows / Columns

`Altering Column Headers`

In [150]:
# Upper-case every column name using a list comprehension
df.columns = [column.upper() for column in df.columns]

In [152]:
# Alternative: vectorised string methods on the column Index, e.g. doubling every underscore
# df.columns = df.columns.str.replace('_','__')

`renaming selective columns using **dictionary**`

In [155]:
# rename returns a new frame; assigning it back avoids inplace=True.
# NOTE: these upper-case keys exist only once the column names have been upper-cased.
df = df.rename(columns={'AGE': 'AGE_IN_YEAR', 'HOBBYIST': 'IS_HOBBYIST'})

`Update Row`

In [169]:
# Update a single cell (row label 1, one column)

print(df.loc[1,'AGE_IN_YEAR'])
df.loc[1,'AGE_IN_YEAR'] = 50
print(df.loc[1,'AGE_IN_YEAR'])

# OR: .at performs the same assignment for a single row/column label pair

df.at[1,'AGE_IN_YEAR'] = 50

60.0
50.0


In [170]:
# Update several columns of one row at once: values are assigned in column-list order

df.loc[1,['AGE_IN_YEAR','COUNTRY']] = [60,'AFGANISTAN']




In [171]:
# Build the India mask against the renamed (upper-case) COUNTRY column
indianFilter = df['COUNTRY'].eq('India')

In [188]:
# Ages of the Indian respondents, using the mask built from the renamed COUNTRY column
# (indianFilter) rather than the stale pre-rename indiaFilter
df.loc[indianFilter,'AGE_IN_YEAR']


Respondent
7         NaN
22        NaN
58        NaN
63       23.0
149      38.0
         ... 
54757     NaN
55407     NaN
62464     NaN
62954     NaN
64236     NaN
Name: AGE_IN_YEAR, Length: 8403, dtype: float64

In [187]:
# Increase AGE only for Indian respondents, using the mask built from the current
# COUNTRY column (indianFilter) rather than the stale pre-rename indiaFilter.
# NOTE: this cell is not idempotent; every run adds another year.
df.loc[indianFilter,'AGE_IN_YEAR'] += 1