# Analysis of JSON data

In [1]:
# Sample records stored column-wise: each key is a column name and each
# list holds that column's values in row order.
students = dict(
    firstName=["sabita", "bhawana", "puja", "sangita"],
    lastName=["rajbanshi", "singh", "oshin", "magar"],
    email=[
        "sabita@gmail.com",
        "bhawana@email.com",
        "puja@email.com",
        "sangita@yahoo.co.uk",
    ],
)

In [2]:
import pandas as pd

In [3]:
# Build a DataFrame from the dict: keys become column labels, lists become column values.
df = pd.DataFrame(data=students)

In [4]:
# Display the DataFrame built from the students dict.
df

Unnamed: 0,firstName,lastName,email
0,sabita,rajbanshi,sabita@gmail.com
1,bhawana,singh,bhawana@email.com
2,puja,oshin,puja@email.com
3,sangita,magar,sangita@yahoo.co.uk


In [5]:
# Element-wise comparison: yields a boolean Series that is True where lastName matches.
df['lastName'].eq('rajbanshi')

0     True
1    False
2    False
3    False
Name: lastName, dtype: bool

In [6]:
# Build a boolean mask for lastName == 'rajbanshi', then keep only the rows it marks True.
filt = df['lastName'].eq('rajbanshi')
df.loc[filt]

Unnamed: 0,firstName,lastName,email
0,sabita,rajbanshi,sabita@gmail.com


In [7]:
# .loc looks up rows and columns by label; here the row selector is the boolean mask `filt`.
# Get the email of the row(s) whose lastName is 'rajbanshi'.
df.loc[filt, 'email']

0    sabita@gmail.com
Name: email, dtype: object

In [8]:
# Same row filter written inline, without a separately named mask variable.
df.loc[df['lastName'].eq('rajbanshi')]

Unnamed: 0,firstName,lastName,email
0,sabita,rajbanshi,sabita@gmail.com


# Analysis of the Stack Overflow Developer Survey

In [9]:
# Load the survey responses (indexed by 'Respondent') and the schema frame
# (indexed by 'Column').
# Note: this rebinds `df`, replacing the students frame created earlier.
df = pd.read_csv(
    'data/survey_results_public.csv',
    index_col='Respondent',
)
schema_df = pd.read_csv(
    'data/survey_results_schema.csv',
    index_col='Column',
)

In [10]:
# Preview the first rows of the survey responses.
df.head()

Unnamed: 0_level_0,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,CurrencySymbol,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,I am a developer by profession,Yes,,13,Monthly,,,Germany,European Euro,EUR,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core,ASP.NET;ASP.NET Core,Just as welcome now as I felt last year,50.0,36,27.0
2,I am a developer by profession,No,,19,,,,United Kingdom,Pound sterling,GBP,...,,,,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,,7,4.0
3,I code primarily as a hobby,Yes,,15,,,,Russian Federation,,,...,Neither easy nor difficult,Appropriate in length,,,,,Somewhat more welcome now than last year,,4,
4,I am a developer by profession,Yes,25.0,18,,,,Albania,Albanian lek,ALL,...,,,No,"Computer science, computer engineering, or sof...",,,Somewhat less welcome now than last year,40.0,7,4.0
5,"I used to be a developer by profession, but no...",Yes,31.0,16,,,,United States,,,...,Easy,Too short,No,"Computer science, computer engineering, or sof...",Django;Ruby on Rails,Ruby on Rails,Just as welcome now as I felt last year,,15,8.0


In [11]:
# Preview the first rows of the schema frame (one row per survey column, with its question text).
schema_df.head()

Unnamed: 0_level_0,QuestionText
Column,Unnamed: 1_level_1
Respondent,Randomized respondent ID number (not in order ...
MainBranch,Which of the following options best describes ...
Hobbyist,Do you code as a hobby?
Age,What is your age (in years)? If you prefer not...
Age1stCode,At what age did you write your first line of c...


In [12]:
# List the column labels of the survey responses.
df.columns

Index(['MainBranch', 'Hobbyist', 'Age', 'Age1stCode', 'CompFreq', 'CompTotal',
       'ConvertedComp', 'Country', 'CurrencyDesc', 'CurrencySymbol',
       'DatabaseDesireNextYear', 'DatabaseWorkedWith', 'DevType', 'EdLevel',
       'Employment', 'Ethnicity', 'Gender', 'JobFactors', 'JobSat', 'JobSeek',
       'LanguageDesireNextYear', 'LanguageWorkedWith',
       'MiscTechDesireNextYear', 'MiscTechWorkedWith',
       'NEWCollabToolsDesireNextYear', 'NEWCollabToolsWorkedWith', 'NEWDevOps',
       'NEWDevOpsImpt', 'NEWEdImpt', 'NEWJobHunt', 'NEWJobHuntResearch',
       'NEWLearn', 'NEWOffTopic', 'NEWOnboardGood', 'NEWOtherComms',
       'NEWOvertime', 'NEWPurchaseResearch', 'NEWPurpleLink', 'NEWSOSites',
       'NEWStuck', 'OpSys', 'OrgSize', 'PlatformDesireNextYear',
       'PlatformWorkedWith', 'PurchaseWhat', 'Sexuality', 'SOAccount',
       'SOComm', 'SOPartFreq', 'SOVisitFreq', 'SurveyEase', 'SurveyLength',
       'Trans', 'UndergradMajor', 'WebframeDesireNextYear',
       'WebframeWorkedWith', 'WelcomeChange', 'WorkWeekHrs', 'YearsCode',
       'YearsCodePro'],
      dtype='object')

In [13]:
# List the column labels of the schema frame.
schema_df.columns

Index(['QuestionText'], dtype='object')

In [14]:
# Boolean mask: True for respondents whose ConvertedComp is greater than 70000.
# Missing (NaN) values compare as False.
high_salary = df['ConvertedComp'].gt(70000)

In [15]:
# Apply the high_salary mask to the DataFrame, keeping only the rows where it is True.
df.loc[high_salary]

Unnamed: 0_level_0,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,CurrencySymbol,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,I am a developer by profession,Yes,36.0,12,Yearly,116000.0,116000.0,United States,United States dollar,USD,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",Django;React.js;Vue.js,Flask,Just as welcome now as I felt last year,39.0,17,13
16,I am a developer by profession,Yes,45.0,8,Monthly,7000.0,108576.0,United Kingdom,Pound sterling,GBP,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",,jQuery;React.js,Just as welcome now as I felt last year,50.0,37,23
17,I am a developer by profession,Yes,25.0,14,Yearly,79000.0,79000.0,United States,United States dollar,USD,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core;Gatsby;React.js;Vue.js,ASP.NET;Gatsby;jQuery;React.js;Vue.js,Just as welcome now as I felt last year,40.0,7,3
18,I am a developer by profession,Yes,32.0,12,Monthly,105000.0,1260000.0,United States,United States dollar,USD,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",,,Just as welcome now as I felt last year,45.0,19,12
19,I am a developer by profession,No,24.0,15,Yearly,83400.0,83400.0,United States,United States dollar,USD,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",Django;Express;React.js,Angular;Angular.js;ASP.NET Core,Just as welcome now as I felt last year,35.0,9,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65586,I am a developer by profession,Yes,,8,Yearly,225000.0,225000.0,United States,United States dollar,USD,...,,,,Mathematics or statistics,,,,50.0,25,15
65589,I am a developer by profession,Yes,,16,Yearly,150000.0,150000.0,United States,United States dollar,USD,...,,,,"Computer science, computer engineering, or sof...",,,,45.0,10,6
65602,"I am not primarily a developer, but I write co...",Yes,,13,Yearly,140000.0,140000.0,United States,United States dollar,USD,...,,,,"Another engineering discipline (such as civil,...",,,,,15,12
65604,I am a developer by profession,No,,13,Weekly,3000.0,150000.0,United States,United States dollar,USD,...,,,,"Computer science, computer engineering, or sof...",,,,45.0,7,4


In [16]:
# Apply the high_salary mask and restrict the result to the three columns listed.
df.loc[
    high_salary,
    [
        'Country',
        'LanguageWorkedWith',
        'ConvertedComp',
    ],
]

Unnamed: 0_level_0,Country,LanguageWorkedWith,ConvertedComp
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,United States,Python;SQL,116000.0
16,United Kingdom,Bash/Shell/PowerShell;HTML/CSS;Java;JavaScript...,108576.0
17,United States,C#;HTML/CSS;JavaScript;Python;SQL;VBA,79000.0
18,United States,Bash/Shell/PowerShell;HTML/CSS;Perl,1260000.0
19,United States,Bash/Shell/PowerShell;C#;HTML/CSS;JavaScript;S...,83400.0
...,...,...,...
65586,United States,,225000.0
65589,United States,,150000.0
65602,United States,,140000.0
65604,United States,,150000.0


In [17]:
# Boolean mask: True where 'Country' is one of the listed countries (.isin membership test).
countries = [
    'United States',
    'India',
    'United Kingdom',
    'Germany',
    'Canada',
]
filt_countries = df['Country'].isin(countries)

In [18]:
# Apply the country mask to the DataFrame and show only the 'Country' column.
df.loc[filt_countries, 'Country']

Respondent
1               Germany
2        United Kingdom
5         United States
6               Germany
7                 India
              ...      
62834            Canada
62954             India
63077     United States
64236             India
64858     United States
Name: Country, Length: 30849, dtype: object

In [19]:
# Inspect the raw 'LanguageWorkedWith' column before filtering on it.
df['LanguageWorkedWith']

Respondent
1                                   C#;HTML/CSS;JavaScript
2                                         JavaScript;Swift
3                                 Objective-C;Python;Swift
4                                                      NaN
5                                        HTML/CSS;Ruby;SQL
                               ...                        
64858                                                  NaN
64867    Assembly;Bash/Shell/PowerShell;C;C#;C++;Dart;G...
64898                                                  NaN
64925                                             HTML/CSS
65112                      C#;HTML/CSS;Java;JavaScript;SQL
Name: LanguageWorkedWith, Length: 64461, dtype: object

In [20]:
# Boolean mask: True where the "LanguageWorkedWith" string contains 'Python',
# computed with the vectorised .str accessor.
# regex=False matches the text literally instead of as a regular expression, so the
# same call stays correct for names containing regex metacharacters (e.g. 'C++').
# na=False makes missing answers count as "no match" instead of propagating NaN.
filt_lang = df['LanguageWorkedWith'].str.contains('Python', na=False, regex=False)

In [21]:
# Inspect the boolean mask (one True/False value per respondent).
filt_lang

Respondent
1        False
2        False
3         True
4        False
5        False
         ...  
64858    False
64867     True
64898    False
64925    False
65112    False
Name: LanguageWorkedWith, Length: 64461, dtype: bool

In [22]:
# With a boolean mask, .loc keeps only the rows where the mask is True.
df.loc[filt_lang]

Unnamed: 0_level_0,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,CurrencySymbol,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,I code primarily as a hobby,Yes,,15,,,,Russian Federation,,,...,Neither easy nor difficult,Appropriate in length,,,,,Somewhat more welcome now than last year,,4,
8,I am a developer by profession,Yes,36.0,12,Yearly,116000.0,116000.0,United States,United States dollar,USD,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",Django;React.js;Vue.js,Flask,Just as welcome now as I felt last year,39.0,17,13
10,I am a developer by profession,Yes,22.0,14,Yearly,25000.0,32315.0,United Kingdom,Pound sterling,GBP,...,Easy,Appropriate in length,No,Mathematics or statistics,Flask;jQuery,Flask;jQuery,Somewhat more welcome now than last year,36.0,8,4
13,"I am not primarily a developer, but I write co...",Yes,53.0,14,Monthly,3000.0,38916.0,Netherlands,European Euro,EUR,...,Neither easy nor difficult,Too long,No,,,,A lot less welcome now than last year,36.0,35,20
15,I am a student who is learning to code,Yes,,13,,,,France,,,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",,,Just as welcome now as I felt last year,,4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61561,,Yes,64.0,18,,,,France,,,...,Neither easy nor difficult,Appropriate in length,No,"Information systems, information technology, o...",Angular.js;jQuery;React.js;Vue.js,jQuery,A lot more welcome now than last year,,30,
62391,,Yes,,Younger than 5 years,,,,Morocco,,,...,Neither easy nor difficult,Too short,,,Angular.js;Express;React.js;Ruby on Rails,Angular.js;Express;React.js;Ruby on Rails,,,Less than 1 year,Less than 1 year
63077,,Yes,,20,,,,United States,,,...,,,,"Computer science, computer engineering, or sof...",,,,,4,
63640,,Yes,,8,,,,Australia,,,...,,,No,,Angular;Angular.js;Express;jQuery;React.js;Vue.js,Express;jQuery,Somewhat more welcome now than last year,,6,2


In [23]:
# Show only the 'LanguageWorkedWith' column for the rows selected by the mask.
df.loc[filt_lang, 'LanguageWorkedWith']

Respondent
3                                 Objective-C;Python;Swift
8                                               Python;SQL
10                     HTML/CSS;Java;JavaScript;Python;SQL
13                                     C;JavaScript;Python
15        Bash/Shell/PowerShell;C;HTML/CSS;Java;Python;SQL
                               ...                        
61561    Bash/Shell/PowerShell;HTML/CSS;JavaScript;Perl...
62391       C++;HTML/CSS;JavaScript;Python;Ruby;TypeScript
63077              C++;HTML/CSS;Java;JavaScript;Python;SQL
63640    Bash/Shell/PowerShell;C;C#;C++;HTML/CSS;Java;J...
64867    Assembly;Bash/Shell/PowerShell;C;C#;C++;Dart;G...
Name: LanguageWorkedWith, Length: 25287, dtype: object