In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Directory holding the Stack Overflow 2024 survey CSV files.
# Kept in one constant so the notebook can be pointed at another location
# by editing a single line instead of every read_csv call.
DATA_DIR = Path(r"C:\Users\Ali\Documents\python\stack-overflow-developer-survey-2024")

# df: the survey responses; schema: the question catalogue (qid, qname, question, ...).
df = pd.read_csv(DATA_DIR / "survey_results_public.csv")
schema = pd.read_csv(DATA_DIR / "survey_results_schema.csv")

In [3]:
# Preview the first rows of the survey responses.
df.head()

Unnamed: 0,ResponseId,MainBranch,Age,Employment,RemoteWork,Check,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,...,JobSatPoints_6,JobSatPoints_7,JobSatPoints_8,JobSatPoints_9,JobSatPoints_10,JobSatPoints_11,SurveyLength,SurveyEase,ConvertedCompYearly,JobSat
0,1,I am a developer by profession,Under 18 years old,"Employed, full-time",Remote,Apples,Hobby,Primary/elementary school,Books / Physical media,,...,,,,,,,,,,
1,2,I am a developer by profession,35-44 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Other...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
2,3,I am a developer by profession,45-54 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Other...,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,...,,,,,,,Appropriate in length,Easy,,
3,4,I am learning to code,18-24 years old,"Student, full-time",,Apples,,Some college/university study without earning ...,"Other online resources (e.g., videos, blogs, f...",Stack Overflow;How-to videos;Interactive tutorial,...,,,,,,,Too long,Easy,,
4,5,I am a developer by profession,18-24 years old,"Student, full-time",,Apples,,"Secondary school (e.g. American high school, G...","Other online resources (e.g., videos, blogs, f...",Technical documentation;Blogs;Written Tutorial...,...,,,,,,,Too short,Easy,,


In [4]:
# Preview the first rows of the question schema.
schema.head()

Unnamed: 0,qid,qname,question,force_resp,type,selector
0,QID2,MainBranch,Which of the following options best describes ...,True,MC,SAVR
1,QID127,Age,What is your age?*,True,MC,SAVR
2,QID296,Employment,Which of the following best describes your cur...,True,MC,MAVR
3,QID308,RemoteWork,Which best describes your current work situation?,False,MC,SAVR
4,QID341,Check,Just checking to make sure you are paying atte...,True,MC,SAVR


In [5]:
# (rows, columns) of the responses table.
df.shape

(65437, 114)

In [6]:
# (rows, columns) of the schema table.
schema.shape

(87, 6)

In [7]:
# Column labels available in the responses table.
df.columns

Index(['ResponseId', 'MainBranch', 'Age', 'Employment', 'RemoteWork', 'Check',
       'CodingActivities', 'EdLevel', 'LearnCode', 'LearnCodeOnline',
       ...
       'JobSatPoints_6', 'JobSatPoints_7', 'JobSatPoints_8', 'JobSatPoints_9',
       'JobSatPoints_10', 'JobSatPoints_11', 'SurveyLength', 'SurveyEase',
       'ConvertedCompYearly', 'JobSat'],
      dtype='object', length=114)

In [8]:
# Column labels available in the schema table.
schema.columns

Index(['qid', 'qname', 'question', 'force_resp', 'type', 'selector'], dtype='object')

### 1-How many respondents completed the survey?

In [10]:
# Each respondent is identified by ResponseId, so count the distinct ids.
df['ResponseId'].nunique()

65437

### 2-How many respondents answered all questions?

In [12]:
# Response columns whose name matches a question name in the schema.
# NOTE(review): df has more columns (114) than schema has rows (87), so any
# response column that is not named exactly after a schema `qname` is not
# checked here - confirm this matches the intended meaning of "all questions".
# `dropna(subset=...)` is documented as taking a label or a sequence of labels,
# so an ordered list is passed instead of an unordered set.
questions = sorted(set(schema.qname.unique()) & set(df.columns))

# Rows that have no missing value in any of those columns.
df.dropna(subset=questions).shape[0]

6306

### 3-What are the values of the measures of central tendency for respondents' work experience (WorkExp)?

In [14]:
# Collect the three measures first, then print them in a uniform format.
work_exp = df['WorkExp']
central_tendency = {
    'Mean': work_exp.mean(),
    # mode() returns a Series (several values can tie); take the first one.
    'Mode': work_exp.mode()[0],
    'Median': work_exp.median(),
}
for label, value in central_tendency.items():
    print(f'{label}: {round(value, 1)}')

Mean: 11.5
Mode: 3.0
Median: 9.0


### 4-How many respondents work remotely?

In [16]:
# Keep only the rows answered exactly 'Remote' and count them.
is_remote = df['RemoteWork'] == 'Remote'
df.loc[is_remote].shape[0]

20831

### 5-What percentage of respondents program in Python?

In [18]:
# First option: element-wise Python check.
# str() turns a missing answer into 'nan', which never contains 'python'.
# NOTE(review): this is a substring match, so any language whose name contains
# "python" is counted as well - confirm that is intended.
df['worked_with_python'] = df['LanguageHaveWorkedWith'].map(
    lambda languages: int('python' in str(languages).lower())
)
python_users = df['worked_with_python'].sum()
all_respondents = df['ResponseId'].nunique()
round(python_users / all_respondents, 2)

0.47

In [19]:
# Second option: vectorised, case-insensitive match; missing answers count as False.
python_mask = df['LanguageHaveWorkedWith'].str.contains('python', case=False, na=False)
df['worked_with_python'] = python_mask
all_respondents = df['ResponseId'].nunique()
round(python_mask.sum() / all_respondents, 2)

0.47

### 6-How many respondents learned to program using online courses?

In [21]:
# First option: element-wise Python check.
# str() turns a missing answer into 'nan', which never contains 'online courses'.
df['learned_with_online_courses'] = df['LearnCode'].map(
    lambda sources: int('online courses' in str(sources).lower())
)
df['learned_with_online_courses'].sum()

30271

In [22]:
# Second option: vectorised, case-insensitive match; missing answers count as False.
online_course_mask = df['LearnCode'].str.contains('online courses', case=False, na=False)
df['learned_with_online_courses'] = online_course_mask
online_course_mask.sum()

30271

### 7-Among the respondents who program in Python, grouped by country, what is the average and median amount of compensation (ConvertedCompYearly) in each country?

In [24]:
# Python users that reported a yearly compensation. Rows without a value are
# dropped up front so every remaining country has numbers to aggregate.
python_with_comp = df.loc[df['worked_with_python'] == 1].dropna(subset='ConvertedCompYearly')

# Mean and median compensation per country, using named aggregation so the
# result already carries its final column names.
py_progers_by_country = (
    python_with_comp
    .groupby('Country', as_index=False)
    .agg(
        mean_comp=('ConvertedCompYearly', 'mean'),
        median_comp=('ConvertedCompYearly', 'median'),
    )
    .rename(columns={'Country': 'country'})
    # Casting to int drops the fractional part (no rounding).
    .astype({'mean_comp': 'int', 'median_comp': 'int'})
)
py_progers_by_country

Unnamed: 0,country,mean_comp,median_comp
0,Afghanistan,4543,4768
1,Albania,56295,56295
2,Algeria,9053,6230
3,Andorra,193331,193331
4,Angola,6,6
...,...,...,...
146,"Venezuela, Bolivarian Republic of...",21500,7100
147,Viet Nam,14014,10180
148,Yemen,10297,5333
149,Zambia,28123,22803


### 8-What are the education levels of the 5 respondents with the highest compensation?

In [26]:
# Order respondents by compensation, highest first (missing values sort last),
# and show the id, education level and compensation of the top five.
(
    df.loc[:, ['ResponseId', 'EdLevel', 'ConvertedCompYearly']]
      .sort_values('ConvertedCompYearly', ascending=False)
      .reset_index(drop=True)
      .iloc[:5]
)

Unnamed: 0,ResponseId,EdLevel,ConvertedCompYearly
0,15838,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",16256603.0
1,12724,"Professional degree (JD, MD, Ph.D, Ed.D, etc.)",13818022.0
2,28380,"Professional degree (JD, MD, Ph.D, Ed.D, etc.)",9000000.0
3,17594,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",6340564.0
4,17673,"Professional degree (JD, MD, Ph.D, Ed.D, etc.)",4936778.0


### bonus1-In each age category, what percentage of respondents program in Python?

In [28]:
# Per age bracket: number of respondents and how many of them reported Python.
py_progers_share_by_age = (
    df.groupby('Age', as_index=False)
      .agg(
          ResponseId=('ResponseId', 'count'),
          worked_with_python=('worked_with_python', 'sum'),
      )
)

# Share of Python users within each bracket, rounded to two decimals.
py_progers_share_by_age['work_with_python_share'] = (
    py_progers_share_by_age['worked_with_python'] / py_progers_share_by_age['ResponseId']
).round(2)
py_progers_share_by_age

Unnamed: 0,Age,ResponseId,worked_with_python,work_with_python_share
0,18-24 years old,14098,7884,0.56
1,25-34 years old,23911,10945,0.46
2,35-44 years old,14942,6204,0.42
3,45-54 years old,6249,2619,0.42
4,55-64 years old,2575,1041,0.4
5,65 years or older,772,290,0.38
6,Prefer not to say,322,146,0.45
7,Under 18 years old,2568,1666,0.65


### bonus2-Among respondents whose compensation is above the 75th percentile and who work remotely, which industries are the most common?

In [30]:
# Threshold: the 75th percentile of yearly compensation across all respondents.
comp_p75 = df['ConvertedCompYearly'].quantile(0.75)

# Respondents strictly above the threshold who also work fully remotely.
above_p75 = df['ConvertedCompYearly'] > comp_p75
is_remote = df['RemoteWork'] == 'Remote'

# Industry frequencies within that group, most common first.
df.loc[above_p75 & is_remote, 'Industry'].value_counts().reset_index()

Unnamed: 0,Industry,count
0,Software Development,768
1,Other:,239
2,Healthcare,156
3,Fintech,156
4,"Internet, Telecomm or Information Services",145
5,Retail and Consumer Services,106
6,Media & Advertising Services,103
7,Banking/Financial Services,69
8,Government,69
9,Computer Systems Design and Services,69
