<h2>Working on StackOverflow data survey</h2>

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('survey_results_public.csv')

In [3]:
skema_df = pd.read_csv('survey_results_schema.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64461 entries, 0 to 64460
Data columns (total 61 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Respondent                    64461 non-null  int64  
 1   MainBranch                    64162 non-null  object 
 2   Hobbyist                      64416 non-null  object 
 3   Age                           45446 non-null  float64
 4   Age1stCode                    57900 non-null  object 
 5   CompFreq                      40069 non-null  object 
 6   CompTotal                     34826 non-null  float64
 7   ConvertedComp                 34756 non-null  float64
 8   Country                       64072 non-null  object 
 9   CurrencyDesc                  45472 non-null  object 
 10  CurrencySymbol                45472 non-null  object 
 11  DatabaseDesireNextYear        44070 non-null  object 
 12  DatabaseWorkedWith            49537 non-null  object 
 13  D

Aggregation and Grouping

In [5]:
# find mean, mode and median of the salaries of the programmers
print(f'median of salaries = ', df['ConvertedComp'].median())
print(f'mode of salaries = \n', df['ConvertedComp'].mode())
print(f'mean of salaries = ', df['ConvertedComp'].mean())

median of salaries =  54049.0
mode of salaries = 
 0    120000.0
dtype: float64
mean of salaries =  103756.05014961446


In [6]:
# find mean of all the numerical columns
df.mean()

Respondent        3.255408e+04
Age               3.083411e+01
CompTotal        3.190464e+242
ConvertedComp     1.037561e+05
WorkWeekHrs       4.078217e+01
dtype: float64

In [7]:
# get a broad overview of the numerical data
df.describe()

Unnamed: 0,Respondent,Age,CompTotal,ConvertedComp,WorkWeekHrs
count,64461.0,45446.0,34826.0,34756.0,41151.0
mean,32554.079738,30.834111,3.190464e+242,103756.1,40.782174
std,18967.44236,9.585392,inf,226885.3,17.816383
min,1.0,1.0,0.0,0.0,1.0
25%,16116.0,24.0,20000.0,24648.0,40.0
50%,32231.0,29.0,63000.0,54049.0,40.0
75%,49142.0,35.0,125000.0,95000.0,44.0
max,65639.0,279.0,1.1111110000000001e+247,2000000.0,475.0


In [8]:
#count the number of values not NaNs
df['ConvertedComp'].count()

34756

In [9]:
# count the occurance of each value in column
df['Hobbyist'].value_counts()

Yes    50388
No     14028
Name: Hobbyist, dtype: int64

In [10]:
df['Age'].value_counts()

25.0    2693
28.0    2412
30.0    2406
26.0    2391
27.0    2338
        ... 
26.8       1
3.0        1
14.7       1
26.5       1
22.5       1
Name: Age, Length: 110, dtype: int64

In [11]:
df['Gender'].value_counts()

Man                                                            46013
Woman                                                           3844
Non-binary, genderqueer, or gender non-conforming                385
Man;Non-binary, genderqueer, or gender non-conforming            121
Woman;Non-binary, genderqueer, or gender non-conforming           92
Woman;Man                                                         76
Woman;Man;Non-binary, genderqueer, or gender non-conforming       26
Name: Gender, dtype: int64

In [12]:
# we can also get percentage values by setting normalize parameter
# we can multiply this whole expression by 100 to get percentage
df['Gender'].value_counts(normalize=True)

Man                                                            0.910121
Woman                                                          0.076033
Non-binary, genderqueer, or gender non-conforming              0.007615
Man;Non-binary, genderqueer, or gender non-conforming          0.002393
Woman;Non-binary, genderqueer, or gender non-conforming        0.001820
Woman;Man                                                      0.001503
Woman;Man;Non-binary, genderqueer, or gender non-conforming    0.000514
Name: Gender, dtype: float64

In [13]:
df['Country'].value_counts()

United States       12469
India                8403
United Kingdom       3896
Germany              3890
Canada               2191
                    ...  
Grenada                 1
Saint Lucia             1
Marshall Islands        1
Mali                    1
North Korea             1
Name: Country, Length: 183, dtype: int64

<h3> Grouping </h3>

In [14]:
# grouping the rows by country
# creates multiple dataframes, each dataframe containing data related to a specific country 
country_group = df.groupby('Country')

In [15]:
country_group.get_group('India')

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
6,7,I am a developer by profession,Yes,,18,Monthly,,,India,United States dollar,...,,,,"Computer science, computer engineering, or sof...",,,A lot more welcome now than last year,,6,4
21,22,I am a developer by profession,Yes,,18,Monthly,,,India,Indian rupee,...,Easy,Appropriate in length,,"Computer science, computer engineering, or sof...",,,Just as welcome now as I felt last year,50.0,10,2
57,58,I am a developer by profession,Yes,,22,,,,India,Indian rupee,...,Neither easy nor difficult,Too long,,Web development or web design,Angular;Angular.js;ASP.NET;ASP.NET Core;jQuery,Angular;Angular.js;ASP.NET;ASP.NET Core;jQuery,,,,
62,63,I am a student who is learning to code,Yes,21.0,17,,,,India,,...,Easy,Appropriate in length,No,,Angular.js;Django;jQuery;Laravel;Vue.js,,Not applicable - I did not use Stack Overflow ...,,4,
147,149,I am a developer by profession,Yes,36.0,31,Yearly,21000000.0,293196.0,India,Indian rupee,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,70.0,5,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64391,54757,,Yes,,18,,,,India,,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",jQuery,jQuery,A lot less welcome now than last year,,5,Less than 1 year
64398,55407,,Yes,,,,,,India,,...,,,,,,,,,,
64439,62464,,Yes,,,,,,India,,...,,,,I never declared a major,,,,,,
64442,62954,,Yes,,,,,,India,,...,,,,,,ASP.NET;ASP.NET Core;Django;jQuery;Symfony;Vue.js,,,,


In [16]:
len(country_group)

183

In [17]:
# filter equiavalent query of grouping
filt = df['Country'] == 'India'
df.loc[filt]

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
6,7,I am a developer by profession,Yes,,18,Monthly,,,India,United States dollar,...,,,,"Computer science, computer engineering, or sof...",,,A lot more welcome now than last year,,6,4
21,22,I am a developer by profession,Yes,,18,Monthly,,,India,Indian rupee,...,Easy,Appropriate in length,,"Computer science, computer engineering, or sof...",,,Just as welcome now as I felt last year,50.0,10,2
57,58,I am a developer by profession,Yes,,22,,,,India,Indian rupee,...,Neither easy nor difficult,Too long,,Web development or web design,Angular;Angular.js;ASP.NET;ASP.NET Core;jQuery,Angular;Angular.js;ASP.NET;ASP.NET Core;jQuery,,,,
62,63,I am a student who is learning to code,Yes,21.0,17,,,,India,,...,Easy,Appropriate in length,No,,Angular.js;Django;jQuery;Laravel;Vue.js,,Not applicable - I did not use Stack Overflow ...,,4,
147,149,I am a developer by profession,Yes,36.0,31,Yearly,21000000.0,293196.0,India,Indian rupee,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,70.0,5,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64391,54757,,Yes,,18,,,,India,,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",jQuery,jQuery,A lot less welcome now than last year,,5,Less than 1 year
64398,55407,,Yes,,,,,,India,,...,,,,,,,,,,
64439,62464,,Yes,,,,,,India,,...,,,,I never declared a major,,,,,,
64442,62954,,Yes,,,,,,India,,...,,,,,,ASP.NET;ASP.NET Core;Django;jQuery;Symfony;Vue.js,,,,


In [18]:
# aggregating data by country
# age value counts in India
country_group.get_group('India')['Age'].value_counts()

23.0    424
25.0    407
24.0    380
26.0    341
22.0    339
       ... 
11.0      1
51.0      1
73.0      1
7.0       1
3.0       1
Name: Age, Length: 61, dtype: int64

In [19]:
# average salary in india
country_group.get_group('India')['ConvertedComp'].median()

10056.0

In [20]:
# average salary in united states
country_group.get_group('United States')['ConvertedComp'].median()

115000.0

In [21]:
# we could find the average salary of all countries at once using this group object

country_group['ConvertedComp'].median().sort_values(ascending=False)

Country
United States      115000.0
Switzerland         98599.0
Israel              98064.0
Nomadic             93000.0
Andorra             88640.0
                     ...   
Saint Lucia             NaN
Sierra Leone            NaN
Solomon Islands         NaN
Tajikistan              NaN
Timor-Leste             NaN
Name: ConvertedComp, Length: 183, dtype: float64

In [22]:
# or the median age of all countries
country_group['Age'].median().sort_values()

Country
Maldives                              17.0
Belize                                19.0
Libyan Arab Jamahiriya                21.0
Nauru                                 21.0
Myanmar                               23.0
                                      ... 
Liechtenstein                          NaN
Mali                                   NaN
Marshall Islands                       NaN
Micronesia, Federated States of...     NaN
North Korea                            NaN
Name: Age, Length: 183, dtype: float64

In [23]:
# we can also get value counts
# here we have series with multiple indexes
country_group['YearsCode'].value_counts()

Country      YearsCode       
Afghanistan  4                   7
             5                   7
             3                   6
             2                   4
             6                   4
                                ..
Zimbabwe     18                  1
             2                   1
             4                   1
             9                   1
             Less than 1 year    1
Name: YearsCode, Length: 3507, dtype: int64

In [24]:
# we can grab the data for China only
country_group['YearsCode'].value_counts().loc['China'].sort_values()

YearsCode
46                   1
30                   1
29                   1
28                   1
22                   1
38                   1
24                   2
25                   4
18                   4
13                   5
Less than 1 year     6
17                   6
16                   6
15                   7
20                   9
1                    9
14                  11
12                  11
9                   13
8                   13
11                  14
6                   15
10                  17
2                   21
4                   22
3                   23
7                   26
5                   31
Name: YearsCode, dtype: int64

In [25]:
# We can apply many functions on the group object by using agg method
country_group['ConvertedComp'].agg(['median', 'mean', 'min', 'max'])

Unnamed: 0_level_0,median,mean,min,max
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,15163.5,148290.125000,0.0,1000000.0
Albania,15900.0,25611.000000,0.0,194580.0
Algeria,9432.0,10362.812500,300.0,36000.0
Andorra,88640.0,88640.000000,35672.0,141608.0
Angola,5292.0,5292.000000,4848.0,5736.0
...,...,...,...,...
"Venezuela, Bolivarian Republic of...",3600.0,6280.611111,96.0,24000.0
Viet Nam,10344.0,28342.605769,0.0,1000000.0
Yemen,36000.0,36000.000000,36000.0,36000.0
Zambia,5452.0,17506.400000,816.0,60000.0


In [26]:
# people that know python
python_filt = country_group.get_group('India')['LanguageWorkedWith'].str.contains('Python', na=False)
country_group.get_group('India').loc[python_filt]

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
21,22,I am a developer by profession,Yes,,18,Monthly,,,India,Indian rupee,...,Easy,Appropriate in length,,"Computer science, computer engineering, or sof...",,,Just as welcome now as I felt last year,50.0,10,2
62,63,I am a student who is learning to code,Yes,21.0,17,,,,India,,...,Easy,Appropriate in length,No,,Angular.js;Django;jQuery;Laravel;Vue.js,,Not applicable - I did not use Stack Overflow ...,,4,
225,227,I am a developer by profession,Yes,24.0,19,Yearly,840000.0,11728.0,India,Indian rupee,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",Gatsby,Flask;Spring,Somewhat more welcome now than last year,45.0,5,2
230,232,I am a student who is learning to code,No,20.0,12,,,,India,,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",Django;jQuery;Laravel,Flask,Just as welcome now as I felt last year,,7,
232,234,I am a developer by profession,Yes,22.0,16,,,,India,,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",Django;Flask,Angular;Angular.js;Django;Flask;jQuery,Just as welcome now as I felt last year,,4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64280,32165,,Yes,20.0,12,,,,India,,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core;Django,ASP.NET Core;Django;Drupal;jQuery,Just as welcome now as I felt last year,,8,
64303,37383,,Yes,,,,,,India,,...,,,,,Django,,,,,
64337,42965,,Yes,,,,,,India,,...,,,,,Django;Flask,,A lot more welcome now than last year,,,
64381,52184,,Yes,20.0,17,,,,India,,...,Neither easy nor difficult,Appropriate in length,No,,Angular;Angular.js;jQuery;Ruby on Rails;Vue.js,Django;Flask;React.js,Just as welcome now as I felt last year,,3,


In [27]:
# number of developers who knows python in each country
country_group['LanguageWorkedWith'].apply(lambda x: x.str.contains('Python').sum())

Country
Afghanistan                              11
Albania                                  13
Algeria                                  40
Andorra                                   3
Angola                                    1
                                       ... 
Venezuela, Bolivarian Republic of...     29
Viet Nam                                102
Yemen                                     1
Zambia                                    4
Zimbabwe                                 13
Name: LanguageWorkedWith, Length: 183, dtype: int64

In [28]:
country_group.apply(lambda x: len(x)).sort_values()

Country
Lesotho                                   1
Mali                                      1
Gabon                                     1
Micronesia, Federated States of...        1
Fiji                                      1
                                      ...  
Canada                                 2191
Germany                                3890
United Kingdom                         3896
India                                  8403
United States                         12469
Length: 183, dtype: int64

In [29]:
# percentage of developers who knows python
py_devs = country_group.get_group('United States')['LanguageWorkedWith'].str.contains('Python').sum()
total_devs = len(country_group.get_group('United States'))
print(f"Py devs = {py_devs}\nTotal devs = {total_devs}")

Py devs = 5964
Total devs = 12469


In [30]:
# country with highest py devs percentage
py_devs_by_country = country_group['LanguageWorkedWith'].apply(lambda x: x.str.contains('Python').sum())
total_devs_by_country = country_group.apply(lambda x: len(x))

In [31]:
py_devs_by_country

Country
Afghanistan                              11
Albania                                  13
Algeria                                  40
Andorra                                   3
Angola                                    1
                                       ... 
Venezuela, Bolivarian Republic of...     29
Viet Nam                                102
Yemen                                     1
Zambia                                    4
Zimbabwe                                 13
Name: LanguageWorkedWith, Length: 183, dtype: int64

In [32]:
total_devs_by_country

Country
Afghanistan                              84
Albania                                  54
Algeria                                  94
Andorra                                  13
Angola                                    9
                                       ... 
Venezuela, Bolivarian Republic of...     70
Viet Nam                                364
Yemen                                     7
Zambia                                   21
Zimbabwe                                 31
Length: 183, dtype: int64

<h4> Find Percentage of Python Developers in each country </h4>

In [33]:
# create two columns by applying two aggregate functions
# first function finds the total number of python developers by country
# second function finds the total number of developers by country
py_dev_stats = country_group['LanguageWorkedWith'].agg([lambda x: x.str.contains('Python').sum(), lambda x: len(x)])

In [34]:
# change the name of columns 
py_dev_stats.rename(columns={
    '<lambda_0>': 'py_devs',
    '<lambda_1>': 'total_devs'
}, inplace=True)

In [35]:
# create new column to calculate percentage of py devs
py_dev_stats['percentage'] = (py_dev_stats['py_devs'] / py_dev_stats['total_devs']) * 100

In [36]:
# sort by percentage
py_dev_stats.sort_values(['percentage'])

Unnamed: 0_level_0,py_devs,total_devs,percentage
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Lesotho,0,1,0.000000
Cape Verde,0,2,0.000000
Chad,0,1,0.000000
Djibouti,0,2,0.000000
Fiji,0,1,0.000000
...,...,...,...
Brunei Darussalam,2,3,66.666667
Montenegro,9,13,69.230769
"Micronesia, Federated States of...",1,1,100.000000
Saint Lucia,1,1,100.000000


In [37]:
country_group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f4b366e0340>

<h1>Cleaning data</h1>

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64461 entries, 0 to 64460
Data columns (total 61 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Respondent                    64461 non-null  int64  
 1   MainBranch                    64162 non-null  object 
 2   Hobbyist                      64416 non-null  object 
 3   Age                           45446 non-null  float64
 4   Age1stCode                    57900 non-null  object 
 5   CompFreq                      40069 non-null  object 
 6   CompTotal                     34826 non-null  float64
 7   ConvertedComp                 34756 non-null  float64
 8   Country                       64072 non-null  object 
 9   CurrencyDesc                  45472 non-null  object 
 10  CurrencySymbol                45472 non-null  object 
 11  DatabaseDesireNextYear        44070 non-null  object 
 12  DatabaseWorkedWith            49537 non-null  object 
 13  D

In [64]:
import numpy as np
# number of NaN values
len(df) - df['Age'].count()

19015

In [73]:
newdf = pd.DataFrame({
    'fname': ['Abdul', 'Bryan', 'Tony', 'Stephen', 'NA', None],
    'lname': ['Manan', 'Singer', 'Stark', None, 'Rogers', np.nan],
    'email': ['sammanan4@gmail.com', 'bryan@gmail.com', 'ts@stark.com', 'strange@magic.uk', np.nan, None],
    'age': [24, np.nan, 'NA', None, 32, None],
})

In [74]:
newdf

Unnamed: 0,fname,lname,email,age
0,Abdul,Manan,sammanan4@gmail.com,24.0
1,Bryan,Singer,bryan@gmail.com,
2,Tony,Stark,ts@stark.com,
3,Stephen,,strange@magic.uk,
4,,Rogers,,32.0
5,,,,


In [75]:
# drops all rows will any value equal to none or NaN
newdf.dropna()

Unnamed: 0,fname,lname,email,age
0,Abdul,Manan,sammanan4@gmail.com,24.0
2,Tony,Stark,ts@stark.com,


In [79]:
newdf.dropna(axis='index', how='all')

Unnamed: 0,fname,lname,email,age
0,Abdul,Manan,sammanan4@gmail.com,24.0
1,Bryan,Singer,bryan@gmail.com,
2,Tony,Stark,ts@stark.com,
3,Stephen,,strange@magic.uk,
4,,Rogers,,32.0


In [81]:
# check only lname and age columns for NA values
# if any of lname or age is NaN then drop
newdf.dropna(axis='rows', how='any', subset=["lname", "age"])

Unnamed: 0,fname,lname,email,age
0,Abdul,Manan,sammanan4@gmail.com,24.0
2,Tony,Stark,ts@stark.com,
4,,Rogers,,32.0


In [83]:
# if both lname and age are NaN then drop
newdf.dropna(axis='rows', how='all', subset=["lname", "age"])

Unnamed: 0,fname,lname,email,age
0,Abdul,Manan,sammanan4@gmail.com,24.0
1,Bryan,Singer,bryan@gmail.com,
2,Tony,Stark,ts@stark.com,
4,,Rogers,,32.0


In [82]:
# check only 0th and 4th row
newdf.dropna(axis='columns', how='any', subset=[0, 4])

Unnamed: 0,fname,lname,age
0,Abdul,Manan,24.0
1,Bryan,Singer,
2,Tony,Stark,
3,Stephen,,
4,,Rogers,32.0
5,,,


In [89]:
# we can replace NA or missing values by NaN beforehand
newdf.replace('NA', np.nan)

Unnamed: 0,fname,lname,email,age
0,Abdul,Manan,sammanan4@gmail.com,24.0
1,Bryan,Singer,bryan@gmail.com,
2,Tony,Stark,ts@stark.com,
3,Stephen,,strange@magic.uk,
4,,Rogers,,32.0
5,,,,


In [99]:
# check which values are NaN 
newdf.isna()

Unnamed: 0,fname,lname,email,age
0,False,False,False,False
1,False,False,False,True
2,False,False,False,False
3,False,True,False,True
4,False,False,True,False
5,True,True,True,True


In [100]:
# another method to replace NaN values
newdf.fillna('Missing')

Unnamed: 0,fname,lname,email,age
0,Abdul,Manan,sammanan4@gmail.com,24
1,Bryan,Singer,bryan@gmail.com,Missing
2,Tony,Stark,ts@stark.com,
3,Stephen,Missing,strange@magic.uk,Missing
4,,Rogers,Missing,32
5,Missing,Missing,Missing,Missing


In [101]:
newdf.dtypes

fname    object
lname    object
email    object
age      object
dtype: object

In [107]:
# newdf['age'].mean() # does not work because values are strings not ints
# newdf['age'].astype(int) # does not work either because cannot convert NaN to int
int(np.nan)

ValueError: cannot convert float NaN to integer

In [112]:
# since type of NaN is float, therefore we can cast the column as float and use math ops
newdf2 = newdf.replace('NA', np.nan)
newdf2['age'] = newdf2['age'].astype(float)
newdf2['age'].mean()

28.0

In [113]:
newdf2.dtypes

fname     object
lname     object
email     object
age      float64
dtype: object

In [114]:
df.columns

Index(['Respondent', 'MainBranch', 'Hobbyist', 'Age', 'Age1stCode', 'CompFreq',
       'CompTotal', 'ConvertedComp', 'Country', 'CurrencyDesc',
       'CurrencySymbol', 'DatabaseDesireNextYear', 'DatabaseWorkedWith',
       'DevType', 'EdLevel', 'Employment', 'Ethnicity', 'Gender', 'JobFactors',
       'JobSat', 'JobSeek', 'LanguageDesireNextYear', 'LanguageWorkedWith',
       'MiscTechDesireNextYear', 'MiscTechWorkedWith',
       'NEWCollabToolsDesireNextYear', 'NEWCollabToolsWorkedWith', 'NEWDevOps',
       'NEWDevOpsImpt', 'NEWEdImpt', 'NEWJobHunt', 'NEWJobHuntResearch',
       'NEWLearn', 'NEWOffTopic', 'NEWOnboardGood', 'NEWOtherComms',
       'NEWOvertime', 'NEWPurchaseResearch', 'NEWPurpleLink', 'NEWSOSites',
       'NEWStuck', 'OpSys', 'OrgSize', 'PlatformDesireNextYear',
       'PlatformWorkedWith', 'PurchaseWhat', 'Sexuality', 'SOAccount',
       'SOComm', 'SOPartFreq', 'SOVisitFreq', 'SurveyEase', 'SurveyLength',
       'Trans', 'UndergradMajor', 'WebframeDesireNextYear',
  

In [126]:
df['YearsCode'] = df['YearsCode'].replace('Less than 1 year', 0.5)
df['YearsCode'] = df['YearsCode'].replace('More than 50 years', 0.5)

In [131]:
df['YearsCode'].astype(float).sum()/len(df)

11.266750438249485