# Occupation

### Introduction:

Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
import pandas as pd

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user', sep='|', index_col='user_id')

### Step 3. Assign it to a variable called users.

In [3]:
users = df
users

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
...,...,...,...,...
939,26,F,student,33319
940,32,M,administrator,02215
941,20,M,student,97229
942,48,F,librarian,78209


### Step 4. Discover what is the mean age per occupation

In [4]:
users.groupby('occupation').age.mean()

occupation
administrator    38.746835
artist           31.392857
doctor           43.571429
educator         42.010526
engineer         36.388060
entertainment    29.222222
executive        38.718750
healthcare       41.562500
homemaker        32.571429
lawyer           36.750000
librarian        40.000000
marketing        37.615385
none             26.555556
other            34.523810
programmer       33.121212
retired          63.071429
salesman         35.666667
scientist        35.548387
student          22.081633
technician       33.148148
writer           36.311111
Name: age, dtype: float64

### Step 5. Discover the Male ratio per occupation and sort it from the most to the least

In [5]:
# Mark the males in each row
# M -> 1
# F -> 0
def gender_to_numeric(x):
    if x == 'M':
        return 1
    if x == 'F':
        return 0
users['male_count'] = users['gender'].apply(gender_to_numeric)
users

Unnamed: 0_level_0,age,gender,occupation,zip_code,male_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,24,M,technician,85711,1
2,53,F,other,94043,0
3,23,M,writer,32067,1
4,24,M,technician,43537,1
5,33,F,other,15213,0
...,...,...,...,...,...
939,26,F,student,33319,0
940,32,M,administrator,02215,1
941,20,M,student,97229,1
942,48,F,librarian,78209,0


In [6]:
users_by_gender = users.groupby(['occupation', 'gender']).size().unstack()
users_by_gender['M']

occupation
administrator     43.0
artist            15.0
doctor             7.0
educator          69.0
engineer          65.0
entertainment     16.0
executive         29.0
healthcare         5.0
homemaker          1.0
lawyer            10.0
librarian         22.0
marketing         16.0
none               5.0
other             69.0
programmer        60.0
retired           13.0
salesman           9.0
scientist         28.0
student          136.0
technician        26.0
writer            26.0
Name: M, dtype: float64

In [8]:
# count the number of people (both F and M) in each occupation
users.occupation.value_counts()

occupation
student          196
other            105
educator          95
administrator     79
engineer          67
programmer        66
librarian         51
writer            45
executive         32
scientist         31
artist            28
technician        27
marketing         26
entertainment     18
healthcare        16
retired           14
lawyer            12
salesman          12
none               9
homemaker          7
doctor             7
Name: count, dtype: int64

In [13]:
# count how many times each occupation appears
occupation_counts = users.occupation.value_counts()

In [None]:
# sum up all the males in each occupation
male_ratio = users.groupby('occupation')['male_count'].sum() / occupation_counts
male_ratio.sort_values()

occupation
homemaker        0.142857
healthcare       0.312500
librarian        0.431373
artist           0.535714
administrator    0.544304
none             0.555556
writer           0.577778
marketing        0.615385
other            0.657143
student          0.693878
educator         0.726316
salesman         0.750000
lawyer           0.833333
entertainment    0.888889
scientist        0.903226
executive        0.906250
programmer       0.909091
retired          0.928571
technician       0.962963
engineer         0.970149
doctor           1.000000
dtype: float64

In [15]:
male_ratio = users_by_gender['M'] / occupation_counts
male_ratio.sort_values()

occupation
homemaker        0.142857
healthcare       0.312500
librarian        0.431373
artist           0.535714
administrator    0.544304
none             0.555556
writer           0.577778
marketing        0.615385
other            0.657143
student          0.693878
educator         0.726316
salesman         0.750000
lawyer           0.833333
entertainment    0.888889
scientist        0.903226
executive        0.906250
programmer       0.909091
retired          0.928571
technician       0.962963
engineer         0.970149
doctor           1.000000
dtype: float64

### Step 6. For each occupation, calculate the minimum and maximum ages

In [16]:
age_min_max = users.groupby('occupation')['age'].agg(['min', 'max'])
age_min_max

Unnamed: 0_level_0,min,max
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
administrator,21,70
artist,19,48
doctor,28,64
educator,23,63
engineer,22,70
entertainment,15,50
executive,22,69
healthcare,22,62
homemaker,20,50
lawyer,21,53


### Step 7. For each combination of occupation and gender, calculate the mean age

In [17]:
users.groupby(['occupation', 'gender']).age.mean()

occupation     gender
administrator  F         40.638889
               M         37.162791
artist         F         30.307692
               M         32.333333
doctor         M         43.571429
educator       F         39.115385
               M         43.101449
engineer       F         29.500000
               M         36.600000
entertainment  F         31.000000
               M         29.000000
executive      F         44.000000
               M         38.172414
healthcare     F         39.818182
               M         45.400000
homemaker      F         34.166667
               M         23.000000
lawyer         F         39.500000
               M         36.200000
librarian      F         40.000000
               M         40.000000
marketing      F         37.200000
               M         37.875000
none           F         36.500000
               M         18.600000
other          F         35.472222
               M         34.028986
programmer     F         32.16666

### Step 8.  For each occupation present the percentage of women and men

In [18]:
users_by_gender

gender,F,M
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
administrator,36.0,43.0
artist,13.0,15.0
doctor,,7.0
educator,26.0,69.0
engineer,2.0,65.0
entertainment,2.0,16.0
executive,3.0,29.0
healthcare,11.0,5.0
homemaker,6.0,1.0
lawyer,2.0,10.0


In [19]:
women = users_by_gender['F']
women = women.fillna(0)
women

occupation
administrator    36.0
artist           13.0
doctor            0.0
educator         26.0
engineer          2.0
entertainment     2.0
executive         3.0
healthcare       11.0
homemaker         6.0
lawyer            2.0
librarian        29.0
marketing        10.0
none              4.0
other            36.0
programmer        6.0
retired           1.0
salesman          3.0
scientist         3.0
student          60.0
technician        1.0
writer           19.0
Name: F, dtype: float64

In [20]:
men = users_by_gender['M']
men

occupation
administrator     43.0
artist            15.0
doctor             7.0
educator          69.0
engineer          65.0
entertainment     16.0
executive         29.0
healthcare         5.0
homemaker          1.0
lawyer            10.0
librarian         22.0
marketing         16.0
none               5.0
other             69.0
programmer        60.0
retired           13.0
salesman           9.0
scientist         28.0
student          136.0
technician        26.0
writer            26.0
Name: M, dtype: float64

In [21]:
gender_percentage = pd.DataFrame({
    '% Male': men / occupation_counts * 100,
    '% Female': women / occupation_counts * 100
}).round(1)

gender_percentage

Unnamed: 0_level_0,% Male,% Female
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
administrator,54.4,45.6
artist,53.6,46.4
doctor,100.0,0.0
educator,72.6,27.4
engineer,97.0,3.0
entertainment,88.9,11.1
executive,90.6,9.4
healthcare,31.2,68.8
homemaker,14.3,85.7
lawyer,83.3,16.7
