In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# %matplotlib inline

In [2]:
# Read data from file
df = pd.read_csv('adult.data.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
df['salary'].unique()

array(['<=50K', '>50K'], dtype=object)

In [6]:
# How many of each race are represented in this dataset? This should be a Pandas series with race names as the index labels.
df['education'].value_counts()

education
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: count, dtype: int64

In [7]:
df['sex'].value_counts()

sex
Male      21790
Female    10771
Name: count, dtype: int64

In [8]:
# What is the average age of men?
df[df['sex'] == 'Male']['age'].mean()

np.float64(39.43354749885268)

In [9]:
# What is the percentage of people who have a Bachelor's degree?
df['education'].value_counts()

education
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: count, dtype: int64

In [10]:
#selects bachelors degrees
bach_select = df[df['education'] == 'Bachelors']

#counts bachelors degrees
num_bach = len(bach_select)

#count total people
total_people = len(df)

#percent of bachelors
percentage_bachelors = (num_bach / total_people) * 100
print(percentage_bachelors)

16.44605509658794


In [11]:
# What percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?

# select and count all higher education 
high_ed = df[df['education'].isin(['Bachelors', 'Doctorate', 'Masters'])]
num_high_ed = len(high_ed)

# selecting higher education that make over 50k
high_ed_over50 = df[df['education'].isin(['Bachelors', 'Doctorate', 'Masters']) & (df['salary'] == '>50K' )] 
num_high_ed_over50 = len(high_ed_over50)

# percent of high education making over 50k
higher_education = (num_high_ed_over50 / num_high_ed) * 100
print(higher_education)

46.535843011613935


In [12]:
# What percentage of people without advanced education make more than 50K?

#select and count low education 
low_ed = df[~df['education'].isin(['Bachelors', 'Doctorate', 'Masters'])]
num_low_ed = len(low_ed)

# selct and count lower education that make over 50k
low_ed_over50 = df[~df['education'].isin(
    ['Bachelors', 'Doctorate', 'Masters']) & (df['salary'] == '>50K')]
num_low_ed_over50 = len(low_ed_over50)

# calculate percent of low education who make over 50k
lower_education_rich = (num_low_ed_over50 / num_low_ed) * 100
print(lower_education_rich)

17.3713601914639


In [13]:
# What is the minimum number of hours a person works per week (hours-per-week feature)?

df['hours-per-week'].min()

np.int64(1)

In [14]:
# What percentage of the people who work the minimum number of hours per week have a salary of >50K?

# find min hours worked
min_hours = df['hours-per-week'].min()

#select and count workers who worked min hours
min_hours_group = df[df['hours-per-week'] == min_hours]
num_min_workers = len(min_hours_group)

#count how many min hour workers made over 50K
min_hours_over50K = (min_hours_group['salary'] == '>50K').sum()

#calculate percentage 
rich_percentage = (min_hours_over50K / num_min_workers) * 100
print(rich_percentage)



10.0


In [15]:
# What country has the highest percentage of people that earn >50K?

# group by country and count over 50K
NSgrouped = df.groupby('native-country')['salary'].value_counts().unstack(fill_value=0)

# get percent of over 50k per country
NSgrouped['Percent_over50'] = (NSgrouped['>50K'] / NSgrouped.sum(axis=1))*100

#find highest country over 50k
highest_earning_country = NSgrouped['Percent_over50'].idxmax()
highest_earning_country_percentage = NSgrouped['Percent_over50'].max()
print(highest_earning_country)
print(highest_earning_country_percentage)


Iran
41.86046511627907


In [16]:
# Identify the most popular occupation for those who earn >50K in India.

filter_df = df[(df['salary']=='>50K') & (df['native-country']== 'India')]

count_occupation = filter_df['occupation'].value_counts()

top_IN_occupation = count_occupation.idxmax()
print(top_IN_occupation)





Prof-specialty
