In [2]:
import pandas as pd
import numpy as np


In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/fuzzyray/demographic-data-analyzer/main/adult.data.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [5]:
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [92]:
def calculate_demographic_data(print_data = True):
  #how many people of each race are represented in this data.
  race_count = df['race'].value_counts()

  #what is the average age of men
  average_age_male = df.loc[df['sex'] == 'Male']['age'].mean().round(decimals=1)

  #what is the percentage of of people who have a bachelor's degree
  bachelors_count = df.loc[df['education']== 'Bachelors']['education'].count()
  total_count= df['education'].count()
  percentage_bachelors= (bachelors_count / total_count * 100).round(decimals=1)

  #what percentage of people with advanced education (bachelors, master, doctorate) make more than 50k
  #what percebntage of people without advanced education make more than 50k
  education_salary_df = pd.DataFrame(df.groupby(df['education'])['salary'].value_counts())
  education_salary_df = education_salary_df.rename(columns={"salary":"counts"})
  high_salary_df = education_salary_df.loc[(slice(None), '>50K'), :]

  #with and without bachelors, masters, doctorate
  higher_education= education_salary_df.loc[['Bachelors', 'Masters', 'Doctorate']].sum()
  lower_education= high_salary_df.sum()-higher_education
  
  #percentage salary with over 50k
  high_education_rich_count= high_salary_df.loc[['Bachelors', 'Masters', 'Doctorate']].sum()
  low_education_rich_count= high_salary_df.sum()-high_education_rich_count
  
  higher_education_rich=(high_education_rich_count / higher_education * 100 ).round(decimals=1)
  lower_education_rich=(low_education_rich_count / lower_education *100).round(decimals=1)
  
  # What is the minimum number of hours a person works per week (hours-per-week feature)?
  min_work_hours = df['hours-per-week'].min()

  hours_worked_salary_df = pd.DataFrame(df.groupby(df['hours-per-week'])['salary'].value_counts())
  hours_worked_salary_df = hours_worked_salary_df.rename(columns={'salary':'counts'})
  min_hours_worked_salary_df = hours_worked_salary_df.loc[min_work_hours,:]
  
  # What percentage of the people who work the minimum number of hours per week have a salary of >50K?
  num_min_workers= min_hours_worked_salary_df.sum()
  rich_percentage = float((min_hours_worked_salary_df.loc['>50K'] / num_min_workers * 100).round(decimals=1))

  #what country has the highest percentage of people that earn >50k
  country_counts_df = pd.DataFrame(df.groupby(df['native-country'])['salary'].count())
  country_counts_df = country_counts_df.rename(columns={'salary':'counts'}).reset_index()
  country_rich_counts_df = pd.DataFrame(df.groupby(df['native-country'])['salary'].value_counts())
  country_rich_counts_df = country_rich_counts_df.loc[(slice(None), '>50K'), :]
  country_rich_counts_df= country_rich_counts_df.rename(columns={'salary': 'rich-counts'})
  country_rich_counts_df= country_rich_counts_df.reset_index()[['native-country', 'rich-counts']]
  country_counts_df = country_counts_df.merge(country_rich_counts_df, on= 'native-country')
  country_counts_df['rich-percent']= (country_counts_df['rich-counts'] / country_counts_df['counts'] * 100)
  country_counts_df['rich-percent']= country_counts_df['rich-counts'].round(decimals=1)
  top_country= country_counts_df.sort_values('rich-percent', ascending= False).head(1)

  highest_earned_country= top_country.iloc[0]['native-country']
  highest_earned_country_percentage = top_country.iloc[0]['rich-percent']

  #identify the most popular occupation for those who earned >50k in india
  india_df= df.loc[df['native-country']=='India']
  india_df = india_df.loc[df['salary'] == '>50K']
  india_df = pd.DataFrame(india_df.groupby('native-country')['occupation'].value_counts())
  india_df = india_df.rename(columns={'occupation':'counts'})
  india_df = india_df.reset_index().sort_values('counts', ascending= False).head(1)

  top_In_occupation = india_df.iloc[0]['occupation']

  if print_data:{
      print('Number of each race:\n', race_count),
      print("Average age of men:", average_age_male),
      print(f'Percentage with Bachelors degrees: {percentage_bachelors}%'),
      print(f'percentage with higher education that earn >50k: {higher_education_rich}%'),
      print(f'percentage without higher education that earn >50k: {lower_education_rich}%'),
      print(f'Min work time: {min_work_hours} hours/week'),
      print(f'percentage of rich among those who work fewest hours:{rich_percentage}%'),
      print(f'country with highest percentage of rich:', highest_earned_country),
      print(f'highest percentage of rich people in country:{highest_earned_country_percentage}%'),
      print('top occuations in india:', top_In_occupation)


  }

  return{
      'race_count':race_count,
      'average_age_male': average_age_male,
      'percentage_bachelors': percentage_bachelors,
      'higher_education_rich': higher_education_rich,
      'lower_education_rich': lower_education_rich,
      'min_work_hours': min_work_hours,
      'rich_percentage' : rich_percentage,
      'highest_earned_country': highest_earned_country,
      'highest_earned_country_percentage': highest_earned_country_percentage,
      'top_In_occupation': top_In_occupation
  }












  


In [93]:
calculate_demographic_data()

Number of each race:
 White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64
Average age of men: 39.4
Percentage with Bachelors degrees: 16.4%
percentage with higher education that earn >50k: counts    46.5
dtype: float64%
percentage without higher education that earn >50k: counts    1244.3
dtype: float64%
Min work time: 1 hours/week
percentage of rich among those who work fewest hours:10.0%
country with highest percentage of rich: United-States
highest percentage of rich people in country:7171%
top occuations in india: Prof-specialty


{'race_count': White                 27816
 Black                  3124
 Asian-Pac-Islander     1039
 Amer-Indian-Eskimo      311
 Other                   271
 Name: race, dtype: int64,
 'average_age_male': 39.4,
 'percentage_bachelors': 16.4,
 'higher_education_rich': counts    46.5
 dtype: float64,
 'lower_education_rich': counts    1244.3
 dtype: float64,
 'min_work_hours': 1,
 'rich_percentage': 10.0,
 'highest_earned_country': 'United-States',
 'highest_earned_country_percentage': 7171,
 'top_In_occupation': 'Prof-specialty'}