In [125]:
import pandas as pd
import numpy as np


def calculate_demographic_data(print_data=True):
    # Read data from file
    df = pd.read_csv('adult.data.csv')

    # How many of each race are represented in this dataset? This should be a Pandas series with race names as the index labels.
    race_count = pd.Series(data = sorted([df['race'][df['race'] == race].count() for race in set(df['race'])], reverse=True), index=set(df['race']))

    # What is the average age of men?
    average_age_men = round(np.mean(df['age'][df['sex'] == 'Male']), 1)

    # What is the percentage of people who have a Bachelor's degree?
    percentage_bachelors = round((len(df[df['education'] == 'Bachelors'])*100/len(df)), 1)

    # What percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?
    # What percentage of people without advanced education make more than 50K?

    # with and without `Bachelors`, `Masters`, or `Doctorate`
    higher_education = ['Bachelors', 'Masters', 'Doctorate']
    lower_education = [deg for deg in set(df['education']) if deg not in higher_education]

    # percentage with salary >50K
    higher_education_rich = round((sum([df['salary'][df['education'] == deg][df['salary'] == '>50K']
    .count() for deg in higher_education]))*100/(sum([df['salary'][df['education'] == deg]
    .count() for deg in higher_education])), 1)
    lower_education_rich = round((sum([df['salary'][df['education'] == deg][df['salary'] == '>50K']
    .count() for deg in lower_education]))*100/(sum([df['salary'][df['education'] == deg]
    .count() for deg in lower_education])), 1)

    # What is the minimum number of hours a person works per week (hours-per-week feature)?
    min_work_hours = np.min(df['hours-per-week'])

    # What percentage of the people who work the minimum number of hours per week have a salary of >50K?
    num_min_workers = df['hours-per-week'] == min(df['hours-per-week'])

    rich_percentage = int(len(df[df['salary'] == '>50K'][num_min_workers])*100/len(df['salary'][num_min_workers]))

    # What country has the highest percentage of people that earn >50K?
    salary_above_50 = {}
    for country in set(df['native-country'][df['salary'] == '>50K']):
        salary_above_50[country] = salary_above_50.get(country, df['native-country'][df['native-country'] == country][df['salary'] == '>50K']
        .count()/df['native-country'][df['native-country'] == country]
        .count())
        
    highest_earning_country = str
    for key, val in salary_above_50.items(): 
        if val == max(list(salary_above_50.values())): 
            highest_earning_country = key

    highest_earning_country_percentage = round(max(list(salary_above_50.values()))*100, 1)

    # Identify the most popular occupation for those who earn >50K in India.
    occ_sensus = {}
    for occ in set(df['occupation'][df['native-country'] == 'India'][df['salary'] == '>50K']):
        occ_sensus[occ] = occ_sensus.get(occ, df['occupation'][df['native-country'] == 'India'][df['salary'] == '>50K'][df['occupation'] == occ].count())
        
    top_IN_occupation = str
    for key, val in occ_sensus.items():
        if val == max(list(occ_sensus.values())):
            top_IN_occupation = key

    # DO NOT MODIFY BELOW THIS LINE

    if print_data:
        print("Number of each race:\n", race_count) 
        print("Average age of men:", average_age_men)
        print(f"Percentage with Bachelors degrees: {percentage_bachelors}%")
        print(f"Percentage with higher education that earn >50K: {higher_education_rich}%")
        print(f"Percentage without higher education that earn >50K: {lower_education_rich}%")
        print(f"Min work time: {min_work_hours} hours/week")
        print(f"Percentage of rich among those who work fewest hours: {rich_percentage}%")
        print("Country with highest percentage of rich:", highest_earning_country)
        print(f"Highest percentage of rich people in country: {highest_earning_country_percentage}%")
        print("Top occupations in India:", top_IN_occupation)

    return {
        'race_count': race_count,
        'average_age_men': average_age_men,
        'percentage_bachelors': percentage_bachelors,
        'higher_education_rich': higher_education_rich,
        'lower_education_rich': lower_education_rich,
        'min_work_hours': min_work_hours,
        'rich_percentage': rich_percentage,
        'highest_earning_country': highest_earning_country,
        'highest_earning_country_percentage':
        highest_earning_country_percentage,
        'top_IN_occupation': top_IN_occupation
    }


In [120]:
df = pd.read_csv('adult.data.csv')
salary_above_50 = {}
for country in set(df['native-country'][df['salary'] == '>50K']):
    salary_above_50[country] = salary_above_50.get(country, df['native-country'][df['native-country'] == country][df['salary'] == '>50K']
    .count()/df['native-country'][df['native-country'] == country]
    .count())
print(max(salary_above_50.values()))
lis_country = list(salary_above_50.keys())
lis_percentage = list(salary_above_50.values())
print(lis_percentage)
print(lis_country.index(max(lis_percentage)))






0.4186046511627907
[0.25, 0.2504288164665523, 0.05132192846034215, 0.41379310344827586, 0.058823529411764705, 0.4, 0.08490566037735849, 0.2, 0.3, 0.26666666666666666, 0.30808080808080807, 0.10526315789473684, 0.03389830508474576, 0.046875, 0.06451612903225806, 0.3424657534246575, 0.3333333333333333, 0.07692307692307693, 0.20833333333333334, 0.10526315789473684, 0.12345679012345678, 0.375, 0.3684210526315789, 0.02857142857142857, 0.2, 0.32116788321167883, 0.4186046511627907, 0.14285714285714285, 0.32231404958677684, 0.09090909090909091, 0.27586206896551724, 0.24583476174151525, 0.16666666666666666, 0.07462686567164178, 0.39215686274509803, 0.3870967741935484, 0.23076923076923078, 0.2631578947368421, 0.10810810810810811, 0.1111111111111111]


ValueError: 0.4186046511627907 is not in list

In [80]:
rich_percentage = (df['salary'][df['salary'] == '>50K'][num_min_workers].count())*100/(df['salary'][num_min_workers].count())

NameError: name 'num_min_workers' is not defined

In [126]:
calculate_demographic_data(print_data=True)

  rich_percentage = int(len(df[df['salary'] == '>50K'][num_min_workers])*100/len(df['salary'][num_min_workers]))


Number of each race:
 Amer-Indian-Eskimo    27816
Other                  3124
White                  1039
Asian-Pac-Islander      311
Black                   271
dtype: int64
Average age of men: 39.4
Percentage with Bachelors degrees: 16.4%
Percentage with higher education that earn >50K: 46.5%
Percentage without higher education that earn >50K: 17.4%
Min work time: 1 hours/week
Percentage of rich among those who work fewest hours: 10%
Country with highest percentage of rich: Iran
Highest percentage of rich people in country: 41.9%
Top occupations in India: Prof-specialty


{'race_count': Amer-Indian-Eskimo    27816
 Other                  3124
 White                  1039
 Asian-Pac-Islander      311
 Black                   271
 dtype: int64,
 'average_age_men': 39.4,
 'percentage_bachelors': 16.4,
 'higher_education_rich': 46.5,
 'lower_education_rich': 17.4,
 'min_work_hours': 1,
 'rich_percentage': 10,
 'highest_earning_country': 'Iran',
 'highest_earning_country_percentage': 41.9,
 'top_IN_occupation': 'Prof-specialty'}