# Demographic Data analyser

### Import the libraries 

In [1]:
import numpy as np
import pandas as pd


In [2]:
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 
                'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
                'hours-per-week', 'native-country', 'income']

csv_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
    
df = pd.read_csv(csv_url,sep=',')
df.columns = column_names

### Number of people of each race that are represented in this dataset

In [4]:
def count_races():
    df['race'] = df['race'].str.strip()
    races = pd.Series(df['race'].value_counts())
    return races

count_races()

race
White                 27815
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: count, dtype: int64

### Average age of men

In [5]:
def average_age_of_men():
    df['sex'] = df['sex'].str.strip()
    avg_age_men = df[df['sex'] == 'Male']['age'].mean()
    return round(avg_age_men,1)

average_age_of_men()

39.4

### Percentage of people having a Bachelor's degree

In [6]:
def percentage_of_Bachelors():
    df['education'] = df['education'].str.strip()
    df['income'] = df['income'].str.strip()
    percentage = df[df['education'] == "Bachelors"]['education'].size / df['education'].size * 100
    return round(percentage,1)

percentage_of_Bachelors()

16.4

### Percentage of educated people who earn more than 50K 

In [7]:
def advanced_educted_earning_more_than_50k():
    educational_levels = ['Bachelors','Masters','Doctorate']
    advanced_educated = df[df['education'].isin(educational_levels)]
    advanced_educated_more_than_50K = advanced_educated[advanced_educated['income'] == '>50K']['income']
    percentage_of_advanced_educated_earning_more_than_50K = advanced_educated_more_than_50K.size / advanced_educated['education'].size * 100
    return round(percentage_of_advanced_educated_earning_more_than_50K,1)

advanced_educted_earning_more_than_50k()

46.5

### Non-educated people who earn more than 50K

In [8]:
def non_advanced_educted_earning_more_than_50k():
    educational_levels = ['Bachelors','Masters','Doctorate']
    non_advanced_educated = df[~df['education'].isin(educational_levels)]
    non_advanced_educated_more_than_50K = non_advanced_educated[non_advanced_educated['income'] == '>50K']['income']
    percentage_of_non_advanced_educated_earning_more_than_50K = non_advanced_educated_more_than_50K.size / non_advanced_educated['education'].size * 100
    return round(percentage_of_non_advanced_educated_earning_more_than_50K,1)

non_advanced_educted_earning_more_than_50k()

17.4

### Minimum number of hours a person works per week

In [9]:
def minimum_no_of_hour_per_week():
    return min(df['hours-per-week'])

minimum_no_of_hour_per_week()

1

### Minimum number of hours a person works per week who earn more than 50K

In [10]:
def minimum_no_of_hour_per_week_earn_more_than_50k():
    minimum_no_of_work_per_week = df[df['hours-per-week'] < 2]
    minimum_no_of_work_per_week_earn_more_than_50k = minimum_no_of_work_per_week[minimum_no_of_work_per_week['income']== '>50K']['income']
    people_who_work_minimum_no_of_hours_with_income_50k = minimum_no_of_work_per_week_earn_more_than_50k.size /minimum_no_of_work_per_week['hours-per-week'].size * 100
    return people_who_work_minimum_no_of_hours_with_income_50k

minimum_no_of_hour_per_week_earn_more_than_50k()

10.0

### Country with the highest percentage of people that earn >50K and that percentage

In [11]:
def highest_percent_of_50k_earners():
    df['native-country'] = df['native-country'].str.strip()
    df['occupation'] = df['occupation'].str.strip()
    people_earn_more_than_50K = df[df['income'] == ">50K"]
    max_50k_earners = max(pd.Series(people_earn_more_than_50K['native-country']).value_counts())
    country = df.loc[max_50k_earners]['native-country']
    percentage = max_50k_earners / people_earn_more_than_50K['native-country'].size * 100
    return country,round(percentage,1)

highest_percent_of_50k_earners()

('United-States', 91.5)

### Most popular Occupation in india which earns more than 50K

In [12]:
def most_popular_occupation_in_india():
    people_earn_more_than_50K = df[df['income'] == ">50K"]
    people_earn_more_than_50K_india = people_earn_more_than_50K[people_earn_more_than_50K['native-country']== "India"]
    most_popular_occupation = max(pd.Series(people_earn_more_than_50K_india['occupation']).value_counts())
    return df.loc[most_popular_occupation]['occupation']

most_popular_occupation_in_india()


'Craft-repair'