# Demographic Data Analyzer

## Importing pandas and loading the dataset

In [64]:
import pandas as pd

In [65]:
# Colab-specific: mount Google Drive at /content/drive so that files stored
# there can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [66]:
# Folder on the mounted drive that holds the dataset. The trailing slash is
# required: the file name is appended to this string by plain concatenation
# when the CSV is read.
Path_to_data = '/content/drive/My Drive/Datasets/'

In [67]:
# Read the demographic dataset from the configured data folder into `df`,
# the frame every later cell works from.
df = pd.read_csv(f"{Path_to_data}adult.data.csv")

In [68]:
# Preview the first rows to check the columns and value formats.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## How many people of each race are represented in this dataset?

In [69]:
# Number of rows for each distinct value in the "race" column, ordered from
# most to least frequent (value_counts' default ordering).
race_count = df["race"].value_counts()

In [70]:
# Display the per-race counts.
race_count

White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64

## What is the average age of men?

In [71]:
# Mean of the "age" column restricted to rows where sex is "Male"; rows and
# column are selected in a single .loc step.
average_age_men = df.loc[df["sex"].eq("Male"), "age"].mean()

In [72]:
# Display the average age of men rounded to one decimal place.
round(average_age_men, 1)

39.4

## What is the percentage of people who have a Bachelor's degree?

In [73]:
# Number of rows for each education level, most frequent first. Reused below
# to look up the "Bachelors" count.
num = df["education"].value_counts()

In [74]:
# Display the per-education-level counts.
num

HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: education, dtype: int64

In [75]:
# Percentage of rows whose education is "Bachelors", to one decimal place.
# normalize=True turns each level's count into its share of all counted rows.
percentage_bachelors = (
    df["education"].value_counts(normalize=True)["Bachelors"] * 100
).round(1)

In [76]:
# Display the Bachelors percentage.
percentage_bachelors

16.4

## What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?

In [77]:
# Subset of rows whose education is one of the three advanced degrees.
higher_education = df[
    df["education"].isin(["Bachelors", "Masters", "Doctorate"])
]

In [78]:
# Display the advanced-education subset for inspection.
higher_education

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32538,38,Private,139180,Bachelors,13,Divorced,Prof-specialty,Unmarried,Black,Female,15020,0,45,United-States,>50K
32539,71,?,287372,Doctorate,16,Married-civ-spouse,?,Husband,White,Male,0,0,10,United-States,>50K
32544,31,Private,199655,Masters,14,Divorced,Other-service,Not-in-family,Other,Female,0,0,30,United-States,<=50K
32553,32,Private,116138,Masters,14,Never-married,Tech-support,Not-in-family,Asian-Pac-Islander,Male,0,0,11,Taiwan,<=50K


In [79]:
# Number of rows per salary bracket within the advanced-education subset.
# Despite the name, this is a Series of counts, not a DataFrame.
new_df0 = higher_education["salary"].value_counts()

In [80]:
# Display the salary-bracket counts for the advanced-education subset.
new_df0

<=50K    4005
>50K     3486
Name: salary, dtype: int64

In [81]:
# Percentage of the advanced-education subset in the ">50K" salary bracket,
# to one decimal place (the bracket's share of all counted salary values).
higher_education_rich = (
    higher_education["salary"].value_counts(normalize=True)[">50K"] * 100
).round(1)

In [82]:
# Display the >50K percentage for people with advanced education.
higher_education_rich

46.5

## What percentage of people without advanced education make more than 50K?

In [83]:
# Complement of the advanced-degree filter: every row whose education is NOT
# Bachelors, Masters, or Doctorate. `~` is the element-wise NOT for boolean
# masks; unary `-` is an arithmetic negation and not the supported spelling
# for inverting a mask.
lower_education = df.loc[~df["education"].isin(["Bachelors", "Masters", "Doctorate"])]

In [84]:
# Display the subset without advanced education for inspection.
lower_education

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
10,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [85]:
# Number of rows per salary bracket within the subset without advanced
# education. Despite the name, this is a Series of counts, not a DataFrame.
new_df1 = lower_education["salary"].value_counts()

In [86]:
# Display the salary-bracket counts for the subset without advanced education.
new_df1

<=50K    20715
>50K      4355
Name: salary, dtype: int64

In [87]:
# Percentage of the subset without advanced education in the ">50K" salary
# bracket, to one decimal place.
lower_education_rich = (
    lower_education["salary"].value_counts(normalize=True)[">50K"] * 100
).round(1)

In [88]:
# Display the >50K percentage for people without advanced education.
lower_education_rich

17.4

## What is the minimum number of hours a person works per week?

In [89]:
# Smallest value found in the "hours-per-week" column.
min_work_hours = df["hours-per-week"].min()

In [90]:
# Display the minimum weekly working hours.
min_work_hours

1

## What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?

In [91]:
# Rows whose weekly hours equal the minimum found in the dataset.
min_work_people = df.loc[df["hours-per-week"].eq(min_work_hours)]

In [92]:
# Display the rows of people who work the minimum number of hours.
min_work_people

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
189,58,State-gov,109567,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,1,United-States,>50K
1036,66,Self-emp-inc,150726,9th,5,Married-civ-spouse,Exec-managerial,Husband,White,Male,1409,0,1,?,<=50K
1262,69,?,195779,Assoc-voc,11,Widowed,?,Not-in-family,White,Female,0,0,1,United-States,<=50K
5590,78,?,363134,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,0,1,United-States,<=50K
5632,45,?,189564,Masters,14,Married-civ-spouse,?,Wife,White,Female,0,0,1,United-States,<=50K
5766,62,?,97231,Some-college,10,Married-civ-spouse,?,Wife,White,Female,0,0,1,United-States,<=50K
5808,76,?,211574,10th,6,Married-civ-spouse,?,Husband,White,Male,0,0,1,United-States,<=50K
8447,67,?,244122,Assoc-voc,11,Widowed,?,Not-in-family,White,Female,0,0,1,United-States,<=50K
9147,75,?,260543,10th,6,Widowed,?,Other-relative,Asian-Pac-Islander,Female,0,0,1,China,<=50K
11451,27,Private,147951,HS-grad,9,Never-married,Machine-op-inspct,Other-relative,White,Male,0,0,1,United-States,<=50K


In [93]:
# Number of minimum-hours rows in each salary bracket.
numOfSalaries = min_work_people["salary"].value_counts()

In [94]:
# Display the salary-bracket counts for minimum-hours workers.
numOfSalaries

<=50K    18
>50K      2
Name: salary, dtype: int64

In [95]:
# Total number of minimum-hours rows, obtained by summing the per-bracket
# counts; used as the denominator of the percentage below.
num_min_workers = numOfSalaries.sum()

In [96]:
# Display the number of minimum-hours workers.
num_min_workers

20

In [97]:
# Percentage of minimum-hours workers in the ">50K" salary bracket.
# NOTE(review): unlike the other percentage cells in this notebook, this
# value is not rounded to one decimal place — confirm that is intended.
rich_percentage = numOfSalaries[">50K"] / num_min_workers * 100

In [98]:
# Display the >50K percentage among minimum-hours workers.
rich_percentage

10.0

## What country has the highest percentage of people that earn >50K and what is that percentage?

In [99]:
# Country with the largest share of >50K earners.
# The group-wise mean of the boolean "salary is >50K" flag is that share per
# native country; countries with no such rows get 0.0 directly, so there is
# no NaN to fill. idxmax() scans every value itself, so no sort is needed
# first (an unstable sort would also leave the winner unspecified when two
# countries tie).
highest_earning_country = (
    df["salary"].eq(">50K")
    .groupby(df["native-country"])
    .mean()
    .mul(100)
    .idxmax()
)

In [100]:
# Display the country with the highest share of >50K earners.
highest_earning_country

'Iran'

In [101]:
# Largest per-country share of >50K earners, in percent (unrounded).
# The group-wise mean of the boolean "salary is >50K" flag is that share per
# native country; countries with no such rows get 0.0 directly, so no NaN
# filling is required, and max() needs no prior sort.
highest_earning_country_percentage = (
    df["salary"].eq(">50K")
    .groupby(df["native-country"])
    .mean()
    .mul(100)
    .max()
)

In [102]:
# Display the highest per-country >50K percentage, to one decimal place.
round(highest_earning_country_percentage, 1)

41.9

## Identify the most popular occupation for those who earn >50K in India.

In [103]:
# Most frequent occupation among rows that are both in the ">50K" salary
# bracket and have "India" as native country.
# Both conditions are combined into one mask and applied in a single .loc
# step. Applying a second full-length mask to an already-filtered frame makes
# pandas reindex the mask and emit a UserWarning.
top_IN_occupation = (
    df.loc[
        (df["salary"] == ">50K") & (df["native-country"] == "India"),
        "occupation",
    ]
    .value_counts()
    .idxmax()
)

  """Entry point for launching an IPython kernel.


In [104]:
# Display the most common occupation among >50K earners from India.
top_IN_occupation

'Prof-specialty'