# Analysis of US Adult Census Data, 1994

## Importing NumPy and pandas for data analysis

In [1]:
import numpy as np
import pandas as pd
# Limit the floating-point precision pandas uses when displaying values to 2 decimal places.
pd.set_option('display.precision',2)

## Loading online dataset

In [2]:
# Location of the UCI "Adult" census extract that is loaded below.
DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# Column labels are supplied explicitly through `names`, in file order.
COLUMN_NAMES = [
    'Age', 'Workclass', 'Fnlwgt', 'Education', 'Education-num',
    'Marital-status', 'Occupation', 'Relationship', 'Race', 'Sex',
    'Capital-Gain', 'Capital-Loss', 'Hours-per-Week', 'Native-Country',
    'Salary',
]

# NOTE: text values keep a leading space after loading (e.g. ' Female', ' >50K'),
# which is why the comparisons in the cells below include that space.
df = pd.read_csv(DATA_URL, names=COLUMN_NAMES)

# Preview the first ten rows.
df.head(10)

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-per-Week,Native-Country,Salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


## Getting the size of the data

In [3]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(32561, 15)

## Analysing each feature of the dataset

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Age,Fnlwgt,Education-num,Capital-Gain,Capital-Loss,Hours-per-Week
count,32561.0,32600.0,32561.0,32561.0,32561.0,32561.0
mean,38.58,190000.0,10.08,1077.65,87.3,40.44
std,13.64,106000.0,2.57,7385.29,402.96,12.35
min,17.0,12300.0,1.0,0.0,0.0,1.0
25%,28.0,118000.0,9.0,0.0,0.0,40.0
50%,37.0,178000.0,10.0,0.0,0.0,40.0
75%,48.0,237000.0,12.0,0.0,0.0,45.0
max,90.0,1480000.0,16.0,99999.0,4356.0,99.0


In [5]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Age             32561 non-null  int64 
 1   Workclass       32561 non-null  object
 2   Fnlwgt          32561 non-null  int64 
 3   Education       32561 non-null  object
 4   Education-num   32561 non-null  int64 
 5   Marital-status  32561 non-null  object
 6   Occupation      32561 non-null  object
 7   Relationship    32561 non-null  object
 8   Race            32561 non-null  object
 9   Sex             32561 non-null  object
 10  Capital-Gain    32561 non-null  int64 
 11  Capital-Loss    32561 non-null  int64 
 12  Hours-per-Week  32561 non-null  int64 
 13  Native-Country  32561 non-null  object
 14  Salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


## Analysing the number of males and females

In [6]:
# Categorical summary (count / unique / top / freq) of the `Sex` column only.
df['Sex'].describe()

count     32561
unique        2
top        Male
freq      21790
Name: Sex, dtype: object

In [7]:
# Number of rows for each value of `Sex`, most frequent first.
df['Sex'].value_counts()

 Male      21790
 Female    10771
Name: Sex, dtype: int64

In [8]:
# Mean age across the rows recorded as female (values carry a leading space).
is_female = df['Sex'] == ' Female'
df.loc[is_female, 'Age'].mean()

36.85823043357163

## Analysing the percentage of people from Germany and India in the US Census Data, 1994

In [9]:
# Number of rows per native country, most frequent first.
# NOTE(review): ' ?' appears as a value here — presumably an "unknown" marker; confirm against the dataset docs.
df['Native-Country'].value_counts()

 United-States                 29170
 Mexico                          643
 ?                               583
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          67
 Guatemala                        64
 Japan                            62
 Poland                           60
 Columbia                         59
 Taiwan                           51
 Haiti                            44
 Iran                             43
 Portugal                         37
 Nicaragua                        34
 Peru                             31
 

In [10]:
# Percentage of rows whose native country is Germany, relative to every row
# that has a (non-null) Native-Country value.
native_country = df['Native-Country']
no_of_germans = (native_country == ' Germany').sum()
total_natives = native_country.count()
no_of_germans / total_natives * 100

0.42074874850281013

In [11]:
# Percentage of rows whose native country is India, relative to every row
# that has a (non-null) Native-Country value.
native_country = df['Native-Country']
no_of_indians = (native_country == ' India').sum()
total_natives = native_country.count()
no_of_indians / total_natives * 100

0.3071158748195694

## Analysing the relation between age and salary

In [12]:
# Split the ages by salary bracket, then report mean +- standard deviation
# (both rounded to whole years) for each bracket.
earns_over_50k = df['Salary'] == ' >50K'
earns_up_to_50k = df['Salary'] == ' <=50K'

age_rich = df.loc[earns_over_50k, 'Age']
age_poor = df.loc[earns_up_to_50k, 'Age']

print(f'The avg. age of rich are {round(age_rich.mean())}+-{round(age_rich.std())} and avg. age of poor are {round(age_poor.mean())}+-{round(age_poor.std())} ')

The avg. age of rich are 44+-11 and avg. age of poor are 37+-14 


## Analysing whether people who are only high-school graduates earn more than 50K

In [13]:
# Education levels of the people earning >50K, counted and ordered from most
# to least frequent; only the six least frequent levels are shown.
rich_education = df.loc[df['Salary'] == ' >50K', 'Education']
rich_education.value_counts().tail(6)

 11th       60
 7th-8th    40
 12th       33
 9th        27
 5th-6th    16
 1st-4th     6
Name: Education, dtype: int64

## What is the maximum age recorded for the Amer-Indian-Eskimo race in the census data?

In [14]:
# Highest age found among rows whose race is ' Amer-Indian-Eskimo'.
is_amer_indian_eskimo = df['Race'] == ' Amer-Indian-Eskimo'
df.loc[is_amer_indian_eskimo, 'Age'].max()

82

## Who earns more: married or single people?

In [15]:
# Compare how many people earning >50K are married versus not married.
#
# "Married" means any Marital-status value that starts with ' Married'
# (note the leading space kept by the loader); everyone else earning >50K
# is reported as "singles".

# Vectorised prefix test instead of a per-row Python lambda; na=False makes a
# missing marital status count as "not married" rather than raising.
is_married = df['Marital-status'].str.startswith(' Married', na=False)
married = df[is_married]

# Summing a boolean mask counts the matching rows without counting every
# column of the frame.
married_rich = (married['Salary'] == ' >50K').sum()
total_rich = (df['Salary'] == ' >50K').sum()

single_rich = total_rich - married_rich
print(f'married people: {married_rich}, singles: {single_rich}')

married people: 6736, singles: 1105


## Relation between Hard Work and Money

In [16]:
# Rows reporting exactly 99 hours per week.
works_99_hours = df['Hours-per-Week'] == 99
hard_work = df.loc[works_99_hours]

# How many such rows exist, and how many of them earn >50K.
total_hard_work = hard_work['Hours-per-Week'].count()
hard_work_money = hard_work.loc[hard_work['Salary'] == ' >50K', 'Salary'].count()

# Percentage of the 99-hour rows that earn >50K.
hard_work_money / total_hard_work * 100

29.411764705882355

## Analysing immigrants from two countries (Japan, India) and the relation between their working hours and salary

In [17]:
# `japan` first holds every row earning <=50K (all countries); the Japan
# filter is applied only in the expression below.
japan = df[df['Salary'].eq(' <=50K')]

# Numeric summary of the <=50K earners whose native country is Japan.
japan[japan['Native-Country'].eq(' Japan')].describe()

Unnamed: 0,Age,Fnlwgt,Education-num,Capital-Gain,Capital-Loss,Hours-per-Week
count,38.0,38.0,38.0,38.0,38.0,38.0
mean,35.32,197242.53,10.71,201.47,41.84,41.0
std,9.39,110715.68,1.84,702.34,257.93,11.9
min,19.0,29807.0,9.0,0.0,0.0,10.0
25%,29.0,136372.25,9.0,0.0,0.0,40.0
50%,34.0,167426.0,10.0,0.0,0.0,40.0
75%,41.0,249460.0,13.0,0.0,0.0,41.5
max,61.0,536725.0,14.0,2885.0,1590.0,65.0


In [18]:
# `japan` first holds every row earning >50K (all countries); the Japan
# filter is applied only in the expression below.
japan = df[df['Salary'].eq(' >50K')]

# Numeric summary of the >50K earners whose native country is Japan.
japan[japan['Native-Country'].eq(' Japan')].describe()

Unnamed: 0,Age,Fnlwgt,Education-num,Capital-Gain,Capital-Loss,Hours-per-Week
count,24.0,24.0,24.0,24.0,24.0,24.0
mean,42.88,167433.71,12.58,6258.75,82.38,47.96
std,6.77,131000.39,2.36,20485.67,403.55,16.12
min,33.0,22201.0,6.0,0.0,0.0,21.0
25%,37.75,81099.75,12.75,0.0,0.0,40.0
50%,41.0,124334.0,13.0,0.0,0.0,42.5
75%,47.25,195285.0,14.0,1196.75,0.0,50.0
max,56.0,586657.0,16.0,99999.0,1977.0,99.0


In [19]:
# `india` first holds every row earning <=50K (all countries); the India
# filter is applied only in the expression below.
india = df[df['Salary'].eq(' <=50K')]

# Numeric summary of the <=50K earners whose native country is India.
india[india['Native-Country'].eq(' India')].describe()

Unnamed: 0,Age,Fnlwgt,Education-num,Capital-Gain,Capital-Loss,Hours-per-Week
count,60.0,60.0,60.0,60.0,60.0,60.0
mean,35.73,171203.47,11.48,209.92,26.32,38.23
std,11.45,80260.07,3.16,884.73,203.85,12.62
min,17.0,31740.0,1.0,0.0,0.0,8.0
25%,27.0,122145.25,10.0,0.0,0.0,35.0
50%,32.0,148114.0,13.0,0.0,0.0,40.0
75%,46.0,191359.25,14.0,0.0,0.0,40.0
max,61.0,361280.0,16.0,5013.0,1579.0,84.0


In [20]:
# `india` first holds every row earning >50K (all countries); the India
# filter is applied only in the expression below.
india = df[df['Salary'].eq(' >50K')]

# Numeric summary of the >50K earners whose native country is India.
india[india['Native-Country'].eq(' India')].describe()

Unnamed: 0,Age,Fnlwgt,Education-num,Capital-Gain,Capital-Loss,Hours-per-Week
count,40.0,40.0,40.0,40.0,40.0,40.0
mean,41.62,164542.52,13.85,8697.42,288.32,46.48
std,9.92,75060.19,1.58,26507.59,705.69,10.06
min,23.0,23510.0,9.0,0.0,0.0,20.0
25%,34.0,115244.0,13.0,0.0,0.0,40.0
50%,41.5,143003.0,14.0,0.0,0.0,41.5
75%,48.75,211534.5,15.0,775.75,0.0,50.0
max,61.0,366957.0,16.0,99999.0,2415.0,72.0
