In [1]:
# Import required modules
import pandas as pd
import numpy as np

In [2]:
# Load the raw dataset from Input_Dataset.csv (path is relative to the
# working directory) and preview its first five rows
file = pd.read_csv(filepath_or_buffer='./Input_Dataset.csv')
file.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
# Summarise the dataframe: number of rows, column names, non-null counts,
# dtypes and memory usage
file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [4]:
# Give four of the hyphenated columns shorter, underscore-style names
column_aliases = {
    'native-country': 'country',
    'educational-num': 'education_level',
    'hours-per-week': 'weekly_hours',
    'marital-status': 'marital_status',
}
file = file.rename(columns=column_aliases)

In [5]:
# Derive the net capital of each row: capital gain minus capital loss.
# The new column is appended at the end of the frame.
file['net_capital'] = file['capital-gain'].sub(file['capital-loss'])

In [6]:
# Move net_capital from the end of the frame to position 12, which places it
# directly after capital-gain / capital-loss and just before weekly_hours
net_capital_values = file.pop('net_capital')
file.insert(loc=12, column='net_capital', value=net_capital_values)

In [7]:
# Drop the columns that are no longer needed: capital-gain and capital-loss
# are now summarised by net_capital, and education is removed as well
# (presumably because education_level already encodes it numerically -- confirm).
# The result is reassigned rather than mutated with inplace=True.
file = file.drop(columns=['capital-gain', 'capital-loss', 'education'])

In [8]:
# '?' is the dataset's placeholder for a missing value. Flag every row that
# carries it in workclass, occupation or country, then drop those rows.
# `file` itself is left unchanged; the filtered copy is stored in file1.
placeholder_mask = file[['workclass', 'occupation', 'country']].eq('?').any(axis=1)
file1 = file.drop(index=file.index[placeholder_mask.to_numpy()])

file1

Unnamed: 0,age,workclass,fnlwgt,education_level,marital_status,occupation,relationship,race,gender,net_capital,weekly_hours,country,income
0,25,Private,226802,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,40,United-States,<=50K
1,38,Private,89814,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,50,United-States,<=50K
2,28,Local-gov,336951,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,40,United-States,>50K
3,44,Private,160323,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,40,United-States,>50K
5,34,Private,198693,6,Never-married,Other-service,Not-in-family,White,Male,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,38,United-States,<=50K
48838,40,Private,154374,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,40,United-States,>50K
48839,58,Private,151910,9,Widowed,Adm-clerical,Unmarried,White,Female,0,40,United-States,<=50K
48840,22,Private,201490,9,Never-married,Adm-clerical,Own-child,White,Male,0,20,United-States,<=50K


In [9]:
# Alternative approach to handling missing values: first turn every '?'
# placeholder into NaN so pandas' missing-value tools can be applied
file = file.replace(to_replace='?', value=np.nan)

In [10]:
# Count the NaN entries in each column
file.isna().sum(axis=0)

age                   0
workclass          2799
fnlwgt                0
education_level       0
marital_status        0
occupation         2809
relationship          0
race                  0
gender                0
net_capital           0
weekly_hours          0
country             857
income                0
dtype: int64

In [11]:
# Show only the rows in which at least one column is NaN
row_has_nan = file.isna().any(axis='columns')
file.loc[row_has_nan]

Unnamed: 0,age,workclass,fnlwgt,education_level,marital_status,occupation,relationship,race,gender,net_capital,weekly_hours,country,income
4,18,,103497,10,Never-married,,Own-child,White,Female,0,30,United-States,<=50K
6,29,,227026,9,Never-married,,Unmarried,Black,Male,0,40,United-States,<=50K
13,58,,299831,9,Married-civ-spouse,,Husband,White,Male,0,35,United-States,<=50K
19,40,Private,85019,16,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,45,,>50K
22,72,,132015,4,Divorced,,Not-in-family,White,Female,0,6,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48811,35,,320084,13,Married-civ-spouse,,Wife,White,Female,0,55,United-States,>50K
48812,30,,33811,13,Never-married,,Not-in-family,Asian-Pac-Islander,Female,0,99,United-States,<=50K
48820,71,,287372,16,Married-civ-spouse,,Husband,White,Male,0,10,United-States,>50K
48822,41,,202822,9,Separated,,Not-in-family,Black,Female,0,32,United-States,<=50K


In [12]:
# Drop every row that contains at least one NaN. dropna() returns a new frame
# and leaves `file` untouched, so the result is bound to a name instead of
# being discarded once it has been displayed.
file_no_nan = file.dropna()

# Sanity check: this NaN-based route must keep exactly the same rows as the
# '?'-based filtering that produced file1 earlier in the notebook.
assert file_no_nan.index.equals(file1.index), "dropna() and '?' filtering disagree"

file_no_nan

Unnamed: 0,age,workclass,fnlwgt,education_level,marital_status,occupation,relationship,race,gender,net_capital,weekly_hours,country,income
0,25,Private,226802,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,40,United-States,<=50K
1,38,Private,89814,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,50,United-States,<=50K
2,28,Local-gov,336951,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,40,United-States,>50K
3,44,Private,160323,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,40,United-States,>50K
5,34,Private,198693,6,Never-married,Other-service,Not-in-family,White,Male,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,38,United-States,<=50K
48838,40,Private,154374,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,40,United-States,>50K
48839,58,Private,151910,9,Widowed,Adm-clerical,Unmarried,White,Female,0,40,United-States,<=50K
48840,22,Private,201490,9,Never-married,Adm-clerical,Own-child,White,Male,0,20,United-States,<=50K


In [13]:
# Export the cleaned frame (file1) to input.csv. The row index is written as
# the first column under the header 'id'; since rows were dropped without
# resetting the index, these ids are the original row labels and have gaps.
file1.to_csv('input.csv', index=True, index_label='id')