# Importing all the libraries

In [1]:
import pandas as pd
import numpy as np

# Loading the Dataset

In [2]:
# Read the dataset from "adult.csv" (path is relative to the working directory)
data = pd.read_csv("adult.csv")

# Exploring the Dataset

In [3]:
# Preview the first five rows
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [4]:
# (number of rows, number of columns) of the loaded frame
data.shape

(32561, 15)

In [5]:
# Column dtypes, non-null counts and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
# Report how many distinct values each column holds (a NaN, if present, would
# count as one value because Series.unique() keeps it).
# Looping over data.columns keeps the report in sync with the DataFrame instead
# of relying on one hand-written print per column.
for column in data.columns:
    print(f"Number of unique values in {column} = ", len(data[column].unique()))

Number of unique values in age =  73
Number of unique values in workclass =  9
Number of unique values in fnlwgt =  21648
Number of unique values in education =  16
Number of unique values in education.num =  16
Number of unique values in marital.status =  7
Number of unique values in occupation =  15
Number of unique values in relationship =  6
Number of unique values in race =  5
Number of unique values in sex =  2
Number of unique values in capital.gain =  119
Number of unique values in capital.loss =  92
Number of unique values in hours.per.week =  94
Number of unique values in native.country =  42
Number of unique values in income =  2


# Checking whether NaN values are present

In [7]:
# Column labels of the frame; each one is checked for NaN values in the next cell
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income'],
      dtype='object')

In [8]:
# Count missing (NaN) entries per column. No NaN values are present, but
# placeholder strings such as '?' are not detected by isnull(); those are
# handled column by column in the sections below.
nan_counts = data.isnull().sum()  # one pass, instead of filtering the frame once per column
for column, n_missing in nan_counts.items():
    print(f"Number of nan values present in {column}", n_missing)

Number of nan values present in age 0
Number of nan values present in workclass 0
Number of nan values present in fnlwgt 0
Number of nan values present in education 0
Number of nan values present in education.num 0
Number of nan values present in marital.status 0
Number of nan values present in occupation 0
Number of nan values present in relationship 0
Number of nan values present in race 0
Number of nan values present in sex 0
Number of nan values present in capital.gain 0
Number of nan values present in capital.loss 0
Number of nan values present in hours.per.week 0
Number of nan values present in native.country 0
Number of nan values present in income 0


### Tackling the workclass of dataset

In [15]:
# '?' is the placeholder for unspecified values in this column; it is counted
# and replaced in the following cells. The saved output already shows
# 'Not Specified' because this cell was last executed (In [15]) after the
# replacement cell (In [14]).
data["workclass"].unique()

array(['Not Specified', 'Private', 'State-gov', 'Federal-gov',
       'Self-emp-not-inc', 'Self-emp-inc', 'Local-gov', 'Without-pay',
       'Never-worked'], dtype=object)

In [10]:
# Helper to count how often the given value(s) occur in a column
def count_specified_values(df, column, specified_values):
    """Count the rows of ``df[column]`` whose value is one of ``specified_values``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    column : str
        Label of the column to search.
    specified_values : scalar or list-like
        Value(s) to look for. A single scalar such as ``'?'`` is accepted as
        well as a list, tuple or set of values.

    Returns
    -------
    The number of matching rows, i.e. the sum of the boolean mask produced by
    ``Series.isin``.
    """
    # Series.isin() rejects a bare scalar (notably a string) with a TypeError,
    # so wrap a single value in a list before matching.
    if not pd.api.types.is_list_like(specified_values):
        specified_values = [specified_values]
    return df[column].isin(specified_values).sum()

In [13]:
# Number of rows whose workclass is the '?' placeholder
count = count_specified_values(
    df=data,
    column="workclass",
    specified_values=["?"],
)
count

1836

In [14]:
# Swap the '?' placeholder for an explicit label
data["workclass"] = data["workclass"].replace({"?": "Not Specified"})

### Tackling the fnlwgt of dataset

In [16]:
# Integer column (int64), so it cannot hold the '?' placeholder; left unchanged
data["fnlwgt"].unique()

array([ 77053, 132870, 186061, ...,  34066,  84661, 257302], dtype=int64)

### Tackling the education of dataset

In [19]:
# No '?' or other unknown marker appears among the values, so no cleaning is needed
data["education"].unique()

array(['HS-grad', 'Some-college', '7th-8th', '10th', 'Doctorate',
       'Prof-school', 'Bachelors', 'Masters', '11th', 'Assoc-acdm',
       'Assoc-voc', '1st-4th', '5th-6th', '12th', '9th', 'Preschool'],
      dtype=object)

### Tackling the education.num of dataset

In [21]:
# Integer column (int64) with no placeholder values; left unchanged
data["education.num"].unique()

array([ 9, 10,  4,  6, 16, 15, 13, 14,  7, 12, 11,  2,  3,  8,  5,  1],
      dtype=int64)

### Tackling the marital.status of dataset

In [22]:
# No '?' or other unknown marker appears among the values, so no cleaning is needed
data["marital.status"].unique()

array(['Widowed', 'Divorced', 'Separated', 'Never-married',
       'Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'],
      dtype=object)

### Tackling the occupation of dataset

In [26]:
# '?' is the placeholder for unspecified values in this column; it is counted
# and replaced in the following cells. The saved output already shows
# 'Not Specified by the user' because this cell was last executed (In [26])
# after the replacement cell (In [25]).
data["occupation"].unique()

array(['Not Specified by the user', 'Exec-managerial',
       'Machine-op-inspct', 'Prof-specialty', 'Other-service',
       'Adm-clerical', 'Craft-repair', 'Transport-moving',
       'Handlers-cleaners', 'Sales', 'Farming-fishing', 'Tech-support',
       'Protective-serv', 'Armed-Forces', 'Priv-house-serv'], dtype=object)

In [24]:
# Number of rows whose occupation is the '?' placeholder
count = count_specified_values(
    df=data,
    column="occupation",
    specified_values=["?"],
)
count

1843

In [25]:
# Swap the '?' placeholder for an explicit label
data["occupation"] = data["occupation"].replace({"?": "Not Specified by the user"})

### Tackling the relationship of dataset

In [27]:
# No '?' or other unknown marker appears among the values, so no cleaning is needed
data["relationship"].unique()

array(['Not-in-family', 'Unmarried', 'Own-child', 'Other-relative',
       'Husband', 'Wife'], dtype=object)

### Tackling the race of dataset

In [28]:
# No '?' marker appears among the values ('Other' is a regular category label here); no cleaning applied
data["race"].unique()

array(['White', 'Black', 'Asian-Pac-Islander', 'Other',
       'Amer-Indian-Eskimo'], dtype=object)

### Tackling the sex of dataset

In [29]:
# Only two labels and no '?' marker, so no cleaning is needed
data["sex"].unique()

array(['Female', 'Male'], dtype=object)

### Tackling the capital.gain of dataset

In [32]:
# Integer column (int64) with no placeholder strings; left unchanged.
# NOTE(review): 99999 looks like a cap/sentinel rather than a real amount — confirm whether it needs handling.
data["capital.gain"].unique()

array([    0, 99999, 41310, 34095, 27828, 25236, 25124, 22040, 20051,
       18481, 15831, 15024, 15020, 14344, 14084, 13550, 11678, 10605,
       10566, 10520,  9562,  9386,  8614,  7978,  7896,  7688,  7443,
        7430,  7298,  6849,  6767,  6723,  6514,  6497,  6418,  6360,
        6097,  5721,  5556,  5455,  5178,  5060,  5013,  4934,  4931,
        4865,  4787,  4687,  4650,  4508,  4416,  4386,  4101,  4064,
        3942,  3908,  3887,  3818,  3781,  3674,  3471,  3464,  3456,
        3432,  3418,  3411,  3325,  3273,  3137,  3103,  2993,  2977,
        2964,  2961,  2936,  2907,  2885,  2829,  2653,  2635,  2597,
        2580,  2538,  2463,  2414,  2407,  2387,  2354,  2346,  2329,
        2290,  2228,  2202,  2176,  2174,  2105,  2062,  2050,  2036,
        2009,  1848,  1831,  1797,  1639,  1506,  1471,  1455,  1424,
        1409,  1173,  1151,  1111,  1086,  1055,   991,   914,   594,
         401,   114], dtype=int64)

### Tackling the capital.loss of dataset

In [33]:
# Integer column (int64) with no placeholder strings; left unchanged
data["capital.loss"].unique()

array([4356, 3900, 3770, 3683, 3004, 2824, 2754, 2603, 2559, 2547, 2489,
       2472, 2467, 2457, 2444, 2415, 2392, 2377, 2352, 2339, 2282, 2267,
       2258, 2246, 2238, 2231, 2206, 2205, 2201, 2179, 2174, 2163, 2149,
       2129, 2080, 2057, 2051, 2042, 2002, 2001, 1980, 1977, 1974, 1944,
       1902, 1887, 1876, 1848, 1844, 1825, 1816, 1762, 1755, 1741, 1740,
       1735, 1726, 1721, 1719, 1672, 1669, 1668, 1651, 1648, 1628, 1617,
       1602, 1594, 1590, 1579, 1573, 1564, 1539, 1504, 1485, 1411, 1408,
       1380, 1340, 1258, 1138, 1092,  974,  880,  810,  653,  625,  419,
        323,  213,  155,    0], dtype=int64)

### Tackling the hours.per.week of dataset

In [35]:
# Integer column (int64) with no placeholder strings; left unchanged.
# NOTE(review): the maximum of 99 may be a top-coded value — confirm before modelling.
data["hours.per.week"].unique()

array([40, 18, 45, 20, 60, 35, 55, 76, 50, 42, 25, 32, 90, 48, 15, 70, 52,
       72, 39,  6, 65, 12, 80, 67, 99, 30, 75, 26, 36, 10, 84, 38, 62, 44,
        8, 28, 59,  5, 24, 57, 34, 37, 46, 56, 41, 98, 43, 63,  1, 47, 68,
       54,  2, 16,  9,  3,  4, 33, 23, 22, 64, 51, 19, 58, 53, 96, 66, 21,
        7, 13, 27, 11, 14, 77, 31, 78, 49, 17, 85, 87, 88, 73, 89, 97, 94,
       29, 82, 86, 91, 81, 92, 61, 74, 95], dtype=int64)

### Tackling the native.country of dataset

In [39]:
# '?' is the placeholder for unspecified values in this column; it is counted
# and replaced in the following cells. The saved output already shows 'Unknown'
# because this cell was last executed (In [39]) after the replacement cell (In [38]).
# NOTE(review): some labels look truncated or misspelled in the source data
# (e.g. 'South', 'Hong', 'Holand-Netherlands'); they are left unchanged here.
data["native.country"].unique()

array(['United-States', 'Unknown', 'Mexico', 'Greece', 'Vietnam', 'China',
       'Taiwan', 'India', 'Philippines', 'Trinadad&Tobago', 'Canada',
       'South', 'Holand-Netherlands', 'Puerto-Rico', 'Poland', 'Iran',
       'England', 'Germany', 'Italy', 'Japan', 'Hong', 'Honduras', 'Cuba',
       'Ireland', 'Cambodia', 'Peru', 'Nicaragua', 'Dominican-Republic',
       'Haiti', 'El-Salvador', 'Hungary', 'Columbia', 'Guatemala',
       'Jamaica', 'Ecuador', 'France', 'Yugoslavia', 'Scotland',
       'Portugal', 'Laos', 'Thailand', 'Outlying-US(Guam-USVI-etc)'],
      dtype=object)

In [37]:
# Number of rows whose native.country is the '?' placeholder
count = count_specified_values(
    df=data,
    column="native.country",
    specified_values=["?"],
)
count

583

In [38]:
# Swap the '?' placeholder for an explicit label
data["native.country"] = data["native.country"].replace({"?": "Unknown"})

### Tackling the income of dataset

In [40]:
# Only the two labels '<=50K' and '>50K' occur, so no cleaning is needed
data["income"].unique()

array(['<=50K', '>50K'], dtype=object)

# All the columns in the dataset are cleaned now