In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv(r'C:\Users\Admin\Documents\EXCELR\DS Assignments\@\adult_with_headers.csv')

# Basic data exploration
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

Dataset shape: (32561, 15)

First 5 rows:
   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174           

In [4]:
print("\nData types:")
print(df.dtypes)



Data types:
age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object


In [5]:
print("\nMissing values:")
print(df.isnull().sum())



Missing values:
age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [6]:
print("\nSummary statistics:")
print(df.describe())


Summary statistics:
                age        fnlwgt  education_num  capital_gain  capital_loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours_per_week  
count    32561.000000  
mean        40.437456  
std         12.347429  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000  


In [7]:
# Identify columns with missing values (represented as '?')
missing_cols = df.columns[df.isin(['?']).any()].tolist()
print("Columns with missing values:", missing_cols)

# Handle missing values
# For workclass, occupation, native_country - use mode imputation
for col in ['workclass', 'occupation', 'native_country']:
    mode_value = df[col][df[col] != '?'].mode()[0]
    df[col] = df[col].replace('?', mode_value)

# Verify no more missing values
print("\nMissing values after imputation:")
print(df.isin(['?']).sum())

Columns with missing values: []

Missing values after imputation:
age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [8]:
# Identify numerical features
numerical_features = ['age', 'fnlwgt', 'education_num', 'capital_gain',
                     'capital_loss', 'hours_per_week']

# Apply Standard Scaling
standard_scaler = StandardScaler()
df_standard = df.copy()
df_standard[numerical_features] = standard_scaler.fit_transform(df[numerical_features])

# Apply Min-Max Scaling
minmax_scaler = MinMaxScaler()
df_minmax = df.copy()
df_minmax[numerical_features] = minmax_scaler.fit_transform(df[numerical_features])

print("Standard Scaling - First 5 rows:")
print(df_standard[numerical_features].head())
print("\nMin-Max Scaling - First 5 rows:")
print(df_minmax[numerical_features].head())

Standard Scaling - First 5 rows:
        age    fnlwgt  education_num  capital_gain  capital_loss  \
0  0.030671 -1.063611       1.134739      0.148453      -0.21666   
1  0.837109 -1.008707       1.134739     -0.145920      -0.21666   
2 -0.042642  0.245079      -0.420060     -0.145920      -0.21666   
3  1.057047  0.425801      -1.197459     -0.145920      -0.21666   
4 -0.775768  1.408176       1.134739     -0.145920      -0.21666   

   hours_per_week  
0       -0.035429  
1       -2.222153  
2       -0.035429  
3       -0.035429  
4       -0.035429  

Min-Max Scaling - First 5 rows:
        age    fnlwgt  education_num  capital_gain  capital_loss  \
0  0.301370  0.044302       0.800000       0.02174           0.0   
1  0.452055  0.048238       0.800000       0.00000           0.0   
2  0.287671  0.138113       0.533333       0.00000           0.0   
3  0.493151  0.151068       0.400000       0.00000           0.0   
4  0.150685  0.221488       0.800000       0.00000           0.0 

In [9]:
# Identify categorical features
categorical_features = ['workclass', 'education', 'marital_status', 'occupation',
                       'relationship', 'race', 'sex', 'native_country']

# One-Hot Encoding for variables with less than 5 categories
categorical_less_than_5 = []
categorical_more_than_5 = []

for col in categorical_features:
    if df[col].nunique() < 5:
        categorical_less_than_5.append(col)
    else:
        categorical_more_than_5.append(col)

print("Categorical features with <5 categories:", categorical_less_than_5)
print("Categorical features with >=5 categories:", categorical_more_than_5)

# Apply One-Hot Encoding
df_encoded = pd.get_dummies(df, columns=categorical_less_than_5, drop_first=True)

# Apply Label Encoding for features with more than 5 categories
label_encoder = LabelEncoder()
for col in categorical_more_than_5:
    df_encoded[col] = label_encoder.fit_transform(df[col])

print("\nDataset after encoding:")
print(df_encoded.head())
print("\nNew shape:", df_encoded.shape)

Categorical features with <5 categories: ['sex']
Categorical features with >=5 categories: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'native_country']

Dataset after encoding:
   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship  race  capital_gain  capital_loss  hours_per_week  \
0           1             1     4          2174             0              40   
1           4             0     4             0             0              13   
2           6             1     4             0             0              40   
3           6             0     

In [10]:
# Create new features
# 1. Age group feature
df['age_group'] = pd.cut(df['age'], bins=[0, 25, 35, 45, 55, 65, 100],
                        labels=['18-25', '26-35', '36-45', '46-55', '56-65', '65+'])

# 2. Work hours category
df['work_hours_category'] = pd.cut(df['hours_per_week'], bins=[0, 30, 40, 50, 100],
                                  labels=['Part-time', 'Full-time', 'Overtime', 'Excessive'])

# 3. Capital transactions (combining gain and loss)
df['capital_net'] = df['capital_gain'] - df['capital_loss']

# Check skewness of numerical features
print("Skewness of numerical features:")
for col in numerical_features:
    skewness = df[col].skew()
    print(f"{col}: {skewness:.2f}")

# Apply log transformation to highly skewed features
df['log_capital_gain'] = np.log1p(df['capital_gain'])  # log(1+x) to handle zeros
df['log_capital_loss'] = np.log1p(df['capital_loss'])

print("\nSkewness after transformation:")
print("capital_gain:", df['log_capital_gain'].skew())
print("capital_loss:", df['log_capital_loss'].skew())

Skewness of numerical features:
age: 0.56
fnlwgt: 1.45
education_num: -0.31
capital_gain: 11.95
capital_loss: 4.59
hours_per_week: 0.23

Skewness after transformation:
capital_gain: 3.096143524467517
capital_loss: 4.307536865725205


In [11]:
def preprocess_adult_data(df):
    # Copy dataframe
    df_processed = df.copy()

    # Handle missing values
    for col in ['workclass', 'occupation', 'native_country']:
        mode_value = df_processed[col][df_processed[col] != '?'].mode()[0]
        df_processed[col] = df_processed[col].replace('?', mode_value)

    # Feature engineering
    df_processed['age_group'] = pd.cut(df_processed['age'], bins=[0, 25, 35, 45, 55, 65, 100],
                                     labels=['18-25', '26-35', '36-45', '46-55', '56-65', '65+'])
    df_processed['work_hours_category'] = pd.cut(df_processed['hours_per_week'], bins=[0, 30, 40, 50, 100],
                                               labels=['Part-time', 'Full-time', 'Overtime', 'Excessive'])
    df_processed['capital_net'] = df_processed['capital_gain'] - df_processed['capital_loss']
    df_processed['log_capital_gain'] = np.log1p(df_processed['capital_gain'])
    df_processed['log_capital_loss'] = np.log1p(df_processed['capital_loss'])

    # Encode target variable
    df_processed['income'] = df_processed['income'].map({'<=50K': 0, '>50K': 1})

    return df_processed

# Apply complete preprocessing
df_final = preprocess_adult_data(df)
print("Final dataset shape:", df_final.shape)
print("\nFinal dataset info:")
print(df_final.info())

Final dataset shape: (32561, 20)

Final dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   age                  32561 non-null  int64   
 1   workclass            32561 non-null  object  
 2   fnlwgt               32561 non-null  int64   
 3   education            32561 non-null  object  
 4   education_num        32561 non-null  int64   
 5   marital_status       32561 non-null  object  
 6   occupation           32561 non-null  object  
 7   relationship         32561 non-null  object  
 8   race                 32561 non-null  object  
 9   sex                  32561 non-null  object  
 10  capital_gain         32561 non-null  int64   
 11  capital_loss         32561 non-null  int64   
 12  hours_per_week       32561 non-null  int64   
 13  native_country       32561 non-null  object  
 14  income          