In [3]:
import pandas as pd

file_path = '/content/adult_with_headers.csv'
df = pd.read_csv(file_path)

print(df.head())
print(df.describe(include='all'))
print(df.isnull().sum())
print(df.dtypes)


   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174             0              40   United-States   <=50

In [4]:

df = df.fillna(df.median(numeric_only=True))
# Check for missing values
print(df.isnull().sum())


age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [5]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Identify numerical features
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns

# Standard Scaling
standard_scaler = StandardScaler()
df_standard_scaled = df.copy()
df_standard_scaled[numerical_features] = standard_scaler.fit_transform(df[numerical_features])

# Min-Max Scaling
minmax_scaler = MinMaxScaler()
df_minmax_scaled = df.copy()
df_minmax_scaled[numerical_features] = minmax_scaler.fit_transform(df[numerical_features])

# Display scaled data
print(df_standard_scaled.head())
print(df_minmax_scaled.head())

scaling_discussion = """
Standard Scaling is preferred when the data follows a Gaussian distribution, as it standardizes features by removing the mean and scaling to unit variance.
Min-Max Scaling is preferred when the data does not follow a Gaussian distribution, as it scales the data to a fixed range, typically [0, 1], which is useful for algorithms that do not assume any distribution.
"""
print(scaling_discussion)


        age          workclass    fnlwgt   education  education_num  \
0  0.030671          State-gov -1.063611   Bachelors       1.134739   
1  0.837109   Self-emp-not-inc -1.008707   Bachelors       1.134739   
2 -0.042642            Private  0.245079     HS-grad      -0.420060   
3  1.057047            Private  0.425801        11th      -1.197459   
4 -0.775768            Private  1.408176   Bachelors       1.134739   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0      0.148453      -0.21

In [6]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Identify categorical features
categorical_features = df.select_dtypes(include=['object']).columns

# Apply One-Hot Encoding
onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')
df_onehot_encoded = df.copy()
for col in categorical_features:
    if df[col].nunique() < 5:
        onehot_encoded = pd.DataFrame(onehot_encoder.fit_transform(df[[col]]), columns=onehot_encoder.get_feature_names_out([col]))
        df_onehot_encoded = pd.concat([df_onehot_encoded, onehot_encoded], axis=1).drop(col, axis=1)

# Apply Label Encoding
label_encoder = LabelEncoder()
df_label_encoded = df.copy()
for col in categorical_features:
    if df[col].nunique() >= 5:
        df_label_encoded[col] = label_encoder.fit_transform(df[col])

# Display encoded data
print(df_onehot_encoded.head())
print(df_label_encoded.head())

# Discuss pros and cons of One-Hot Encoding and Label Encoding
encoding_discussion = """
Pros of One-Hot Encoding:
- Prevents ordinality in categorical variables, making it suitable for non-ordinal data.
- Useful for algorithms that can handle high-dimensional data.

Cons of One-Hot Encoding:
- Can lead to high-dimensional data, causing the "curse of dimensionality".

Pros of Label Encoding:
- Efficient for algorithms that can handle ordinal relationships and large datasets.
- Reduces dimensionality compared to One-Hot Encoding.

Cons of Label Encoding:
- Can introduce ordinality where none exists, potentially misleading algorithms.
"""
print(encoding_discussion)


   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race  \
0        Never-married        Adm-clerical   Not-in-family   White   
1   Married-civ-spouse     Exec-managerial         Husband   White   
2             Divorced   Handlers-cleaners   Not-in-family   White   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black   
4   Married-civ-spouse      Prof-specialty            Wife   Black   

   capital_gain  capital_loss  hours_per_week  native_country  sex_ Male  \
0          2174             0              40   United-States        1.0   
1             0             0             

In [7]:
# Create new features
df['age_bin'] = pd.cut(df['age'], bins=[0, 30, 60, 90], labels=['Young', 'Middle-Aged', 'Senior'])
df['work_hours_per_week_bin'] = pd.cut(df['hours_per_week'], bins=[0, 20, 40, 60, 80], labels=['Part-Time', 'Full-Time', 'Over-Time', 'Extreme'])

import numpy as np
df['log_capital_gain'] = np.log1p(df['capital_gain'])

print(df.head())

feature_engineering_justification = """
The 'age_bin' feature categorizes individuals into age groups, which may help capture age-related income patterns.
The 'work_hours_per_week_bin' feature categorizes working hours, potentially revealing work-time related income trends.
The log transformation of 'capital-gain' reduces skewness, making the feature more normally distributed and suitable for modeling.
"""
print(feature_engineering_justification)


   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  \
0          2174             0              40   United-States   <=5

In [8]:
from sklearn.ensemble import IsolationForest

# Fit Isolation Forest
iso_forest = IsolationForest(contamination=0.1)
outliers = iso_forest.fit_predict(df[numerical_features])
df_no_outliers = df[outliers == 1]

# Discuss how outliers can affect model performance
outliers_discussion = """
Outliers can significantly impact model performance by skewing the data distribution, leading to biased parameter estimates, and affecting the accuracy of predictions. Removing outliers helps in building a more robust and generalizable model.
"""
print(outliers_discussion)



Outliers can significantly impact model performance by skewing the data distribution, leading to biased parameter estimates, and affecting the accuracy of predictions. Removing outliers helps in building a more robust and generalizable model.



In [1]:
!pip install ppscore



In [9]:
import ppscore as pps

# Compute PPS matrix
pps_matrix = pps.matrix(df).pivot(columns='x', index='y', values='ppscore')

# Display PPS matrix
print(pps_matrix)

# Compare with correlation matrix
correlation_matrix = df.corr()
print(correlation_matrix)

# Discuss findings
pps_vs_corr_discussion = """
The PPS matrix shows predictive power between pairs of features, regardless of linear or non-linear relationships.
The correlation matrix shows linear relationships between features.
PPS provides a more comprehensive understanding of feature relationships, identifying potential predictive features that may not have a strong linear correlation.
"""
print(pps_vs_corr_discussion)




x                                 age       age_bin  capital_gain  \
y                                                                   
age                      1.000000e+00  4.933301e-01      0.003296   
age_bin                  1.000000e+00  1.000000e+00      0.000000   
capital_gain             0.000000e+00  0.000000e+00      1.000000   
capital_loss             0.000000e+00  0.000000e+00      0.000000   
education                5.231462e-02  0.000000e+00      0.000000   
education_num            0.000000e+00  0.000000e+00      0.012555   
fnlwgt                   0.000000e+00  0.000000e+00      0.000000   
hours_per_week           0.000000e+00  0.000000e+00      0.000000   
income                   5.415335e-03  0.000000e+00      0.297123   
log_capital_gain         0.000000e+00  0.000000e+00      0.998032   
marital_status           2.981185e-01  3.056148e-01      0.000000   
native_country           0.000000e+00  0.000000e+00      0.000000   
occupation               2.700685e

  correlation_matrix = df.corr()
