In [6]:
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Read the CSV file into a DataFrame
df = pd.read_csv('D:/Data science ass/12/EDA2/adult_with_headers.csv')

# Display the first 5 rows
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

# Print the column names and their data types
print(df.info())

# Print descriptive statistics for numerical columns
print("Descriptive statistics for numerical columns: \n")
print(df.describe(include=[int, float]).to_markdown(numalign="left", stralign="left"))

# Print descriptive statistics for non-numerical columns
print("\nDescriptive statistics for non-numerical columns: \n")
print(df.describe(include=[object]).to_markdown(numalign="left", stralign="left"))

# Check for missing values
print("\nMissing values: \n")
print(df.isnull().sum().to_markdown(numalign="left", stralign="left"))

| age   | workclass        | fnlwgt   | education   | education_num   | marital_status     | occupation        | relationship   | race   | sex    | capital_gain   | capital_loss   | hours_per_week   | native_country   | income   |
|:------|:-----------------|:---------|:------------|:----------------|:-------------------|:------------------|:---------------|:-------|:-------|:---------------|:---------------|:-----------------|:-----------------|:---------|
| 39    | State-gov        | 77516    | Bachelors   | 13              | Never-married      | Adm-clerical      | Not-in-family  | White  | Male   | 2174           | 0              | 40               | United-States    | <=50K    |
| 50    | Self-emp-not-inc | 83311    | Bachelors   | 13              | Married-civ-spouse | Exec-managerial   | Husband        | White  | Male   | 0              | 0              | 13               | United-States    | <=50K    |
| 38    | Private          | 215646   | HS-grad     | 9               | Divo

In [7]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Identify numerical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Initialize scalers
standard_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

# Apply standard scaling
df_scaled_standard = df.copy()
df_scaled_standard[numerical_cols] = standard_scaler.fit_transform(df_scaled_standard[numerical_cols])

# Apply min-max scaling
df_scaled_minmax = df.copy()
df_scaled_minmax[numerical_cols] = minmax_scaler.fit_transform(df_scaled_minmax[numerical_cols])

# Display the first 5 rows of the scaled DataFrames
print("\nFirst 5 rows after standard scaling: \n")
print(df_scaled_standard.head().to_markdown(index=False, numalign="left", stralign="left"))

print("\nFirst 5 rows after min-max scaling: \n")
print(df_scaled_minmax.head().to_markdown(index=False, numalign="left", stralign="left"))


First 5 rows after standard scaling: 

| age       | workclass        | fnlwgt   | education   | education_num   | marital_status     | occupation        | relationship   | race   | sex    | capital_gain   | capital_loss   | hours_per_week   | native_country   | income   |
|:----------|:-----------------|:---------|:------------|:----------------|:-------------------|:------------------|:---------------|:-------|:-------|:---------------|:---------------|:-----------------|:-----------------|:---------|
| 0.0306706 | State-gov        | -1.06361 | Bachelors   | 1.13474         | Never-married      | Adm-clerical      | Not-in-family  | White  | Male   | 0.148453       | -0.21666       | -0.0354294       | United-States    | <=50K    |
| 0.837109  | Self-emp-not-inc | -1.00871 | Bachelors   | 1.13474         | Married-civ-spouse | Exec-managerial   | Husband        | White  | Male   | -0.14592       | -0.21666       | -2.22215         | United-States    | <=50K    |
| -0.042642 | Privat

The choice between these techniques depends on the specific characteristics of your data and the requirements of your machine learning algorithm. If your data is normally distributed or your algorithm is sensitive to feature scales, StandardScaler is often a good choice. If you need to control the range of your features or want to reduce the impact of outliers, MinMaxScaler might be more suitable. It's often a good practice to experiment with both techniques and evaluate their impact on your model's performance.

In [8]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Get all categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Lists to store low and high cardinality columns
low_cardinality_cols = []
high_cardinality_cols = []

# Categorize columns based on cardinality
for col in categorical_cols:
    if df[col].nunique() < 5:
        low_cardinality_cols.append(col)
    else:
        high_cardinality_cols.append(col)

# One-hot encode low cardinality columns
encoder_ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_cols = encoder_ohe.fit_transform(df[low_cardinality_cols])
encoded_col_names = encoder_ohe.get_feature_names_out(low_cardinality_cols)
df_encoded = pd.DataFrame(encoded_cols, columns=encoded_col_names)

# Label encode high cardinality columns
encoder_le = LabelEncoder()
for col in high_cardinality_cols:
    df_encoded[col] = encoder_le.fit_transform(df[col])

# Add numerical columns to the encoded DataFrame
df_encoded[numerical_cols] = df[numerical_cols]

# Display the first 5 rows
print(df_encoded.head().to_markdown(index=False, numalign="left", stralign="left"))

| sex_ Female   | sex_ Male   | income_ <=50K   | income_ >50K   | workclass   | education   | marital_status   | occupation   | relationship   | race   | native_country   | age   | fnlwgt   | education_num   | capital_gain   | capital_loss   | hours_per_week   |
|:--------------|:------------|:----------------|:---------------|:------------|:------------|:-----------------|:-------------|:---------------|:-------|:-----------------|:------|:---------|:----------------|:---------------|:---------------|:-----------------|
| 0             | 1           | 1               | 0              | 7           | 9           | 4                | 1            | 1              | 4      | 39               | 39    | 77516    | 13              | 2174           | 0              | 40               |
| 0             | 1           | 1               | 0              | 6           | 9           | 2                | 4            | 0              | 4      | 39               | 50    | 83311    | 13             

One-Hot Encoding: This technique creates a new binary column for each unique value in a categorical feature. It avoids imposing any ordinal relationship between categories, which is crucial for algorithms that don't assume such relationships (e.g., decision trees, random forests). However, it can significantly increase the dimensionality of the dataset, especially for features with many unique values

Label Encoding: This technique assigns a unique numerical label to each category in a feature. It's simple and efficient, especially for features with many categories. However, it can introduce an artificial ordinal relationship between categories, which might mislead algorithms that interpret numerical values as having an order

In [9]:
import numpy as np

# Create new feature 'total_income'
df_encoded['total_income'] = df_encoded['capital_gain'] + df_encoded['capital_loss']

# Create new feature 'financial_dependency'
df_encoded['financial_dependency'] = np.where(df_encoded['relationship'].isin(['Husband', 'Wife']), 0, 1)

# Check skewness of numerical columns
skewness = df_encoded.select_dtypes(include=np.number).skew()
skewed_cols = skewness[(skewness > 1) | (skewness < -1)].index.tolist()
print(f"Skewed columns: {skewed_cols}")

# Apply log transformation to skewed columns
for col in skewed_cols:
    df_encoded[col + '_log'] = np.log1p(df_encoded[col])  # Use log1p to handle 0 values

# Display the first 5 rows
print(df_encoded.head().to_markdown(index=False, numalign="left", stralign="left"))

Skewed columns: ['income_ <=50K', 'income_ >50K', 'race', 'native_country', 'fnlwgt', 'capital_gain', 'capital_loss', 'total_income']
| sex_ Female   | sex_ Male   | income_ <=50K   | income_ >50K   | workclass   | education   | marital_status   | occupation   | relationship   | race   | native_country   | age   | fnlwgt   | education_num   | capital_gain   | capital_loss   | hours_per_week   | total_income   | financial_dependency   | income_ <=50K_log   | income_ >50K_log   | race_log   | native_country_log   | fnlwgt_log   | capital_gain_log   | capital_loss_log   | total_income_log   |
|:--------------|:------------|:----------------|:---------------|:------------|:------------|:-----------------|:-------------|:---------------|:-------|:-----------------|:------|:---------|:----------------|:---------------|:---------------|:-----------------|:---------------|:-----------------------|:--------------------|:-------------------|:-----------|:---------------------|:-------------|:---

Two new features have been created:

'total_income': This feature combines 'capital_gain' and 'capital_loss' to provide a more comprehensive view of an individual's financial situation. It might be more informative than considering these gains and losses separately, as it reflects the net financial impact.

'financial_dependency': This feature indicates whether an individual is likely to be financially dependent on their spouse, based on their relationship status. It could be relevant for predicting income, as financial dependents might have different income patterns compared to those who are financially independ

Additionally, log transformation has been applied to the following skewed features:

'fnlwgt'
'capital_gain'
'capital_loss'
'total_income'
'income_ <=50K'
'income_ >50K'
'race'
'native_country'
These features exhibited high skewness, which can negatively impact the performance of many machine learning algorithms. Log transformation compresses the range of these features, reducing the influence of extreme values and making their distributions more symmetrical. This can improve the model's ability to learn patterns and make accurate p.redictionsent.

In [11]:
from sklearn.ensemble import IsolationForest

# Copy the DataFrame to avoid modifying the original
df_no_outliers = df_encoded.copy()

# Initialize and fit Isolation Forest
clf = IsolationForest(contamination=0.05, random_state=42)  # Adjust contamination as needed
clf.fit(df_no_outliers[numerical_cols + ['total_income_log']])  # Include the log-transformed feature

# Predict outliers
df_no_outliers['outlier'] = clf.predict(df_no_outliers[numerical_cols + ['total_income_log']])

# Remove outliers
print(f"Shape before removing outliers: {df_no_outliers.shape}")
df_no_outliers = df_no_outliers[df_no_outliers['outlier'] != -1]
print(f"Shape after removing outliers: {df_no_outliers.shape}")

# Calculate PPS matrix
from ppscore import matrix

pps_matrix = matrix(df_no_outliers)

# Calculate correlation matrix
corr_matrix = df_no_outliers.corr()

# Print the matrices
print("\nPPS Matrix: \n")
print(pps_matrix.round(2).to_markdown(numalign="left", stralign="left"))

print("\nCorrelation Matrix: \n")
print(corr_matrix.round(2).to_markdown(numalign="left", stralign="left"))



Shape before removing outliers: (32561, 28)
Shape after removing outliers: (30933, 28)

PPS Matrix: 

|     | x                    | y                    | ppscore   | case               | is_valid_score   | metric              | baseline_score   | model_score   | model                   |
|:----|:---------------------|:---------------------|:----------|:-------------------|:-----------------|:--------------------|:-----------------|:--------------|:------------------------|
| 0   | sex_ Female          | sex_ Female          | 1         | predict_itself     | True             |                     | 0                | 1             |                         |
| 1   | sex_ Female          | sex_ Male            | 1         | regression         | True             | mean absolute error | 0.34             | 0             | DecisionTreeRegressor() |
| 2   | sex_ Female          | income_ <=50K        | 0         | regression         | True             | mean absolute error | 0.22          

In [13]:
# Get correlations with the target variable
target_correlations = corr_matrix['income_ >50K'].sort_values(ascending=False)

# Print top 10 positive correlations
print("\nTop 10 Positive Correlations with Target:\n")
print(target_correlations[:11].to_markdown(numalign="left", stralign="left"))  # Include target itself

# Print top 10 negative correlations
print("\nTop 10 Negative Correlations with Target:\n")
print(target_correlations[-11:].to_markdown(numalign="left", stralign="left"))  # Include target itself


Top 10 Positive Correlations with Target:

|                  | income_ >50K   |
|:-----------------|:---------------|
| income_ >50K_log | 1              |
| income_ >50K     | 1              |
| education_num    | 0.306751       |
| total_income     | 0.298815       |
| capital_gain     | 0.286913       |
| total_income_log | 0.246479       |
| capital_gain_log | 0.227525       |
| age              | 0.221597       |
| hours_per_week   | 0.213549       |
| sex_ Male        | 0.20873        |
| capital_loss     | 0.0964537      |

Top 10 Negative Correlations with Target:

|                      | income_ >50K   |
|:---------------------|:---------------|
| native_country       | 0.0206718      |
| native_country_log   | 0.0110037      |
| fnlwgt_log           | -0.00247703    |
| fnlwgt               | -0.0158429     |
| marital_status       | -0.195166      |
| sex_ Female          | -0.20873       |
| relationship         | -0.244893      |
| income_ <=50K_log    | -1             