In [1]:
from sklearn.datasets import fetch_openml

# Load OpenML dataset 1590 as pandas objects (as_frame=True)
dataset = fetch_openml(data_id=1590, as_frame=True)

# Extract features (X) and target (y).
# `assign` returns a new frame, so the three redundant copies of 'capital-gain'
# are appended (in this order) without mutating the frame held by `dataset`,
# and re-running this cell always starts from the untouched feature table.
X = dataset.data.assign(
    **{f"capital-gain{suffix}": dataset.data["capital-gain"] for suffix in (1, 2, 3)}
)
y = dataset.target

# Display first few rows
print(X.head())
print(y.head())


   age  workclass  fnlwgt     education  education-num      marital-status  \
0   25    Private  226802          11th              7       Never-married   
1   38    Private   89814       HS-grad              9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm             12  Married-civ-spouse   
3   44    Private  160323  Some-college             10  Married-civ-spouse   
4   18        NaN  103497  Some-college             10       Never-married   

          occupation relationship   race     sex  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                NaN    Own-child  White  Female             0             0   

   hours-per-week native-country  capital-gain1  c

In [27]:
import pandas as pd
import numpy as np
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import LabelEncoder


def get_mutual_information_score(X, verbose=True):
    """Return the average pairwise normalized mutual information of X's columns.

    Every non-numeric column (object, string, category, ...) is cast to ``str``
    and label-encoded on a copy of ``X``; the cast turns missing values into
    an ordinary string label.  Numeric columns are passed through unchanged.
    ``normalized_mutual_info_score`` is then evaluated once per unordered
    column pair and mirrored into a symmetric matrix whose diagonal stays 0.

    NOTE(review): numeric columns are handed to a label-based metric as-is, so
    presumably each distinct value acts as its own label; high-cardinality
    columns may need binning first -- confirm this is intended.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; it is not modified.
    verbose : bool, default True
        When true, print the full pairwise matrix (the historical behaviour).

    Returns
    -------
    float
        Mean of the off-diagonal scores, or ``nan`` when ``X`` has fewer than
        two columns and there is therefore no pair to compare.
    """
    features = X.columns
    num_features = len(features)

    # Encode on a copy so the caller's frame is left untouched.
    processed_X = X.copy()
    for feature in features:
        column = X[feature]
        # Anything that is not numeric is treated as categorical; this also
        # covers pandas string dtypes, which are neither 'object' nor 'category'.
        if not pd.api.types.is_numeric_dtype(column):
            processed_X[feature] = LabelEncoder().fit_transform(column.astype(str))

    # Look every column up once instead of once per pair.
    columns = [processed_X[feature] for feature in features]

    # Fill a plain ndarray: scalar writes are far cheaper than DataFrame.iloc.
    scores = np.zeros((num_features, num_features))
    for i in range(num_features):
        for j in range(i + 1, num_features):  # upper triangle only (symmetry)
            mi_value = normalized_mutual_info_score(columns[i], columns[j])
            scores[i, j] = mi_value
            scores[j, i] = mi_value

    if verbose:
        print(pd.DataFrame(scores, columns=features, index=features))

    # With fewer than two columns there are no pairs; avoid dividing by zero.
    if num_features < 2:
        return float("nan")

    # The matrix holds every pair twice (mirrored) and zeros on the diagonal,
    # so the mean over pairs is the total divided by twice the pair count.
    num_comparisons = num_features * (num_features - 1) // 2
    return float(scores.sum() / (2 * num_comparisons))

# NOTE: despite its name, this variable receives the scalar average score that
# get_mutual_information_score returns; the pairwise matrix is only printed
# inside the function.
mutual_information_matrix = get_mutual_information_score(X)


                     age  workclass    fnlwgt  education  education-num  \
age             0.000000   0.023279  0.486762   0.037580       0.037580   
workclass       0.023279   0.000000  0.151751   0.020786       0.020786   
fnlwgt          0.486762   0.151751  0.000000   0.250843       0.250843   
education       0.037580   0.020786  0.250843   0.000000       1.000000   
education-num   0.037580   0.020786  0.250843   1.000000       0.000000   
marital-status  0.094916   0.018233  0.174431   0.014388       0.014388   
occupation      0.025309   0.183297  0.295409   0.099649       0.099649   
relationship    0.073349   0.019048  0.202642   0.021497       0.021497   
race            0.002381   0.008643  0.099934   0.007624       0.007624   
sex             0.003904   0.014204  0.113747   0.003413       0.003413   
capital-gain    0.035031   0.019161  0.092253   0.028032       0.028032   
capital-loss    0.021769   0.012406  0.056562   0.017364       0.017364   
hours-per-week  0.049366 

In [28]:
# Display the stored result: a single average score, not a matrix.
mutual_information_matrix

0.10344621782809955

In [2]:
# Build the labelled frame as a NEW object. A plain `df = X` only aliases X, so
# adding 'target' would also add it to X, and every later use of `X.columns`
# (duplicate detection, group-by keys, feature scoring) would silently include
# the label. `assign` aligns y on the index exactly like `df['target'] = y`.
df = X.assign(target=y)

In [4]:
# Boolean mask over df's rows: True where the values in X's columns repeat an
# earlier row (the first occurrence of each combination stays False).
condition_1 = df.loc[:, list(X.columns)].duplicated(keep="first")

In [None]:
# Keep only the rows flagged by the duplicate mask.
df_filtered = df.loc[condition_1]

In [None]:
import dask.dataframe as dd

ddf = dd.from_pandas(df, npartitions=4)

# Turn the in-memory pandas mask into a dask Series before indexing. Indexing
# a dask frame directly with a pandas Series is not a partition-aligned
# boolean filter and emits a "Boolean Series key will be reindexed" warning.
duplicate_mask = dd.from_pandas(condition_1, npartitions=ddf.npartitions)

# Count how many flagged rows share each combination of X's column values.
# NOTE(review): if the grouping columns are categorical, the default
# `observed` setting may enumerate unobserved category combinations; consider
# passing observed=True -- confirm against the installed dask/pandas versions.
result = ddf[duplicate_mask].groupby(list(X.columns)).size().compute()

  meta = self._meta[_extract_meta(key)]
  self._meta = self.obj._meta.groupby(
