In [None]:
import pandas as pd

# Load the datasets
# All three CSV files are read from one directory; edit DATA_DIR when the
# notebook runs somewhere other than the default '/content' location.
DATA_DIR = '/content'

try:
    housing_df = pd.read_csv(f'{DATA_DIR}/housing.csv')
    advertising_df = pd.read_csv(f'{DATA_DIR}/advertising.csv')
    faa_ai_prelim_df = pd.read_csv(f'{DATA_DIR}/faa_ai_prelim.csv')
except FileNotFoundError:
    # Print the hint, then re-raise: the following cells need all three
    # frames, so continuing would only surface later as a NameError that
    # hides the real cause (and the name of the missing file).
    print("Make sure the datasets are in the correct directory.")
    raise
else:
    print("Datasets loaded successfully.")

Datasets loaded successfully.


In [None]:
import numpy as np

def calculate_entropy(data):  # data -> column
    """
    Calculates the Shannon entropy (in bits) of a given pandas Series.

    Missing values are ignored; probabilities are the relative frequencies
    of the remaining distinct values.

    Parameters
    ----------
    data : pandas.Series
        Column of discrete (categorical or already-binned) values.

    Returns
    -------
    float
        Entropy in bits. 0.0 for an empty, all-NaN, or single-valued column.
    """
    if data.isnull().all():
        return 0.0  # Handle cases with all NaN values (also true for an empty Series)

    probabilities = data.value_counts(normalize=True)
    # Zero-frequency entries (e.g. unused categories of a categorical dtype)
    # contribute nothing to the sum, so drop them rather than adding an
    # epsilon inside the log: the epsilon biased every result and produced a
    # tiny negative entropy for constant columns.
    probabilities = probabilities[probabilities > 0]
    entropy = -np.sum(probabilities * np.log2(probabilities))
    # A constant column yields -0.0 here; max() keeps its first argument on
    # ties, so this normalizes the sign and the result prints as 0.0000.
    return float(max(0.0, entropy))

In [None]:
# Identify continuous and categorical columns
def identify_column_types(df):
    """
    Split the columns of a DataFrame into continuous and categorical names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be classified.

    Returns
    -------
    tuple[list, list]
        ``(continuous_cols, categorical_cols)``, each in the frame's column
        order. Continuous columns are those with a numeric dtype; categorical
        columns are those with an object, category, or bool dtype.
    """
    continuous_cols = df.select_dtypes(include=np.number).columns.tolist()
    # Boolean and 'category' columns are neither np.number nor object, so
    # selecting only 'object' silently dropped them from the analysis.
    categorical_cols = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
    return continuous_cols, categorical_cols

# Build the entropy table: {dataset name: {column name: entropy}}
entropy_results = {}

dataset_names = ['housing', 'advertising', 'faa_ai_prelim']
dataset_frames = [housing_df, advertising_df, faa_ai_prelim_df]

for df_name, df in zip(dataset_names, dataset_frames):
    column_entropies = {}
    entropy_results[df_name] = column_entropies
    continuous_cols, categorical_cols = identify_column_types(df)

    # Categorical columns are measured directly on their raw values.
    for col in categorical_cols:
        column_entropies[col] = calculate_entropy(df[col])

    # Continuous columns are first discretized into (up to) ten quantile bins.
    for col in continuous_cols:
        if df[col].isnull().all():
            # Nothing to bin in a column that holds only NaN values.
            column_entropies[col] = 0.0
            continue

        try:
            binned_data = pd.qcut(df[col], q=10, labels=False, duplicates='drop')
            column_entropies[col] = calculate_entropy(binned_data)
        except ValueError:
            # Quantile binning was not possible (e.g., all values are the same),
            # so the column is recorded with zero entropy.
            column_entropies[col] = 0.0

In [None]:
# Print one report per dataset: a header, one line per column, then a rule.
for df_name, results in entropy_results.items():
    report_lines = [f"Entropy results for {df_name} dataset:"]
    report_lines.extend(f"  {col}: {entropy:.4f}" for col, entropy in results.items())
    report_lines.append("-" * 30)
    print("\n".join(report_lines))

Entropy results for housing dataset:
  mainroad: 0.5876
  guestroom: 0.6756
  basement: 0.9345
  hotwaterheating: 0.2686
  airconditioning: 0.8995
  prefarea: 0.7864
  furnishingstatus: 1.5573
  price: 3.3202
  area: 3.3153
  bedrooms: 1.5364
  bathrooms: 0.1425
  stories: 0.7485
  parking: 0.8638
------------------------------
Entropy results for advertising dataset:
  TV: 3.3219
  Radio: 3.3212
  Newspaper: 3.3219
  Sales: 3.3205
------------------------------
Entropy results for faa_ai_prelim dataset:
  UPDATED: 0.2243
  ENTRY_DATE: 2.7897
  EVENT_LCL_DATE: 3.8502
  EVENT_LCL_TIME: 6.1973
  LOC_CITY_NAME: 6.2787
  LOC_STATE_NAME: 4.3330
  LOC_CNTRY_NAME: -0.0000
  RMK_TEXT: 6.3268
  EVENT_TYPE_DESC: 0.9101
  FSDO_DESC: 5.2054
  REGIST_NBR: 6.3750
  FLT_NBR: 2.5850
  ACFT_OPRTR: 2.5216
  ACFT_MAKE_NAME: 3.8938
  ACFT_MODEL_NAME: 5.5629
  ACFT_DMG_DESC: 1.8199
  FLT_ACTIVITY: 2.0382
  FLT_PHASE: 1.8600
  MAX_INJ_LVL: 1.8474
  FATAL_FLAG: -0.0000
  ACFT_MISSING_FLAG: 0.0000
  FAR_PART: