### Feature Selection - Using Mutual Information
**Description**: Use mutual information for feature selection to identify important features.

In [1]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Create a sample DataFrame for classification
data_classification = {'feature_1': [1, 2, 3, 4, 5, 1, 2, 3, 4, 5],
                       'feature_2': [5, 4, 3, 2, 1, 5, 4, 3, 2, 1],
                       'feature_3': ['A', 'B', 'A', 'C', 'B', 'C', 'A', 'B', 'C', 'A'],
                       'target': ['yes', 'no', 'yes', 'no', 'yes', 'no', 'yes', 'no', 'yes', 'no']}
df_classification = pd.DataFrame(data_classification)

print("Sample DataFrame for Classification:")
print(df_classification)

# Encode the target variable if it's categorical
label_encoder = LabelEncoder()
df_classification['target_encoded'] = label_encoder.fit_transform(df_classification['target'])

X_classification = df_classification.drop(['target', 'target_encoded'], axis=1)
y_classification = df_classification['target_encoded']

# Handle categorical features by one-hot encoding
X_classification = pd.get_dummies(X_classification, drop_first=True)

# Calculate mutual information for classification
mutual_info_class = mutual_info_classif(X_classification, y_classification)

# Create a Series of feature names and their corresponding mutual information scores
mutual_info_series_class = pd.Series(mutual_info_class, index=X_classification.columns)
mutual_info_series_class = mutual_info_series_class.sort_values(ascending=False)

print("\nMutual Information Scores for Classification:")
print(mutual_info_series_class)

# Select the top k features based on mutual information
k_classification = 2  # Select top 2 features
selector_class = SelectKBest(mutual_info_classif, k=k_classification)
X_classification_selected = selector_class.fit_transform(X_classification, y_classification)

# Get the names of the selected features
selected_features_class_indices = selector_class.get_support(indices=True)
selected_features_class = X_classification.columns[selected_features_class_indices]

print(f"\nTop {k_classification} features selected for Classification:")
print(selected_features_class)

print("\n" + "="*50 + "\n")

# Create a sample DataFrame for regression
data_regression = {'feature_1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                     'feature_2': [2, 4, 1, 5, 3, 7, 6, 9, 8, 10],
                     'feature_3': [10, 8, 12, 6, 14, 4, 16, 2, 18, 1],
                     'target': [5.1, 6.8, 4.2, 7.5, 5.9, 8.1, 7.2, 9.4, 8.5, 3.9]}
df_regression = pd.DataFrame(data_regression)

print("Sample DataFrame for Regression:")
print(df_regression)

X_regression = df_regression.drop('target', axis=1)
y_regression = df_regression['target']

# Calculate mutual information for regression
mutual_info_reg = mutual_info_regression(X_regression, y_regression)

# Create a Series of feature names and their corresponding mutual information scores
mutual_info_series_reg = pd.Series(mutual_info_reg, index=X_regression.columns)
mutual_info_series_reg = mutual_info_series_reg.sort_values(ascending=False)

print("\nMutual Information Scores for Regression:")
print(mutual_info_series_reg)

# Select the top k features based on mutual information
k_regression = 2  # Select top 2 features
selector_reg = SelectKBest(mutual_info_regression, k=k_regression)
X_regression_selected = selector_reg.fit_transform(X_regression, y_regression)

# Get the names of the selected features
selected_features_reg_indices = selector_reg.get_support(indices=True)
selected_features_reg = X_regression.columns[selected_features_reg_indices]

print(f"\nTop {k_regression} features selected for Regression:")
print(selected_features_reg)

Sample DataFrame for Classification:
   feature_1  feature_2 feature_3 target
0          1          5         A    yes
1          2          4         B     no
2          3          3         A    yes
3          4          2         C     no
4          5          1         B    yes
5          1          5         C     no
6          2          4         A    yes
7          3          3         B     no
8          4          2         C    yes
9          5          1         A     no

Mutual Information Scores for Classification:
feature_3_C    0.097063
feature_1      0.000000
feature_2      0.000000
feature_3_B    0.000000
dtype: float64

Top 2 features selected for Classification:
Index(['feature_3_B', 'feature_3_C'], dtype='object')


Sample DataFrame for Regression:
   feature_1  feature_2  feature_3  target
0          1          2         10     5.1
1          2          4          8     6.8
2          3          1         12     4.2
3          4          5          6     7.5
4    