### Question:
How would you design a Python class for managing and analyzing a dataset in a way that supports both exploratory data analysis (EDA) and predictive modeling? The class should include methods for:

1. Automatically handling missing values.
2. Performing basic descriptive statistics.
3. Encoding categorical variables for machine learning models.
4. Saving and loading the dataset efficiently.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
class DatasetManager:
    """
    Manage a pandas DataFrame for exploratory analysis and model preparation.

    The manager works on its own copy of the input frame and offers
    missing-value imputation, descriptive statistics, categorical encoding,
    column addition, and save/load helpers.
    """

    # File suffix -> storage format understood by save() and load().
    _FORMATS = {
        ".csv": "csv",
        ".parquet": "parquet",
        ".pkl": "pickle",
        ".pickle": "pickle",
    }

    def __init__(self, data: pd.DataFrame):
        """
        Initialize the DatasetManager with a given pandas DataFrame.
        :param data: Input DataFrame to manage. It is copied, so later
            operations do not modify the caller's frame.
        :raises ValueError: If data is not a pandas DataFrame.
        """
        if not isinstance(data, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame.")
        self.dataset = data.copy()
        # Fitted LabelEncoder for every label-encoded column, keyed by column
        # name, so the mapping can be inverted or re-applied to new data.
        self.encoders = {}

    def handle_missing_values(self, strategy="mean", fill_value=None):
        """
        Handle missing values in the dataset (in place).

        'mean' and 'median' only fill numeric columns; missing entries in
        non-numeric columns are left untouched. 'mode' and 'constant' apply
        to every column.
        :param strategy: Strategy for imputation ('mean', 'median', 'mode', or 'constant').
        :param fill_value: Value to use when strategy is 'constant'.
        :raises ValueError: If strategy is unknown, or if strategy is
            'constant' and fill_value is None.
        """
        if strategy == "mean":
            replacement = self.dataset.mean(numeric_only=True)
        elif strategy == "median":
            replacement = self.dataset.median(numeric_only=True)
        elif strategy == "mode":
            modes = self.dataset.mode()
            # mode() has no rows when the frame is empty or every value is
            # missing; there is nothing to fill with, so skip instead of
            # failing on iloc[0].
            replacement = modes.iloc[0] if len(modes.index) > 0 else None
        elif strategy == "constant":
            if fill_value is None:
                raise ValueError("fill_value must be provided when using 'constant' strategy.")
            replacement = fill_value
        else:
            raise ValueError(f"Unknown strategy: {strategy}")

        if replacement is not None:
            # In-place so that frames previously obtained via get_data()
            # keep reflecting the managed data.
            self.dataset.fillna(replacement, inplace=True)
        print("Missing values handled.")

    def get_descriptive_stats(self, include=None):
        """
        Return descriptive statistics as computed by DataFrame.describe().

        With the default include=None pandas summarizes the numeric columns
        when there are any, and falls back to the remaining columns otherwise.
        :param include: Forwarded to DataFrame.describe (for example 'all'
            or a list of dtypes) to select which columns are summarized.
        :return: DataFrame of summary statistics.
        """
        return self.dataset.describe(include=include)

    def encode_categorical(self, columns, method="label"):
        """
        Encode categorical variables.

        Arguments are validated before anything is modified, so a failed
        call leaves the dataset unchanged. For 'label' encoding the fitted
        encoders are kept in self.encoders[column].
        :param columns: List of columns to encode (a single column name is
            also accepted).
        :param method: Encoding method ('label' or 'onehot').
        :raises ValueError: If method is not 'label' or 'onehot'.
        :raises KeyError: If any requested column is not in the dataset.
        """
        if method not in ("label", "onehot"):
            raise ValueError(f"Unknown encoding method: {method}")

        # A bare string would otherwise be iterated character by character.
        if isinstance(columns, str):
            columns = [columns]
        targets = list(columns)

        absent = [col for col in targets if col not in self.dataset.columns]
        if absent:
            raise KeyError(f"Columns not found in dataset: {absent}")

        if method == "label":
            for col in targets:
                encoder = LabelEncoder()
                # Values (including any missing ones) are passed to the
                # encoder unchanged; impute first if that is not desired.
                self.dataset[col] = encoder.fit_transform(self.dataset[col])
                self.encoders[col] = encoder
        else:
            self.dataset = pd.get_dummies(self.dataset, columns=targets)
        print(f"Categorical columns {columns} encoded using {method} encoding.")

    def add_column(self, column_name, values):
        """
        Add a new column to the dataset (or overwrite an existing one).
        :param column_name: Name of the new column.
        :param values: Values to populate the column.
        """
        self.dataset[column_name] = values
        print(f"Column '{column_name}' added.")

    def get_data(self):
        """
        Return the current state of the dataset.

        This is the managed DataFrame itself, not a copy.
        """
        return self.dataset

    @classmethod
    def _format_for(cls, path):
        """
        Map a file path to a storage format based on its suffix.
        :param path: Target file path (str or path-like).
        :return: One of 'csv', 'parquet' or 'pickle'.
        :raises ValueError: If the suffix is not supported.
        """
        from pathlib import Path

        suffix = Path(path).suffix.lower()
        try:
            return cls._FORMATS[suffix]
        except KeyError:
            supported = ", ".join(sorted(cls._FORMATS))
            raise ValueError(
                f"Unsupported file extension '{suffix}'; use one of: {supported}"
            ) from None

    def save(self, path):
        """
        Save the dataset to disk; the format is chosen from the file suffix.

        '.csv' writes text without the index, '.parquet' writes a columnar
        binary file (needs a parquet engine such as pyarrow), '.pkl' or
        '.pickle' writes a pandas pickle. Fitted encoders are not saved.
        :param path: Destination file path.
        :raises ValueError: If the file suffix is not supported.
        """
        fmt = self._format_for(path)
        if fmt == "csv":
            self.dataset.to_csv(path, index=False)
        elif fmt == "parquet":
            self.dataset.to_parquet(path)
        else:
            self.dataset.to_pickle(path)
        print(f"Dataset saved to '{path}'.")

    @classmethod
    def load(cls, path):
        """
        Create a DatasetManager from a file previously written by save().
        :param path: Source file path ('.csv', '.parquet', '.pkl' or '.pickle').
        :return: A new DatasetManager wrapping the loaded data.
        :raises ValueError: If the file suffix is not supported.
        """
        fmt = cls._format_for(path)
        if fmt == "csv":
            frame = pd.read_csv(path)
        elif fmt == "parquet":
            frame = pd.read_parquet(path)
        else:
            # NOTE: unpickling can execute arbitrary code; only load pickle
            # files that come from a trusted source.
            frame = pd.read_pickle(path)
        return cls(frame)

In [3]:
# Example usage: a four-row sample frame in which Name, Age and Salary
# each contain one missing entry (Gender is complete).
sample_columns = {
    "Name": ["Alice", "Bob", "Charlie", None],
    "Age": [25, 30, None, 40],
    "Salary": [50000, 60000, 70000, None],
    "Gender": ["F", "M", "M", "F"],
}
data = pd.DataFrame(sample_columns)

In [4]:
# Wrap the sample frame. The manager stores its own copy, so `data` itself
# is not modified by the calls below.
dm = DatasetManager(data)

In [5]:
# Handle missing values: fill each numeric column (Age, Salary) with its mean.
# The "mean" strategy does not touch non-numeric columns, so the missing Name
# stays missing (see the final output).
dm.handle_missing_values(strategy="mean")


Missing values handled.


In [6]:
# Get descriptive statistics (computed after imputation, so count is 4 for
# both Age and Salary)
print(dm.get_descriptive_stats())

             Age        Salary
count   4.000000      4.000000
mean   31.666667  60000.000000
std     6.236096   8164.965809
min    25.000000  50000.000000
25%    28.750000  57500.000000
50%    30.833333  60000.000000
75%    33.750000  62500.000000
max    40.000000  70000.000000


In [7]:
# Encode categorical variables: Gender is replaced by integer codes
# (F -> 0, M -> 1, as the final output shows)
dm.encode_categorical(columns=["Gender"], method="label")

Categorical columns ['Gender'] encoded using label encoding.


In [8]:
# Add a new column, supplying one value per existing row (4 rows here)
dm.add_column("Experience", [2, 5, 7, 3])

Column 'Experience' added.


In [9]:
# Get final dataset: imputed Age/Salary, encoded Gender and the new
# Experience column
print(dm.get_data())

      Name        Age   Salary  Gender  Experience
0    Alice  25.000000  50000.0       0           2
1      Bob  30.000000  60000.0       1           5
2  Charlie  31.666667  70000.0       1           7
3     None  40.000000  60000.0       0           3
