In [18]:
import os
import mimetypes
import pandas as pd
import arff
from pymfe.mfe import MFE
from datetime import datetime
import numpy as np  # For handling NaN values

def extract_meta_features(file_path):
    """Extract pymfe meta-features from an ARFF file.

    Args:
        file_path: Path of the ARFF file to load.

    Returns:
        A ``(meta_features, meta_values)`` pair of lists: the meta-feature
        names and their corresponding values, in the order returned by
        ``MFE.extract()``.

    Raises:
        KeyError: If the file has neither a ``class`` nor a ``binaryClass``
            attribute to use as the label column.
    """
    # Use a context manager so the file handle is closed even if parsing fails.
    with open(file_path, 'r') as arff_file:
        data = arff.load(arff_file)
    cols = [attribute[0] for attribute in data["attributes"]]
    df = pd.DataFrame(data["data"], columns=cols)

    # Features: every column except the last two.
    # NOTE(review): this positional slice drops one column besides the label
    # when the label is the final column -- confirm that this is intended for
    # the datasets this is run on.
    X = df.to_numpy()[:, :-2]

    # Labels: prefer the 'class' column and fall back to 'binaryClass'. Only a
    # missing column triggers the fallback; any other error propagates.
    try:
        y = df["class"].to_numpy()
    except KeyError:
        y = df["binaryClass"].to_numpy()

    # Extract meta-features from all pymfe groups.
    mfe = MFE(groups="all")
    mfe.fit(X, y)
    ft = mfe.extract()

    # ft[0] holds the feature names and ft[1] the matching values.
    meta_features = list(ft[0])
    meta_values = list(ft[1])

    return meta_features, meta_values

def generate_data_card(file_path):
    """Build a Markdown-style data card for the dataset at ``file_path``.

    The card combines file metadata (name, size, guessed MIME type, ctime),
    pre-filled descriptive text about the soybean dataset, and the
    meta-features returned by ``extract_meta_features``.

    Args:
        file_path: Path of the dataset file. It is passed unchanged to
            ``extract_meta_features``, which parses it as ARFF.

    Returns:
        The complete data card as a single string, ending with one bullet
        line per extracted meta-feature.

    Raises:
        OSError: If the file's size or timestamp cannot be read.
    """
    # Extract file metadata
    file_name = os.path.basename(file_path)
    file_size = os.path.getsize(file_path)
    file_type, _ = mimetypes.guess_type(file_path)
    # NOTE: getctime is the creation time on Windows but the last
    # metadata-change time on Unix, so this date is only an approximation.
    creation_time = datetime.fromtimestamp(os.path.getctime(file_path))

    def human_readable_size(size, decimal_places=2):
        """Format a byte count using the largest unit that keeps it < 1024."""
        for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
            if size < 1024:
                return f"{size:.{decimal_places}f} {unit}"
            size /= 1024
        # Anything of 1024 TB or more is reported in petabytes instead of
        # falling off the loop and returning None.
        return f"{size:.{decimal_places}f} PB"

    def format_value(value):
        """Render a meta-feature value: 4 decimals for float/int, else str()."""
        if isinstance(value, (float, int)):
            return f"{value:.4f}" if not np.isnan(value) else "NaN"
        return str(value)

    # Pre-filled manual information
    purpose = "For machine learning classification tasks, including fine-tuning models and evaluating logistic regression."
    maintainer = "MLflow/Kedro Pipeline Maintainer"
    license_info = "MIT License"
    description = """This is the large soybean database from the UCI repository, combining training and test data into a single file.

There are 19 classes, but prior studies have only used the first 15. The last four classes are considered unsupported due to the small number of examples. The dataset has 35 categorical attributes, some of which are nominal, while others are ordered. 'dna' denotes 'does not apply,' and unknown values are represented as '?'. Attributes are numerically encoded, with '0' for the first value, '1' for the second, and so on.

Source: UCI - 1988
Please cite: R.S. Michalski and R.L. Chilausky, 'Learning by Being Told and Learning from Examples: An Experimental Comparison of the Two Methods of Knowledge Acquisition in the Context of Developing an Expert System for Soybean Disease Diagnosis,' International Journal of Policy Analysis and Information Systems, Vol. 4, No. 2, 1980.
"""

    # Extract meta-features (the file is always parsed as ARFF; no type check
    # is performed here).
    meta_features, meta_values = extract_meta_features(file_path)
    size_str = f"{len(meta_features)} meta-features extracted"

    # Generate the dynamic data card
    data_card = f"""
    ### Data Card for {file_name}

    **1. Dataset Overview**
    - **Name of Dataset**: {file_name}
    - **Date of Dataset Creation**: {creation_time.strftime('%Y-%m-%d')}
    - **Maintainer(s)**: {maintainer}
    - **File Type**: {file_type or 'Unknown'}
    - **Size**: {size_str}
    - **License**: {license_info}
    - **Purpose**: {purpose}

    **2. Clarity (C)**
    - **Description**: {description}
    - **Schema**: 35 categorical attributes (some nominal, some ordered), encoded numerically with unknown values as '?'.
    - **Intended Use**: Machine learning classification tasks, including disease diagnosis modeling for soybeans.
    - **Not Intended for**: Datasets that require numeric data or dense continuous features.

    **3. Limitations (L)**
    - **Known Limitations**: 
      - Only 15 out of 19 classes have been extensively used in previous studies. The last four classes may not be well supported due to the small sample sizes.
      - Certain categorical attributes, such as 'dna' (does not apply), may not generalize to broader contexts.
    - **Data Collection Process**: The dataset was collected in 1988, so there may be limitations in attribute encoding and class definitions relevant to that period.
    - **Legal/Ethical Considerations**: The dataset is publicly available, but its use should comply with data handling policies.

    **4. Evaluation (E)**
    - **Data Quality**: The dataset is well-structured, with categorical attributes labeled clearly. However, some classes have very few examples.
    - **Validation and Testing**: 
      - Commonly used in model fine-tuning tasks like logistic regression and classification.
      - It has been extensively studied, with results available for the first 15 classes.
    - **Performance Benchmarks**: Historical studies used logistic regression models and fine-tuning techniques for classification, with a focus on soybean disease diagnosis.

    **5. Accessibility (A)**
    - **Access**: Publicly accessible through the UCI repository and integrated into MLflow/Kedro pipelines.
    - **Requirements**: Requires Python 3.7+ with relevant libraries such as Pandas, pymfe, and arff for meta-feature extraction.
    - **Format**: {file_type or 'ARFF'}
    - **Size**: {human_readable_size(file_size)}

    **6. Record-Keeping (R)**
    - **Versioning**: Initial version created from the UCI repository (large soybean dataset).
    - **Update Frequency**: Infrequent; updates may only occur if the dataset is modified or new analysis is added.
    - **Provenance**: This dataset has been used in studies such as 'Learning by Being Told and Learning from Examples' for soybean disease diagnosis.
    - **Citations**: {description.split('Please cite: ')[-1]}

    **7. Meta-features (Generated Automatically)**
    """

    # The closing-quote line leaves four trailing spaces after the last
    # newline; drop them so the first bullet is not indented twice as deep as
    # the others.
    data_card = data_card.rstrip(" ")

    # One bullet per meta-feature, assembled in a single join.
    data_card += "".join(
        f"    - **{name}**: {format_value(value)}\n"
        for name, value in zip(meta_features, meta_values)
    )

    return data_card

# Example usage: build the data card for a raw ARFF file and print it.
file_path = "../data/01_raw/dataset_42_soybean.arff"
# file_path = "../data/01_raw/soybean.arff"  # alternative input; update this path as needed
data_card = generate_data_card(file_path)
print(data_card)


TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").
TypeError("'<' not supported between instances of 'NoneType' and 'str'").
TypeError("'<' not supported between instances of 'NoneType' and 'str'").
 Exception message: TypeError("'<' not supported between instances of 'NoneType' and 'str'").
 Exception message: TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").
 Will set it as 'np.nan' for all summary functions.
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
 Exception message: TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").
 Exception message: TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").
 Exception message: TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").
 Exception message: TypeError("OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'").
  np.log(np.linalg.d

  intra_extra[cur_ind:next_ind] = intra / extra



    ### Data Card for dataset_42_soybean.arff

    **1. Dataset Overview**
    - **Name of Dataset**: dataset_42_soybean.arff
    - **Date of Dataset Creation**: 2024-09-25
    - **Maintainer(s)**: MLflow/Kedro Pipeline Maintainer
    - **File Type**: Unknown
    - **Size**: 179 meta-features extracted
    - **License**: MIT License
    - **Purpose**: For machine learning classification tasks, including fine-tuning models and evaluating logistic regression.

    **2. Clarity (C)**
    - **Description**: This is the large soybean database from the UCI repository, combining training and test data into a single file.

There are 19 classes, but prior studies have only used the first 15. The last four classes are considered unsupported due to the small number of examples. The dataset has 35 categorical attributes, some of which are nominal, while others are ordered. 'dna' denotes 'does not apply,' and unknown values are represented as '?'. Attributes are numerically encoded, with '0' for t

 Exception message: TypeError("'<' not supported between instances of 'NoneType' and 'str'").
 Will set it as 'np.nan' for all summary functions.
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
 Exception message: TypeError("'<' not supported between instances of 'NoneType' and 'str'").
 Will set it as 'np.nan' for all summary functions.


### Data Card for soybean.arff

    **1. Dataset Overview**
    - **Name of Dataset**: soybean.arff
    - **Date of Dataset Creation**: 2024-09-25
    - **Maintainer(s)**: MLflow/Kedro Pipeline Maintainer
    - **File Type**: Unknown
    - **Size**: 179 meta-features extracted
    - **License**: MIT License
    - **Purpose**: For machine learning classification tasks, including fine-tuning models and evaluating logistic regression.

    **2. Clarity (C)**
    - **Description**: This is the large soybean database from the UCI repository, combining training and test data into a single file.

There are 19 classes, but prior studies have only used the first 15. The last four classes are considered unsupported due to the small number of examples. The dataset has 35 categorical attributes, some of which are nominal, while others are ordered. 'dna' denotes 'does not apply,' and unknown values are represented as '?'. Attributes are numerically encoded, with '0' for the first value, '1' for the second, and so on.

Source: UCI - 1988
Please cite: R.S. Michalski and R.L. Chilausky, 'Learning by Being Told and Learning from Examples: An Experimental Comparison of the Two Methods of Knowledge Acquisition in the Context of Developing an Expert System for Soybean Disease Diagnosis,' International Journal of Policy Analysis and Information Systems, Vol. 4, No. 2, 1980.

    - **Schema**: 35 categorical attributes (some nominal, some ordered), encoded numerically with unknown values as '?'.
    - **Intended Use**: Machine learning classification tasks, including disease diagnosis modeling for soybeans.
    - **Not Intended for**: Datasets that require numeric data or dense continuous features.

    **3. Limitations (L)**
    - **Known Limitations**: 
      - Only 15 out of 19 classes have been extensively used in previous studies. The last four classes may not be well supported due to the small sample sizes.
      - Certain categorical attributes, such as 'dna' (does not apply), may not generalize to broader contexts.
    - **Data Collection Process**: The dataset was collected in 1988, so there may be limitations in attribute encoding and class definitions relevant to that period.
    - **Legal/Ethical Considerations**: The dataset is publicly available, but its use should comply with data handling policies.

    **4. Evaluation (E)**
    - **Data Quality**: The dataset is well-structured, with categorical attributes labeled clearly. However, some classes have very few examples.
    - **Validation and Testing**: 
      - Commonly used in model fine-tuning tasks like logistic regression and classification.
      - It has been extensively studied, with results available for the first 15 classes.
    - **Performance Benchmarks**: Historical studies used logistic regression models and fine-tuning techniques for classification, with a focus on soybean disease diagnosis.

    **5. Accessibility (A)**
    - **Access**: Publicly accessible through the UCI repository and integrated into MLflow/Kedro pipelines.
    - **Requirements**: Requires Python 3.7+ with relevant libraries such as Pandas, pymfe, and arff for meta-feature extraction.
    - **Format**: ARFF
    - **Size**: 160.59 KB

    **6. Record-Keeping (R)**
    - **Versioning**: Initial version created from the UCI repository (large soybean dataset).
    - **Update Frequency**: Infrequent; updates may only occur if the dataset is modified or new analysis is added.
    - **Provenance**: This dataset has been used in studies such as 'Learning by Being Told and Learning from Examples' for soybean disease diagnosis.
    - **Citations**: R.S. Michalski and R.L. Chilausky, 'Learning by Being Told and Learning from Examples: An Experimental Comparison of the Two Methods of Knowledge Acquisition in the Context of Developing an Expert System for Soybean Disease Diagnosis,' International Journal of Policy Analysis and Information Systems, Vol. 4, No. 2, 1980.


    **7. Meta-features (Generated Automatically)**
        - **attr_conc.mean**: 0.0912
    - **attr_conc.sd**: 0.1392
    - **attr_ent.mean**: NaN
    - **attr_ent.sd**: NaN
    - **attr_to_inst**: 0.0498
    - **best_node.mean**: 0.8462
    - **best_node.mean.relative**: 2.0000
    - **best_node.sd**: 0.0599
    - **best_node.sd.relative**: 5.0000
    - **c1**: 0.5702
    - **c2**: 0.6960
    - **can_cor.mean**: NaN
    - **can_cor.sd**: NaN
    - **cat_to_num**: NaN
    - **ch**: 27.3375
    - **class_conc.mean**: 0.0339
    - **class_conc.sd**: 0.0344
    - **class_ent**: 0.5702
    - **cls_coef**: 0.4217
    - **cohesiveness.mean**: 18.2259
    - **cohesiveness.sd**: 5.3743
    - **conceptvar.mean**: 0.2315
    - **conceptvar.sd**: 0.2206
    - **cor.mean**: 0.1415
    - **cor.sd**: 0.1477
    - **cov.mean**: 0.0213
    - **cov.sd**: 0.0290
    - **density**: 0.8404
    - **eigenvalues.mean**: 0.1461
    - **eigenvalues.sd**: 0.3605
    - **elite_nn.mean**: 0.7582
    - **elite_nn.mean.relative**: 1.0000
    - **elite_nn.sd**: 0.2305
    - **elite_nn.sd.relative**: 7.0000
    - **eq_num_attr**: NaN
    - **f1.mean**: 0.9696
    - **f1.sd**: 0.0359
    - **f1v.mean**: 0.0379
    - **f1v.sd**: NaN
    - **f2.mean**: 0.0000
    - **f2.sd**: NaN
    - **f3.mean**: 0.4788
    - **f3.sd**: NaN
    - **f4.mean**: 0.0908
    - **f4.sd**: NaN
    - **freq_class.mean**: 0.5000
    - **freq_class.sd**: 0.5166
    - **g_mean.mean**: 0.0000
    - **g_mean.sd**: 0.0000
    - **gravity**: 2.1536
    - **h_mean.mean**: 0.0000
    - **h_mean.sd**: 0.0000
    - **hubs.mean**: 0.6728
    - **hubs.sd**: 0.2443
    - **impconceptvar.mean**: 4.4688
    - **impconceptvar.sd**: 5.1014
    - **inst_to_attr**: 20.0882
    - **int**: 5.0478
    - **iq_range.mean**: 0.3646
    - **iq_range.sd**: 0.4838
    - **joint_ent.mean**: 1.5603
    - **joint_ent.sd**: 0.5544
    - **kurtosis.mean**: 12.5895
    - **kurtosis.sd**: 69.9188
    - **l1.mean**: 0.0016
    - **l1.sd**: NaN
    - **l2.mean**: 0.0029
    - **l2.sd**: NaN
    - **l3.mean**: 0.0234
    - **l3.sd**: NaN
    - **leaves**: 26
    - **leaves_branch.mean**: 6.7308
    - **leaves_branch.sd**: 2.3758
    - **leaves_corrob.mean**: 0.0385
    - **leaves_corrob.sd**: 0.1113
    - **leaves_homo.mean**: 840.0317
    - **leaves_homo.sd**: 915.6415
    - **leaves_per_class.mean**: 0.5000
    - **leaves_per_class.sd**: 0.1088
    - **lh_trace**: NaN
    - **linear_discr.mean**: 0.9751
    - **linear_discr.mean.relative**: 7.0000
    - **linear_discr.sd**: 0.0240
    - **linear_discr.sd.relative**: 3.0000
    - **lsc**: 0.9167
    - **mad.mean**: 0.0000
    - **mad.sd**: 0.0000
    - **max.mean**: 1.0000
    - **max.sd**: 0.0000
    - **mean.mean**: 0.3082
    - **mean.sd**: 0.2608
    - **median.mean**: 0.2604
    - **median.sd**: 0.4412
    - **min.mean**: 0.0000
    - **min.sd**: 0.0000
    - **mut_inf.mean**: NaN
    - **mut_inf.sd**: NaN
    - **n1**: 0.0776
    - **n2.mean**: 0.1898
    - **n2.sd**: 0.1455
    - **n3.mean**: 0.0293
    - **n3.sd**: 0.1687
    - **n4.mean**: 0.2533
    - **n4.sd**: 0.4352
    - **naive_bayes.mean**: 0.8634
    - **naive_bayes.mean.relative**: 3.0000
    - **naive_bayes.sd**: 0.2045
    - **naive_bayes.sd.relative**: 6.0000
    - **nodes**: 25
    - **nodes_per_attr**: 0.2604
    - **nodes_per_inst**: 0.0366
    - **nodes_per_level.mean**: 2.5000
    - **nodes_per_level.sd**: 1.1785
    - **nodes_repeated.mean**: 1.1905
    - **nodes_repeated.sd**: 0.5118
    - **nr_attr**: 34.0000
    - **nr_bin**: NaN
    - **nr_cat**: 34.0000
    - **nr_class**: 2.0000
    - **nr_cor_attr**: 0.0388
    - **nr_disc**: NaN
    - **nr_inst**: 683.0000
    - **nr_norm**: 0.0000
    - **nr_num**: 0.0000
    - **nr_outliers**: 61
    - **nre**: 0.3952
    - **ns_ratio**: NaN
    - **num_to_cat**: 0.0000
    - **one_itemset.mean**: NaN
    - **one_itemset.sd**: NaN
    - **one_nn.mean**: 0.9545
    - **one_nn.mean.relative**: 6.0000
    - **one_nn.sd**: 0.0343
    - **one_nn.sd.relative**: 4.0000
    - **p_trace**: NaN
    - **pb**: 0.0952
    - **random_node.mean**: 0.8653
    - **random_node.mean.relative**: 4.5000
    - **random_node.sd**: 0.0054
    - **random_node.sd.relative**: 1.5000
    - **range.mean**: 1.0000
    - **range.sd**: 0.0000
    - **roy_root**: NaN
    - **sc**: 0
    - **sd.mean**: 0.3651
    - **sd.sd**: 0.1139
    - **sd_ratio**: NaN
    - **sil**: -0.0105
    - **skewness.mean**: 1.8104
    - **skewness.sd**: 3.3782
    - **sparsity.mean**: NaN
    - **sparsity.sd**: NaN
    - **t1**: 0.1303
    - **t2**: 0.1406
    - **t3**: 0.0542
    - **t4**: 0.3854
    - **t_mean.mean**: 0.2607
    - **t_mean.sd**: 0.3487
    - **tree_depth.mean**: 5.8824
    - **tree_depth.sd**: 2.5428
    - **tree_imbalance.mean**: 0.0901
    - **tree_imbalance.sd**: 0.1142
    - **tree_shape.mean**: 0.1076
    - **tree_shape.sd**: 0.1229
    - **two_itemset.mean**: NaN
    - **two_itemset.sd**: NaN
    - **var.mean**: 0.1461
    - **var.sd**: 0.0756
    - **var_importance.mean**: 0.0104
    - **var_importance.sd**: 0.0356
    - **vdb**: 2.9833
    - **vdu**: 0.0000
    - **w_lambda**: NaN
    - **wg_dist.mean**: 4.7267
    - **wg_dist.sd**: 0.4153
    - **worst_node.mean**: 0.8653
    - **worst_node.mean.relative**: 4.5000
    - **worst_node.sd**: 0.0054
    - **worst_node.sd.relative**: 1.5000