In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the Dry Bean dataset from the Excel workbook 'DryBeanDataset.xlsx'.
# The path is relative, so the file must sit in the notebook's working directory.
df = pd.read_excel('DryBeanDataset.xlsx')

In [None]:
# Show the first rows of the loaded frame as a quick sanity check.
df.head()

In [None]:
def get_feature_stats(df):
    """Summarise every column of ``df`` in a one-row-per-feature table.

    The placeholder ``'?'`` is treated as a missing value. After it is
    replaced with NaN, object columns are re-inferred so that a column such
    as ``[0.5, '?', 0.7]`` becomes numeric and receives numeric statistics.
    This is done explicitly instead of relying on the implicit downcasting
    that ``DataFrame.replace`` performs only in some pandas versions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df`` with the columns
        'Data Type', 'Non-Null Count', 'Null Count', 'Null Percentage',
        'Unique Values', 'Min', 'Max', 'Mean', 'Median', 'Std Dev',
        'Skewness', 'Kurtosis' and 'Mode'. The numeric statistics are NaN
        for non-numeric features.
    """
    # Replace '?' with NaN, then re-infer dtypes of the object columns.
    df_clean = df.replace('?', np.nan).infer_objects()

    null_count = df_clean.isnull().sum()
    stats_df = pd.DataFrame({
        'Data Type': df_clean.dtypes,
        'Non-Null Count': df_clean.count(),
        'Null Count': null_count,
        'Null Percentage': (null_count / len(df_clean)) * 100,
        'Unique Values': df_clean.nunique(),
    })

    # Numeric statistics are computed on the numeric columns only.
    numeric = df_clean.select_dtypes(include=[np.number])
    numeric_stats = {
        'Min': numeric.min,
        'Max': numeric.max,
        'Mean': numeric.mean,
        'Median': numeric.median,
        'Std Dev': numeric.std,
        'Skewness': numeric.skew,
        'Kurtosis': numeric.kurtosis,
    }
    for label, compute in numeric_stats.items():
        # reindex leaves NaN for every non-numeric feature and also works
        # when there are no numeric columns at all.
        stats_df[label] = compute().reindex(stats_df.index)

    # Mode applies to numeric and categorical data alike; the first mode is
    # reported. A frame without rows has no mode, so fall back to NaN
    # instead of raising IndexError.
    modes = df_clean.mode()
    stats_df['Mode'] = modes.iloc[0] if len(modes) else np.nan

    return stats_df

# Build the per-feature summary table for the loaded dataset.
feature_stats = get_feature_stats(df)


In [None]:
# Display the summary table produced by get_feature_stats.
feature_stats

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def plot_numerical_features(df, numerical_columns=None, features_per_plot=9, exclude_features=None, figsize=(20, 5), n_cols=3):
    """Plot a histogram with KDE for each numerical feature, several per figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    numerical_columns : list of str, optional
        Features to plot. Names that are not columns of ``df`` are skipped
        silently. When omitted, every column with a numeric dtype is used.
    features_per_plot : int, default 9
        Maximum number of features drawn on one figure. Must be >= 1.
    exclude_features : list of str, optional
        Features to leave out of the plots.
    figsize : tuple of (float, float), default (20, 5)
        Width of a figure and height of ONE row of subplots; the figure
        height is ``figsize[1]`` times the number of rows.
    n_cols : int, default 3
        Number of subplot columns per figure. Must be >= 1.

    Raises
    ------
    ValueError
        If ``features_per_plot`` or ``n_cols`` is smaller than 1.

    Returns
    -------
    None
        Figures are shown with ``plt.show()``. Nothing is drawn when no
        feature is left to plot.
    """
    # Validate first: a zero here would otherwise surface as an opaque
    # ZeroDivisionError / empty-grid error further down.
    if features_per_plot < 1:
        raise ValueError("features_per_plot must be at least 1")
    if n_cols < 1:
        raise ValueError("n_cols must be at least 1")

    excluded = [] if exclude_features is None else exclude_features

    if numerical_columns is None:
        # np.number also covers int32/float32 etc., not only int64/float64.
        candidates = df.select_dtypes(include=[np.number]).columns
    else:
        candidates = [col for col in numerical_columns if col in df.columns]

    numerical_features = [f for f in candidates if f not in excluded]

    chunk_starts = range(0, len(numerical_features), features_per_plot)
    for plot_num, start_idx in enumerate(chunk_starts, start=1):
        plot_features = numerical_features[start_idx:start_idx + features_per_plot]

        # Ceiling division: rows needed to hold this chunk.
        n_rows = (len(plot_features) - 1) // n_cols + 1

        # squeeze=False always yields a 2-D array, so flatten() is safe
        # even for a 1x1 grid.
        fig, axes = plt.subplots(n_rows, n_cols, squeeze=False,
                                 figsize=(figsize[0], figsize[1] * n_rows))
        fig.suptitle(f'Distribution of Numerical Features (Part {plot_num})', fontsize=16)
        axes = axes.flatten()

        for ax, feature in zip(axes, plot_features):
            # Convert to numeric, coercing errors (e.g. '?') to NaN
            series = pd.to_numeric(df[feature], errors='coerce')

            # Plot histogram with KDE
            sns.histplot(series.dropna(), kde=True, ax=ax)
            ax.set_title(feature)
            ax.set_xlabel('')

            # Add text with basic stats
            stats_text = f"Mean: {series.mean():.2f}\nStd: {series.std():.2f}\n"
            stats_text += f"Min: {series.min():.2f}\nMax: {series.max():.2f}"
            ax.text(0.95, 0.95, stats_text, transform=ax.transAxes,
                    verticalalignment='top', horizontalalignment='right',
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

        # Remove the unused subplots of the last row
        for unused_ax in axes[len(plot_features):]:
            fig.delaxes(unused_ax)

        plt.tight_layout()
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

# Usage: candidate features to plot. plot_numerical_features skips any name
# that is not a column of df, and 'EquivDiameter' is excluded explicitly.
numerical_columns = [
    'Area',
    'Perimeter',
    'MajorAxisLength',
    'MinorAxisLength',
    'AspectRation',
    'Eccentricity',
    'ConvexArea',
    'EquivDiameter',
    'Extent',
    'Solidity',
    'roundness',
    'Compactness',
    'ShapeFactor1',
    'ShapeFactor2',
    'ShapeFactor3',
    'ShapeFactor4',
    'ShapeFactor5',
    'ShapeFactor6',
    'Sort order',
]

plot_numerical_features(
    df,
    numerical_columns=numerical_columns,
    features_per_plot=9,
    exclude_features=['EquivDiameter'],
    figsize=(20, 5),
)

In [None]:
def plot_categorical_features(df, categorical_columns=None, max_categories=10, figsize=(12, 5)):
    """Draw one horizontal bar chart of category counts per categorical feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    categorical_columns : list of str, optional
        Features to plot. Names that are not columns of ``df`` are skipped
        silently. When omitted, every ``object`` or ``category`` column is
        used.
    max_categories : int, default 10
        Only the ``max_categories`` most frequent values get a bar; the
        remainder is summarised in an "Others" text label.
    figsize : tuple of (float, float), default (12, 5)
        Width of the figure and height of ONE subplot; the figure height is
        ``figsize[1]`` times the number of features.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when there
        is no feature to plot.
    """
    if categorical_columns is None:
        categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    else:
        categorical_columns = [col for col in categorical_columns if col in df.columns]

    n_features = len(categorical_columns)
    if n_features == 0:
        # plt.subplots(0, 1) raises, so there is nothing sensible to draw.
        return

    # squeeze=False always returns a 2-D array, so a single feature needs
    # no special case.
    fig, axes = plt.subplots(n_features, 1, squeeze=False,
                             figsize=(figsize[0], figsize[1] * n_features))
    fig.suptitle('Distribution of Categorical Features', fontsize=16)
    axes = axes.flatten()

    for ax, feature in zip(axes, categorical_columns):
        # Get value counts and limit to top categories
        value_counts = df[feature].value_counts()
        top_categories = value_counts.nlargest(max_categories)

        # Plot top categories
        sns.barplot(x=top_categories.values, y=top_categories.index, ax=ax, orient='h')

        # Add count and percentage (of the non-null values) to each bar
        total = len(df[feature].dropna())
        for j, (name, count) in enumerate(top_categories.items()):
            percentage = count / total * 100
            ax.text(count, j, f' {count} ({percentage:.1f}%)', va='center')

        ax.set_title(feature)
        ax.set_xlabel('Count')

        # Add "Others" category if necessary
        if len(value_counts) > max_categories:
            others_count = value_counts.iloc[max_categories:].sum()
            others_percentage = others_count / total * 100
            ax.text(0, max_categories, f'Others: {others_count} ({others_percentage:.1f}%)', va='center')

    plt.tight_layout()
    plt.show()

# Usage: specify the actual categorical columns to plot
categorical_columns = ['Colour', 'Class']
plot_categorical_features(
    df,
    categorical_columns=categorical_columns,
    max_categories=10,
    figsize=(12, 4),
)

In [None]:
def plot_correlation_matrix(df):
    """Show an annotated heatmap of the pairwise correlations of numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. Every column with a numeric dtype takes part; other
        columns are ignored.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when
        ``df`` has no numeric column.
    """
    # np.number matches the numeric detection used by the summary helpers
    # and also covers int32/float32, not only int64/float64.
    numerical_features = df.select_dtypes(include=[np.number]).columns
    if len(numerical_features) == 0:
        # An empty correlation matrix cannot be rendered as a heatmap.
        return

    corr_matrix = df[numerical_features].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, fmt='.2f')
    plt.title('Correlation Matrix of Numerical Features')
    plt.tight_layout()
    plt.show()

# Draw the correlation heatmap for the loaded dataset.
plot_correlation_matrix(df)

In [None]:
def analyze_target_variable(df):
    """Print the percentage share of each 'Class' label and plot the label counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a 'Class' column.

    Returns
    -------
    None
        The shares are printed and a horizontal count plot, ordered from
        the most to the least frequent label, is shown.
    """
    print("Class Distribution:")
    labels = df['Class']
    share_pct = labels.value_counts(normalize=True) * 100
    print(share_pct)

    plt.figure(figsize=(10, 6))
    # value_counts() sorts by frequency, which fixes the bar order.
    labels_by_frequency = labels.value_counts().index
    ax = sns.countplot(y='Class', data=df, order=labels_by_frequency)
    ax.set_title('Distribution of Bean Classes')
    ax.set_xlabel('Count')
    ax.set_ylabel('Class')
    plt.show()

# Report and plot the class balance of the target column.
analyze_target_variable(df)

| Feature Name | Data Type | Range/Unique Values | Central Tendency | Dispersion | Missing Values | Notes |
|--------------|-----------|---------------------|-------------------|------------|----------------|-------|
| Area | int64 | Min: 20420, Max: 254616 | Mean: 53048.28, Median: 44652 | Std Dev: 29324.10 | Count: 0 | Represents the area of the bean in pixels. Highly skewed (2.95) |
| Perimeter | float64 | Min: 524.74, Max: 1985.37 | Mean: 855.28, Median: 794.94 | Std Dev: 214.29 | Count: 0 | Perimeter of the bean |
| MajorAxisLength | float64 | Min: 183.60, Max: 738.86 | Mean: 320.14, Median: 296.88 | Std Dev: 85.69 | Count: 0 | Length of the major axis of the bean |
| MinorAxisLength | float64 | Min: 122.51, Max: 460.20 | Mean: 202.27, Median: 192.43 | Std Dev: 44.97 | Count: 0 | Length of the minor axis of the bean |
| AspectRation | float64 | Min: 1.02, Max: 2.43 | Mean: 1.58, Median: 1.55 | Std Dev: 0.25 | Count: 0 | Ratio of major axis to minor axis |
| Eccentricity | float64 | Min: 0.22, Max: 0.91 | Mean: 0.75, Median: 0.76 | Std Dev: 0.09 | Count: 0 | Eccentricity of the ellipse |
| ConvexArea | int64 | Min: -30, Max: 263261 | Mean: 53765.69, Median: 45178 | Std Dev: 29778.01 | Count: 0 | Convex hull area. Unusual negative minimum value |
| Constantness | int64 | Min: 0, Max: 1 | Mean: 0.90, Median: 1 | Std Dev: 0.30 | Count: 0 | Binary feature |
| EquivDiameter | float64 | Min: 0.16, Max: 3014441 | Mean: 476.25, Median: 238.44 | Std Dev: 25836.87 | Count: 0 | Extremely large range and high skewness (116.65) |
| Colour | object | Unique values: brown, black, green, white | Mode: brown | N/A | Count: 6 | Categorical feature |
| Extent | float64 | Min: 0.56, Max: 0.87 | Mean: 0.75, Median: 0.76 | Std Dev: 0.05 | Count: 6 | Ratio of area to bounding rectangle area |
| Solidity | float64 | Min: 0.92, Max: 0.99 | Mean: 0.99, Median: 0.99 | Std Dev: 0.005 | Count: 0 | Ratio of area to convex hull area. Very narrow range |
| roundness | float64 | Min: 0.49, Max: 0.99 | Mean: 0.87, Median: 0.88 | Std Dev: 0.06 | Count: 0 | Measure of how circular the bean is |
| Compactness | float64 | Min: 0.64, Max: 0.99 | Mean: 0.80, Median: 0.80 | Std Dev: 0.06 | Count: 18 | Ratio of equivalent diameter to major axis length (a roundness measure) |
| ShapeFactor1 | float64 | Min: 0.003, Max: 0.01 | Mean: 0.007, Median: 0.007 | Std Dev: 0.001 | Count: 0 | Shape descriptor |
| ShapeFactor2 | float64 | Min: 0.0006, Max: 0.004 | Mean: 0.002, Median: 0.002 | Std Dev: 0.0006 | Count: 0 | Shape descriptor |
| ShapeFactor3 | float64 | Min: 0.41, Max: 0.97 | Mean: 0.64, Median: 0.64 | Std Dev: 0.10 | Count: 0 | Shape descriptor |
| ShapeFactor4 | float64 | Min: 0.70, Max: 3.97 | Mean: 2.37, Median: 2.37 | Std Dev: 0.87 | Count: 0 | Shape descriptor |
| ShapeFactor5 | float64 | Min: 0.95, Max: 1.00 | Mean: 1.00, Median: 1.00 | Std Dev: 0.004 | Count: 0 | Shape descriptor |
| ShapeFactor6 | float64 | Min: 0.0005, Max: 179.0 | Mean: 89.36, Median: 88.77 | Std Dev: 51.84 | Count: 5 | Shape descriptor. Wide range |
| Class | object | Unique values: DERMASON, SIRA, SEKER, HOROZ, CALI, BARBUNYA, BOMBAY | Mode: DERMASON | N/A | Count: 17 | Target variable |
| Sort order | float64 | Min: 0.00009, Max: 1.00 | Mean: 0.50, Median: 0.50 | Std Dev: 0.29 | Count: 0 | Purpose unclear, might be an artifact of data collection |

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

def enhance_feature_table(df):
    """Build a per-feature data-quality report for ``df``.

    The placeholder ``'?'`` and NaN are both treated as missing. Each column
    is cleaned once: the placeholder is masked and the dtype is re-inferred,
    so a column such as ``[0.5, '?', 0.7]`` (loaded as object) is analysed
    as numeric instead of being reported as categorical.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df`` with the columns
        'Data Type' (dtype as stored in ``df``), 'Range', 'Central Tendency',
        'Dispersion', 'Missing Values', 'Unique Values', 'Distribution',
        'Outliers' (1.5 * IQR rule), 'Data Quality Concerns' and
        'High Correlations' (other numeric features with |r| > 0.9; 'N/A'
        for non-numeric features).
    """
    n_rows = len(df)
    report_columns = [
        'Data Type', 'Range', 'Central Tendency', 'Dispersion', 'Missing Values',
        'Unique Values', 'Distribution', 'Outliers', 'Data Quality Concerns',
    ]

    def with_pct(count):
        """Format ``count`` with its share of all rows (0.0% for an empty frame)."""
        share = count / n_rows * 100 if n_rows else 0.0
        return f"{count} ({share:.1f}%)"

    def safe_stat(values, func):
        """Apply ``func`` to a NaN-free array; NaN when it cannot be computed."""
        if len(values) == 0:
            return np.nan
        try:
            return float(func(values))
        except (TypeError, ValueError):
            return np.nan

    def describe(raw):
        """Return (report row, float series or None) for one column."""
        # fillna/astype keep the mask usable for nullable dtypes whose
        # comparison result may contain <NA>.
        placeholder = (raw == '?').fillna(False).astype(bool)
        n_placeholder = int(placeholder.sum())
        n_null = int(raw.isna().sum())

        clean = (raw.mask(placeholder) if n_placeholder else raw).infer_objects()
        is_numeric = pd.api.types.is_numeric_dtype(clean)
        concerns = []

        if is_numeric:
            values = clean.astype(float)
            # scipy's skew/kurtosis propagate NaN, so drop it first.
            observed = values.dropna().to_numpy()
            skew = safe_stat(observed, stats.skew)
            kurt = safe_stat(observed, stats.kurtosis)

            q1, q3 = values.quantile([0.25, 0.75])
            iqr = q3 - q1
            is_outlier = (values < q1 - 1.5 * iqr) | (values > q3 + 1.5 * iqr)
            n_outliers = int(is_outlier.sum())

            value_range = f"{values.min()} - {values.max()}"
            central = f"Mean: {values.mean():.2f}, Median: {values.median():.2f}"
            dispersion = f"Std Dev: {values.std():.2f}, IQR: {iqr:.2f}"
            distribution = f"Skew: {skew:.2f}, Kurtosis: {kurt:.2f}"

            if not pd.api.types.is_numeric_dtype(raw):
                concerns.append("Numeric data stored as non-numeric dtype")
            if values.min() < 0:
                concerns.append("Contains negative values")
            if np.abs(skew) > 2:
                concerns.append("Highly skewed")
            if clean.nunique() == 1:
                concerns.append("No variation (constant)")
        else:
            values = None
            n_outliers = 0  # outliers are only defined for numeric columns
            modes = clean.mode()

            value_range = f"{clean.nunique()} unique values"
            central = f"Mode: {modes.iloc[0] if not modes.empty else 'N/A'}"
            dispersion = "N/A"
            distribution = "N/A"

            # Only real strings are checked, so mixed-type object columns
            # cannot break the .str accessor.
            strings = [v for v in clean.dropna() if isinstance(v, str)]
            if strings and pd.Series(strings, dtype=object).str.contains(r'[^a-zA-Z\s]').any():
                concerns.append("Contains non-alphabetic characters")

        if n_placeholder:
            concerns.append("Contains missing values ('?')")
        if n_null:
            concerns.append("Contains null values (NaN)")

        record = {
            'Data Type': raw.dtype,
            'Range': value_range,
            'Central Tendency': central,
            'Dispersion': dispersion,
            'Missing Values': with_pct(n_placeholder + n_null),
            'Unique Values': clean.nunique(),
            'Distribution': distribution,
            'Outliers': with_pct(n_outliers),
            'Data Quality Concerns': ', '.join(concerns) if concerns else "None detected",
        }
        return record, values

    records = []
    numeric_values = {}
    for name, raw in df.items():
        record, values = describe(raw)
        records.append(record)
        if values is not None:
            numeric_values[name] = values

    table = pd.DataFrame(records, index=df.columns, columns=report_columns)

    # Correlations use the cleaned numeric columns, so features that were
    # stored as object because of '?' take part as well.
    if numeric_values:
        corr_matrix = pd.concat(numeric_values, axis=1).corr()
    else:
        corr_matrix = pd.DataFrame()

    high_correlations = []
    for name in df.columns:
        if name not in corr_matrix.index:
            high_correlations.append('N/A')
            continue
        strength = corr_matrix.loc[name].abs()
        # Skip the feature itself by name rather than by "r != 1.0", which
        # would also hide perfectly correlated (duplicated) features.
        partners = [str(other) for other in corr_matrix.columns
                    if other != name and strength[other] > 0.9]
        high_correlations.append(', '.join(partners))
    table['High Correlations'] = high_correlations

    return table

# Build the data-quality report for the loaded dataset and display it.
enhanced_table = enhance_feature_table(df)

enhanced_table

### Potential improvements or areas to consider:

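1. Extent, Compactness, and ShapeFactor6 are reported as 'object' dtype even though they hold numeric data. The most likely cause is the '?' placeholders in these columns (6, 18, and 5 missing values respectively in the feature table above): a single non-numeric cell makes pandas load the whole column as object. Replace '?' with NaN and convert these columns to a numeric dtype before computing statistics or modelling.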
2. The 'Constantness' feature has only two unique values (0 and 1). Consider if this should be treated as a binary categorical variable instead of a numeric one.
3. The 'ConvexArea' feature has negative values, which seems unusual for an area measurement. This might need further investigation.
4. The 'EquivDiameter' has an extremely large range and high skewness. You might want to look into this feature more closely to understand if there are data quality issues.
5. For categorical variables like 'Colour' and 'Class', it might be helpful to include the frequency distribution of each category.

### Next steps:

1. Investigate the features identified as 'object' type that appear to be numeric (Extent, Compactness, ShapeFactor6).
2. Look into the negative values in 'ConvexArea' and the extreme values in 'EquivDiameter'.
3. Consider adding category frequencies for categorical variables.
4. Use this table to guide your data preprocessing steps and feature selection for model development.